kd-finetune / app.py
CarolXia's picture
initial commit
5418ef7
raw
history blame
6.62 kB
import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
import evaluate
import numpy as np
import threading
import time
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
# Metric object consumed by compute_metrics() during evaluation.
seqeval = evaluate.load("seqeval")

# Disabled: builds the label maps from the dataset's `mbert_token_classes`
# column instead of reading them from the model config.
# id2label = {0: "O"}
# label2id = {"O": 0}
# def build_id2label(examples):
#     for i, label in enumerate(examples["mbert_token_classes"]):
#         if label.startswith("I-") and label not in label2id:
#             current_len = len(id2label)
#             id2label[current_len] = label
#             label2id[label] = current_len

# Report on stdout whether a CUDA device is visible, and which one.
cuda_available = torch.cuda.is_available()
print(f"Is CUDA available: {cuda_available}")
if cuda_available:
    device_index = torch.cuda.current_device()
    print(f"CUDA device: {torch.cuda.get_device_name(device_index)}")

# Load the pretrained token-classification checkpoint and its tokenizer.
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model)
# ---- Prepare the model for LoRA training ----
# Put the model in training mode (dropout modules are active).
model.train()
# Turn on gradient checkpointing.
model.gradient_checkpointing_enable()
# Run PEFT's k-bit training preparation helper on the base model.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings: rank-8 adapters on the `query_proj` modules,
# configured for a token-classification task.
config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    target_modules=["query_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA configuration.
model = get_peft_model(model, config)
print(model)

# Report how many parameters are trainable.
model.print_trainable_parameters()

# Disabled: manual parameter count.
# pytorch_total_params = sum(p.numel() for p in model.parameters())
# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')

# Move the wrapped model to the GPU when one is available.
if torch.cuda.is_available():
    model = model.to("cuda")
# Load the `train[1:1000]` slice of the dataset, then hold out 20% of it as
# the test split.
dataset_slice = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# dataset_slice = dataset_slice.filter(lambda example: example["language"].startswith("en"))
raw_dataset = dataset_slice.train_test_split(test_size=0.2)
print(raw_dataset)
print(raw_dataset.column_names)

# Disabled experiments: column selection/renaming and a one-example
# tokenization probe.
# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
# inputs = tokenizer(
#     raw_dataset['train'][0]['mbert_tokens'],
#     truncation=True,
#     is_split_into_words=True)
# print(inputs)
# print(inputs.tokens())
# print(inputs.word_ids())

# Take the label <-> id mappings straight from the pretrained model's config
# and show them in the Streamlit page.
st.write("Building label mappings")
model_config = model.config
label2id = model_config.label2id
id2label = model_config.id2label
# raw_dataset.map(
#     build_id2label,
#     batched=False)
st.write("id2label: ", id2label)
st.write("label2id: ", label2id)
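# Convert one example's string tags into label ids for training:
# --> a -100 label id (ignored by cross entropy) is added at the start and at
#     the end, standing in for the special tokens,
# --> every 'B-' tag is rewritten as its 'I-' counterpart before the
#     label2id lookup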
def align_labels_with_tokens(labels, label_map=None):
    """Convert one example's string tags into a list of label ids.

    The result starts and ends with -100 (the id ignored by cross entropy),
    standing in for the two special tokens that surround the sequence. Every
    tag that begins with ``B-`` is looked up under its ``I-`` counterpart.

    Args:
        labels: iterable of string tags such as ``"O"``, ``"B-X"``, ``"I-X"``.
        label_map: mapping from tag to integer id. Defaults to the
            module-level ``label2id``.

    Returns:
        ``[-100, id_0, ..., id_n, -100]``.

    Raises:
        KeyError: if a (converted) tag is missing from the mapping.
    """
    if label_map is None:
        label_map = label2id

    aligned_label_ids = [-100]
    for label in labels:
        if label.startswith("B-"):
            # Swap the prefix only. str.replace() would also rewrite a "B-"
            # that occurs later in the tag (e.g. "B-WEB-ADDR" -> "I-WEI-ADDR").
            label = "I-" + label[2:]
        aligned_label_ids.append(label_map[label])
    aligned_label_ids.append(-100)
    return aligned_label_ids
# create tokenize function
def tokenize_function(examples):
    """Tokenize a batch of pre-split examples and attach token-aligned labels.

    ``examples`` is a batch from ``Dataset.map(..., batched=True)``; the
    train/test split key has already been stripped. Each entry of
    ``examples['mbert_tokens']`` is a list of words and the matching entry of
    ``examples['mbert_token_classes']`` is assumed to hold one tag per word
    (TODO confirm against the dataset card).

    The tokenizer may split a word into several sub-tokens, truncates at 512
    tokens and pads the batch, so labels are assigned per output token via
    ``word_ids()``. Special and padding tokens get -100; every sub-token of a
    word gets that word's label id. This keeps ``labels`` the same length as
    ``input_ids``.

    NOTE: ``word_ids()`` is only available on fast tokenizers.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512)

    new_labels = []
    for batch_index, labels in enumerate(examples['mbert_token_classes']):
        # align_labels_with_tokens() wraps the per-word ids in a leading and a
        # trailing -100; strip both to get exactly one id per input word.
        word_label_ids = align_labels_with_tokens(labels)[1:-1]
        new_labels.append([
            -100 if word_index is None else word_label_ids[word_index]
            for word_index in inputs.word_ids(batch_index=batch_index)
        ])

    inputs["labels"] = new_labels
    return inputs
# Run the tokenizer over both splits, one batch at a time.
tokenized_data = raw_dataset.map(tokenize_function, batched=True)

# Collator handed to the Trainer for building token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Show the label ids of the first two training examples in the page.
first_two_train = tokenized_data["train"][:2]
st.write(first_two_train["labels"])

import os

# Dump every environment variable whose name mentions CUDA.
for key, value in os.environ.items():
    if "CUDA" not in key.upper():
        continue
    print(f"{key}={value}")
def compute_metrics(eval_preds):
    """Score predictions with seqeval and return the overall metrics.

    Args:
        eval_preds: a ``(logits, labels)`` pair. The predicted class of each
            position is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from seqeval's ``overall_*`` results.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    def _decode(ids, reference_row):
        # Keep only positions whose reference label is not the ignored index
        # (-100), and turn the surviving ids into label strings.
        return [
            id2label[class_id]
            for class_id, reference_id in zip(ids, reference_row)
            if reference_id != -100
        ]

    true_labels = [_decode(label_row, label_row) for label_row in labels]
    true_predictions = [
        _decode(prediction_row, label_row)
        for prediction_row, label_row in zip(predictions, labels)
    ]

    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {name: all_metrics[source_key] for name, source_key in reported}
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2"

# Hub destination for the trained adapter.
hf_name = 'CarolXia'  # your hf username or org name
model_id = hf_name + "/" + output_dir

# fp16 mixed precision and the paged 8-bit optimizer both need a CUDA device.
# The rest of this script already tolerates CPU-only hosts, so fall back to
# full precision and the plain PyTorch AdamW there instead of failing.
use_cuda = torch.cuda.is_available()

# define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=use_cuda,
    optim="paged_adamw_8bit" if use_cuda else "adamw_torch",
    # Target repo for trainer.push_to_hub(); its first positional parameter
    # is the commit message, not the repo id.
    hub_model_id=model_id,
)

# configure trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# re-enable the cache now that training is done
model.config.use_cache = True

# Push model to huggingface
st.write('Pushing model to huggingface')
hf_token = st.secrets["HUGGINGFACE_TOKEN"]
model.push_to_hub(model_id, token=hf_token)
# The repo comes from training_args.hub_model_id; the commit message is left
# at the Trainer default.
trainer.push_to_hub(token=hf_token)