import os

import evaluate
import numpy as np
import streamlit as st
import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

seqeval = evaluate.load("seqeval")

print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load the pretrained Piiranha PII-detection model and its tokenizer
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model)

# Prepare model for LoRA training
model.train()  # put the model in training mode (activates dropout modules)

# Enable gradient checkpointing to reduce activation memory
model.gradient_checkpointing_enable()

# Prepare the model for quantized (k-bit) training
model = prepare_model_for_kbit_training(model)

# LoRA config
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS,
)

# LoRA trainable version of the model
model = get_peft_model(model, config)
print(model)

# Trainable parameter count
model.print_trainable_parameters()

if torch.cuda.is_available():
    model = model.to("cuda")
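# NOTE: prepare_model_for_kbit_training() above assumes the base weights were
# loaded in a quantized (4- or 8-bit) format, but from_pretrained() above loads
# them in full precision. Below is a minimal, commented-out sketch of what a
# real 4-bit load could look like; it assumes the bitsandbytes package is
# installed and a CUDA GPU is available, and is not part of the original recipe.
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = AutoModelForTokenClassification.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )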
# Load data
raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# Optionally keep only English examples:
# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
raw_dataset = raw_dataset.train_test_split(test_size=0.2)
print(raw_dataset)
print(raw_dataset.column_names)

# Build label2id and id2label from the pretrained model's config
st.write("Building label mappings")
label2id = model.config.label2id
id2label = model.config.id2label
st.write("id2label: ", model.config.id2label)
st.write("label2id: ", model.config.label2id)

# Align word-level labels with the tokenizer's subword tokens:
# --> special tokens and padding get the -100 label id (ignored by cross entropy)
# --> subword tokens reuse their word's label, with 'B-' replaced by 'I-' so the
#     tag matches the model's label set
def align_labels_with_tokens(labels, word_ids):
    aligned_label_ids = []
    for word_id in word_ids:
        if word_id is None:
            aligned_label_ids.append(-100)
        else:
            label = labels[word_id]
            if label.startswith("B-"):
                label = label.replace("B-", "I-")
            aligned_label_ids.append(label2id[label])
    return aligned_label_ids

# Tokenize function
def tokenize_function(examples):
    # Tokenize and truncate the text. The examples argument has already had
    # the train/test split label stripped.
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512)
    new_labels = []
    for i, labels in enumerate(examples['mbert_token_classes']):
        word_ids = inputs.word_ids(batch_index=i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    inputs["labels"] = new_labels
    return inputs

# Tokenize training and validation datasets
tokenized_data = raw_dataset.map(
    tokenize_function,
    batched=True)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)
st.write(tokenized_data["train"][:2]["labels"])

# Print all CUDA environment variables
for key, value in os.environ.items():
    if "CUDA" in key.upper():
        print(f"{key}={value}")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert ids to label strings
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2"

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
)
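# With per_device_train_batch_size=4 and gradient_accumulation_steps=4, the
# effective batch size is 16 examples per optimizer step (per device).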
# Configure trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
model.config.use_cache = False  # silence the warnings; re-enable for inference!
trainer.train()

# Re-enable cache for inference
model.config.use_cache = True

st.write('Pushing model to huggingface')

# Push model to the Hugging Face Hub
hf_name = 'CarolXia'  # your hf username or org name
model_id = hf_name + "/" + output_dir
model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
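# A minimal, commented-out sketch of loading the pushed LoRA adapter back for
# inference. It assumes the adapter repo created above (model_id) is accessible
# with the same token and that compatible peft/transformers versions are
# installed; the sample sentence is purely illustrative.
#
# from peft import PeftModel
# base = AutoModelForTokenClassification.from_pretrained(model_name)
# inference_model = PeftModel.from_pretrained(base, model_id)
# inference_model.eval()
# text = "My name is John and I live in Berlin."
# enc = tokenizer(text, return_tensors="pt")
# with torch.no_grad():
#     pred_ids = inference_model(**enc).logits.argmax(dim=-1)[0].tolist()
# print([id2label[p] for p in pred_ids])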