# Source: Hugging Face Space CarolXia/lora-finetune (status: Sleeping)
# File size: 6,619 Bytes
# Revision: 61ad5f0

import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
import evaluate
import numpy as np
import threading
import time
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments


# Sequence-labelling metric; used by compute_metrics further down in this file.
seqeval = evaluate.load("seqeval")

# Earlier approach, kept for reference: build the label maps from the dataset
# instead of reading them from the model config.
# id2label = {0: "O"}
# label2id = {"O": 0}
# def build_id2label(examples):
#     for i, label in enumerate(examples["mbert_token_classes"]):
#         if label.startswith("I-") and label not in label2id:
#             current_len = len(id2label)
#             id2label[current_len] = label
#             label2id[label] = current_len

# Diagnostics: report whether a CUDA device is usable and, if so, which one.
print(f"Is CUDA available: {torch.cuda.is_available()}")
# (prints True when a CUDA device is usable)
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Load the pretrained token-classification model and its tokenizer
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dump the module tree of the base model
print(model)

# Prepare model for LoRA training
model.train() # put the model in training mode (dropout modules are active)
# enable gradient checkpointing
model.gradient_checkpointing_enable()

# NOTE(review): prepare_model_for_kbit_training targets k-bit quantized
# models, but the model above is loaded without a quantization config --
# confirm this call is still wanted here.
model = prepare_model_for_kbit_training(model)

# LoRA config: rank-8 adapters on the "query_proj" modules only
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.TOKEN_CLS
)

# LoRA trainable version of model
model = get_peft_model(model, config)

# Dump the module tree again, now including the LoRA wrappers
print(model)
# trainable parameter count
model.print_trainable_parameters()

# # print weights
# pytorch_total_params = sum(p.numel() for p in model.parameters())
# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')

# Move the model to the GPU when one is available; otherwise it stays on CPU
if torch.cuda.is_available():
    model = model.to("cuda")

# Load data: the slice train[1:1000] of the training split, then hold out
# 20% of it as the "test" split.
raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
raw_dataset = raw_dataset.train_test_split(test_size=0.2)
print(raw_dataset)
print(raw_dataset.column_names)
# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")

# Debugging aid, kept for reference: inspect the tokenization of one example.
# inputs = tokenizer(
#     raw_dataset['train'][0]['mbert_tokens'],
#     truncation=True,
#     is_split_into_words=True)
# print(inputs)
# print(inputs.tokens())
# print(inputs.word_ids())

# Reuse the label2id / id2label mappings stored in the model config
st.write("Building label mappings")
label2id = model.config.label2id
id2label = model.config.id2label 
# raw_dataset.map(
#     build_id2label,
#     batched=False)

# Show both mappings in the Streamlit page
st.write("id2label: ", model.config.id2label)
st.write("label2id: ", model.config.label2id)

# Function to align labels with tokens:
# --> special tokens get label id -100 (ignored by cross entropy),
# --> every 'B-' label is replaced with its 'I-' counterpart before the id lookup
def align_labels_with_tokens(labels, word_ids=None):
    """Convert one example's string labels into label ids for training.

    Every ``B-`` prefix is rewritten to ``I-`` before the ``label2id`` lookup
    (only the leading prefix is touched, never a ``B-`` inside the tag name).

    Args:
        labels: Label strings for a single example, one per input word.
        word_ids: Optional per-token word indices as returned by
            ``BatchEncoding.word_ids()``. A ``None`` entry marks a token that
            belongs to no word (special or padding token).

    Returns:
        A list of label ids. With ``word_ids`` the list has exactly one entry
        per token: -100 for ``None`` entries, otherwise the id of the word
        the token came from. Without ``word_ids`` the ids are simply wrapped
        in one leading and one trailing -100 (the legacy behaviour).

    Raises:
        KeyError: If a label (after the B-/I- rewrite) is not in ``label2id``.
        IndexError: If ``word_ids`` refers to a word with no label.
    """
    word_label_ids = [
        label2id["I-" + label[2:] if label.startswith("B-") else label]
        for label in labels
    ]

    if word_ids is None:
        return [-100, *word_label_ids, -100]

    # One label per produced token, so the result always has the same length
    # as input_ids, whatever sub-token splitting, truncation or padding did.
    return [
        -100 if word_id is None else word_label_ids[word_id]
        for word_id in word_ids
    ]

# create tokenize function
def tokenize_function(examples):
    """Tokenize a batch of pre-split examples and attach aligned labels.

    The ``examples`` argument is a batch from a single split (train or test)
    with ``mbert_tokens`` and ``mbert_token_classes`` columns. The tokenizer
    may split an input word into several sub-tokens, and it also truncates
    and pads, so labels are aligned through ``word_ids()`` rather than
    assumed to match one-to-one.

    NOTE: ``word_ids()`` is only available on fast tokenizers.

    Returns:
        The tokenizer output with an added ``labels`` entry holding one list
        of label ids per example, each as long as that example's input_ids.
    """
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=512)

    inputs["labels"] = [
        align_labels_with_tokens(labels, inputs.word_ids(batch_index=index))
        for index, labels in enumerate(examples['mbert_token_classes'])
    ]
    return inputs

# tokenize training and validation datasets
tokenized_data = raw_dataset.map(
    tokenize_function,
    batched=True)
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

st.write(tokenized_data["train"][:2]["labels"])

import os

# Print all CUDA environment variables
for key, value in os.environ.items():
    if "CUDA" in key.upper():
        print(f"{key}={value}")

def compute_metrics(eval_preds):
    """Score predictions with seqeval and return the overall metrics.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            position is the argmax of the logits over the last axis.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags: skip positions marked with the ignore index (-100).
    true_labels = []
    for label_row in labels:
        true_labels.append(
            [id2label[label_id] for label_id in label_row if label_id != -100]
        )

    # Predicted tags, filtered by the same ignore mask as the references.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id != -100:
                kept.append(id2label[pred_id])
        true_predictions.append(kept)

    all_metrics = seqeval.compute(
        predictions=true_predictions,
        references=true_labels,
    )

    summary = {}
    summary["precision"] = all_metrics["overall_precision"]
    summary["recall"] = all_metrics["overall_recall"]
    summary["f1"] = all_metrics["overall_f1"]
    summary["accuracy"] = all_metrics["overall_accuracy"]
    return summary

# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2" 

# define training arguments
training_args = TrainingArguments(
    output_dir= output_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",
)

# configure trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# renable warnings
model.config.use_cache = True

st.write('Pushing model to huggingface')

# Push model to huggingface
hf_name = 'CarolXia' # your hf username or org name
model_id = hf_name + "/" + output_dir
model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])