import streamlit as st
# from gliner import GLiNER
from datasets import load_dataset
import evaluate
import numpy as np
import threading
import time
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model, TaskType
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
seqeval = evaluate.load("seqeval")
# id2label = {0: "O"}
# label2id = {"O": 0}
# def build_id2label(examples):
# for i, label in enumerate(examples["mbert_token_classes"]):
# if label.startswith("I-") and label not in label2id:
# current_len = len(id2label)
# id2label[current_len] = label
# label2id[label] = current_len
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
if torch.cuda.is_available():
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
# Load the pretrained PII-detection model and its tokenizer
st.write('Loading the pretrained model ...')
model_name = "iiiorg/piiranha-v1-detect-personal-information"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model)
# Prepare model for LoRA training
model.train() # model in training mode (dropout modules are activated)
# enable gradient checkpointing to reduce activation memory
model.gradient_checkpointing_enable()
# prepare the model for quantized (k-bit) training: freezes base weights,
# upcasts norms to fp32, and enables input gradients for checkpointing
model = prepare_model_for_kbit_training(model)
# LoRA config
config = LoraConfig(
r=8,
lora_alpha=32,
target_modules=["query_proj"],
lora_dropout=0.05,
bias="none",
task_type=TaskType.TOKEN_CLS
)
# LoRA trainable version of model
model = get_peft_model(model, config)
print(model)
# trainable parameter count
model.print_trainable_parameters()
# # print weights
# pytorch_total_params = sum(p.numel() for p in model.parameters())
# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
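# Side note (a sketch, not executed here): once training is done, the LoRA
# adapter can be merged back into the base weights for standard inference.
# merged_model = model.merge_and_unload()  # PEFT's LoRA merge utility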
if torch.cuda.is_available():
model = model.to("cuda")
# Load a small slice of the data (999 rows) for quick experimentation.
raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
raw_dataset = raw_dataset.train_test_split(test_size=0.2)
print(raw_dataset)
print(raw_dataset.column_names)
# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
# inputs = tokenizer(
# raw_dataset['train'][0]['mbert_tokens'],
# truncation=True,
# is_split_into_words=True)
# print(inputs)
# print(inputs.tokens())
# print(inputs.word_ids())
# Build label2id and id2label
st.write("Building label mappings")
label2id = model.config.label2id
id2label = model.config.id2label
# raw_dataset.map(
# build_id2label,
# batched=False)
st.write("id2label: ", model.config.id2label)
st.write("label2id: ", model.config.label2id)
# Align word-level labels with the tokenizer's output:
# --> special tokens ([CLS]/[SEP]) get label id -100, which cross entropy ignores
# --> every subword of a word inherits that word's label; 'B-' prefixes are
#     mapped to 'I-' because the model's label set only uses 'I-' tags
def align_labels_with_tokens(labels, word_ids):
    aligned_label_ids = []
    for word_id in word_ids:
        if word_id is None:
            # special token
            aligned_label_ids.append(-100)
        else:
            label = labels[word_id]
            if label.startswith("B-"):
                label = label.replace("B-", "I-")
            aligned_label_ids.append(label2id[label])
    return aligned_label_ids
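# Worked example (label names illustrative; "I-GIVENNAME" is an assumption): for
# words ["John", "lives"] labeled ["I-GIVENNAME", "O"], if "John" splits into two
# subwords the word_ids are [None, 0, 0, 1, None], so the aligned ids become
# [-100, label2id["I-GIVENNAME"], label2id["I-GIVENNAME"], label2id["O"], -100].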
# create tokenize function
def tokenize_function(examples):
    # Tokenize and truncate the pre-split words. `examples` is a batch from a
    # single split, so it only carries that split's columns. Padding is deferred
    # to the data collator, which pads each training batch dynamically.
    inputs = tokenizer(
        examples['mbert_tokens'],
        is_split_into_words=True,
        truncation=True,
        max_length=512)
    new_labels = []
    for i, labels in enumerate(examples['mbert_token_classes']):
        new_labels.append(align_labels_with_tokens(labels, inputs.word_ids(i)))
    inputs["labels"] = new_labels
    return inputs
# tokenize training and validation datasets
tokenized_data = raw_dataset.map(
tokenize_function,
batched=True)
# data collator
data_collator = DataCollatorForTokenClassification(tokenizer)
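# Optional sanity check (commented out; feature keys assumed from the tokenizer
# output above): the collator pads input_ids/attention_mask per batch and extends
# labels with -100 so padded positions are ignored by the loss.
# features = [{k: tokenized_data["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
#             for i in range(2)]
# print(data_collator(features)["labels"])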
st.write(tokenized_data["train"][:2]["labels"])
import os
# Print all CUDA environment variables
for key, value in os.environ.items():
if "CUDA" in key.upper():
print(f"{key}={value}")
def compute_metrics(eval_preds):
logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)
# Remove ignored index (special tokens) and convert to labels
true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
true_predictions = [
[id2label[p] for (p, l) in zip(prediction, label) if l != -100]
for prediction, label in zip(predictions, labels)
]
all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
return {
"precision": all_metrics["overall_precision"],
"recall": all_metrics["overall_recall"],
"f1": all_metrics["overall_f1"],
"accuracy": all_metrics["overall_accuracy"],
}
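# Illustrative only: seqeval scores at the entity level rather than per token,
# e.g. seqeval.compute(predictions=[["I-NAME", "O"]], references=[["I-NAME", "O"]])
# yields overall_precision == overall_recall == 1.0 for this toy pair.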
# hyperparameters
lr = 2e-4
batch_size = 4
num_epochs = 4
output_dir = "xia-lora-deberta-v2"
# define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
learning_rate=lr,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=num_epochs,
weight_decay=0.01,
logging_strategy="epoch",
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
gradient_accumulation_steps=4,
warmup_steps=2,
fp16=True,
optim="paged_adamw_8bit",
)
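# With per_device_train_batch_size=4 and gradient_accumulation_steps=4, the
# effective batch size is 4 x 4 = 16 per device per optimizer step.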
# configure trainer
trainer = Trainer(
model=model,
train_dataset=tokenized_data["train"],
eval_dataset=tokenized_data["test"],
args=training_args,
data_collator=data_collator,
compute_metrics=compute_metrics
)
# train model
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
trainer.train()
# re-enable cache for inference
model.config.use_cache = True
st.write('Pushing model to huggingface')
# Push model to huggingface
hf_name = 'CarolXia' # your hf username or org name
model_id = hf_name + "/" + output_dir
model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
# Trainer.push_to_hub takes a commit message, not a repo id, as its first
# argument; its target repo comes from TrainingArguments (hub_model_id,
# defaulting to output_dir), so only the token is passed here.
trainer.push_to_hub(token=st.secrets["HUGGINGFACE_TOKEN"])
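# To reuse the pushed adapter later (a sketch; assumes the adapter repo is model_id):
# from peft import PeftModel
# base = AutoModelForTokenClassification.from_pretrained(model_name)
# inference_model = PeftModel.from_pretrained(base, model_id)
# inference_model.eval()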