CarolXia committed
Commit
61ad5f0
0 Parent(s):

initial commit

Files changed (4)
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +212 -0
  4. requirements.txt +14 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Traceforce
+ emoji: 🦀
+ colorFrom: blue
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.40.1
+ app_file: app.py
+ pinned: false
+ license: other
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,212 @@
+ import streamlit as st
+ # from gliner import GLiNER
+ from datasets import load_dataset
+ import evaluate
+ import numpy as np
+ import threading
+ import time
+ from peft import prepare_model_for_kbit_training
+ from peft import LoraConfig, get_peft_model, TaskType
+ import torch
+ from torch.profiler import profile, record_function, ProfilerActivity
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
+
+
+ seqeval = evaluate.load("seqeval")
+
+ # id2label = {0: "O"}
+ # label2id = {"O": 0}
+ # def build_id2label(examples):
+ #     for i, label in enumerate(examples["mbert_token_classes"]):
+ #         if label.startswith("I-") and label not in label2id:
+ #             current_len = len(id2label)
+ #             id2label[current_len] = label
+ #             label2id[label] = current_len
+
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+
+ # Load the pretrained Piiranha PII-detection model
+ st.write('Loading the pretrained model ...')
+ model_name = "iiiorg/piiranha-v1-detect-personal-information"
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ print(model)
+
+ # Prepare model for LoRA training
+ model.train()  # put model in training mode (dropout modules are activated)
+ # enable gradient checkpointing
+ model.gradient_checkpointing_enable()
+
+ # enable quantized training
+ model = prepare_model_for_kbit_training(model)
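+ # Note: prepare_model_for_kbit_training pays off when the base model is
+ # actually loaded in a quantized dtype. A minimal sketch of that variant,
+ # assuming bitsandbytes is installed (it is in requirements.txt):
+ #
+ #     from transformers import BitsAndBytesConfig
+ #     model = AutoModelForTokenClassification.from_pretrained(
+ #         model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True))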
+
+ # LoRA config
+ config = LoraConfig(
+     r=8,
+     lora_alpha=32,
+     target_modules=["query_proj"],
+     lora_dropout=0.05,
+     bias="none",
+     task_type=TaskType.TOKEN_CLS
+ )
+
+ # LoRA trainable version of model
+ model = get_peft_model(model, config)
+
+ print(model)
+ # trainable parameter count
+ model.print_trainable_parameters()
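+ # "query_proj" matches the attention query projection in this DeBERTa-style
+ # backbone. To double-check candidate target_modules names on another
+ # architecture, one option is to list the base model's Linear module names
+ # (sketch):
+ #
+ #     print(sorted({name.split(".")[-1]
+ #                   for name, module in model.named_modules()
+ #                   if isinstance(module, torch.nn.Linear)}))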
+
+ # # print weights
+ # pytorch_total_params = sum(p.numel() for p in model.parameters())
+ # torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ # print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
+
+ if torch.cuda.is_available():
+     model = model.to("cuda")
+
+ # Load data.
+ raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
+ # raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
+ raw_dataset = raw_dataset.train_test_split(test_size=0.2)
+ print(raw_dataset)
+ print(raw_dataset.column_names)
+ # raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
+ # raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
+ # raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
+
+ # inputs = tokenizer(
+ #     raw_dataset['train'][0]['mbert_tokens'],
+ #     truncation=True,
+ #     is_split_into_words=True)
+ # print(inputs)
+ # print(inputs.tokens())
+ # print(inputs.word_ids())
+
+ # Build label2id and id2label
+ st.write("Building label mappings")
+ label2id = model.config.label2id
+ id2label = model.config.id2label
+ # raw_dataset.map(
+ #     build_id2label,
+ #     batched=False)
+
+ st.write("id2label: ", model.config.id2label)
+ st.write("label2id: ", model.config.label2id)
+
+ # Function to align labels with tokens:
+ # --> special tokens get the -100 label id (ignored by cross entropy);
+ # --> tokens inside a word have their 'B-' prefix replaced with 'I-'.
+ # The leading and trailing -100 cover the sequence-level special tokens; this
+ # simple version assumes each word maps to exactly one sub-token.
+ def align_labels_with_tokens(labels):
+     aligned_label_ids = []
+     aligned_label_ids.append(-100)
+     for label in labels:
+         if label.startswith("B-"):
+             label = label.replace("B-", "I-")
+         aligned_label_ids.append(label2id[label])
+     aligned_label_ids.append(-100)
+     return aligned_label_ids
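+ # A more robust variant (sketch, not wired in here) aligns labels through
+ # inputs.word_ids(), which maps every sub-token back to its word index and
+ # so also handles words the tokenizer splits into several sub-tokens:
+ #
+ #     def align_labels_with_word_ids(labels, word_ids):
+ #         aligned = []
+ #         for word_id in word_ids:
+ #             if word_id is None:  # special tokens and padding
+ #                 aligned.append(-100)
+ #             else:
+ #                 aligned.append(label2id[labels[word_id].replace("B-", "I-")])
+ #         return aligned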
+
+ # create tokenize function
+ def tokenize_function(examples):
+     # Tokenize and truncate the pre-split words. The examples batch comes from
+     # a single split (train or test), so no split label is carried along here.
+     new_labels = []
+     inputs = tokenizer(
+         examples['mbert_tokens'],
+         is_split_into_words=True,
+         padding=True,
+         truncation=True,
+         max_length=512)
+     for labels in examples['mbert_token_classes']:
+         new_labels.append(align_labels_with_tokens(labels))
+
+     inputs["labels"] = new_labels
+     return inputs
+
+ # tokenize training and validation datasets
+ tokenized_data = raw_dataset.map(
+     tokenize_function,
+     batched=True)
+ # data collator
+ data_collator = DataCollatorForTokenClassification(tokenizer)
+
+ st.write(tokenized_data["train"][:2]["labels"])
+
+ import os
+
+ # Print all CUDA environment variables
+ for key, value in os.environ.items():
+     if "CUDA" in key.upper():
+         print(f"{key}={value}")
+
+ def compute_metrics(eval_preds):
+     logits, labels = eval_preds
+     predictions = np.argmax(logits, axis=-1)
+
+     # Remove ignored index (special tokens) and convert ids to label strings
+     true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
+     true_predictions = [
+         [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
+         for prediction, label in zip(predictions, labels)
+     ]
+     all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
+     return {
+         "precision": all_metrics["overall_precision"],
+         "recall": all_metrics["overall_recall"],
+         "f1": all_metrics["overall_f1"],
+         "accuracy": all_metrics["overall_accuracy"],
+     }
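+ # seqeval expects lists of per-sequence label strings; a quick sanity check
+ # (toy labels, assuming "I-GIVENNAME" is in the model's label set):
+ #
+ #     seqeval.compute(
+ #         predictions=[["O", "I-GIVENNAME", "O"]],
+ #         references=[["O", "I-GIVENNAME", "O"]])
+ #     # --> overall precision/recall/f1 of 1.0 on this toy pair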
+
+ # hyperparameters
+ lr = 2e-4
+ batch_size = 4
+ num_epochs = 4
+ output_dir = "xia-lora-deberta-v2"
+
+ # define training arguments
+ training_args = TrainingArguments(
+     output_dir=output_dir,
+     learning_rate=lr,
+     per_device_train_batch_size=batch_size,
+     per_device_eval_batch_size=batch_size,
+     num_train_epochs=num_epochs,
+     weight_decay=0.01,
+     logging_strategy="epoch",
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     load_best_model_at_end=True,
+     gradient_accumulation_steps=4,  # effective batch size: 4 x 4 = 16
+     warmup_steps=2,
+     fp16=True,
+     optim="paged_adamw_8bit",
+ )
+
+ # configure trainer
+ trainer = Trainer(
+     model=model,
+     train_dataset=tokenized_data["train"],
+     eval_dataset=tokenized_data["test"],
+     args=training_args,
+     data_collator=data_collator,
+     compute_metrics=compute_metrics
+ )
+
+ # train model
+ model.config.use_cache = False  # silence the warnings; re-enabled below for inference
+ trainer.train()
+
+ # re-enable cache
+ model.config.use_cache = True
+
+ st.write('Pushing model to huggingface')
+
+ # Push model to Hugging Face
+ hf_name = 'CarolXia'  # your hf username or org name
+ model_id = hf_name + "/" + output_dir
+ model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
+ trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
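+ # Loading the pushed adapter back for inference would look roughly like this
+ # (sketch; assumes the base model and the adapter repo pushed above):
+ #
+ #     from peft import PeftModel
+ #     base = AutoModelForTokenClassification.from_pretrained(model_name)
+ #     tuned = PeftModel.from_pretrained(base, model_id)
+ #     tuned.eval()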
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ auto-gptq
+ bitsandbytes
+ datasets
+ evaluate
+ seqeval
+ gliner
+ torch>=2.0.0
+ transformers>=4.38.2
+ huggingface_hub>=0.21.4
+ onnxruntime
+ optimum
+ peft
+ sentencepiece
+ tqdm