praff1234 committed on
Commit
bdc29c8
1 Parent(s): 74c8aa5

adding trainer code

FlanT5-train-test-idiomSimplifier.csv ADDED
The diff for this file is too large to render. See raw diff
 
dialog_summary.csv ADDED
The diff for this file is too large to render. See raw diff
 
trainer_code.py ADDED
@@ -0,0 +1,187 @@
+ from huggingface_hub import create_repo  # only needed when creating a new Hub repo
+
+ # create_repo(repo_id="test-model")
+ import pandas as pd
+
+ from datasets import load_dataset
+
+
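+ # Build a (dialogue, summary) dataset from the idiom sentences and their plain-English rewrites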
+ df_train = pd.read_csv("/home/prafull/apps_all/flan_tuning/FlanT5-train-test-idiomSimplifier.csv")
+ complex_sentences = df_train["Idiom sentences"].to_list()
+ simple_sentences = df_train["English casual"].to_list()
+
+ data_dict = {
+     "dialogue": complex_sentences,
+     "summary": simple_sentences
+ }
+
+ df_train_new = pd.DataFrame(data_dict)
+ # random shuffling
+ df_train_shuffled = df_train_new.sample(frac=1, random_state=1)
+ # save pre-processed final data
+ df_train_shuffled.head(1000).to_csv("dialog_summary.csv", encoding="utf-8", index=False)
+
+ dataset = load_dataset("csv", data_files="dialog_summary.csv", split="train")
+
+ dataset = dataset.train_test_split(test_size=0.05)
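+ # Load the Flan-T5 tokenizer and derive max input/target lengths from the tokenized data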
+ from transformers import AutoTokenizer
+ model_id = "google/flan-t5-base"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ from datasets import concatenate_datasets
+
+ # The maximum total input sequence length after tokenization.
+ # Sequences longer than this will be truncated, sequences shorter will be padded.
+ tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["dialogue"], truncation=True), batched=True, remove_columns=["dialogue", "summary"])
+ max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
+ print(f"Max source length: {max_source_length}")
+
+ max_target_length = max_source_length + 10
+ print(f"Max target length: {max_target_length}")
+
+
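+ # Preprocessing: prepend the instruction prefix, tokenize inputs and targets, and mask label padding with -100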
+ def preprocess_function(sample, padding="max_length"):
+     # add prefix to the input for T5
+     inputs = ["Easy to understand Sentence without idioms and jargons: " + item for item in sample["dialogue"]]
+
+     # tokenize inputs
+     model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
+
+     # tokenize targets with the `text_target` keyword argument
+     labels = tokenizer(text_target=sample["summary"], max_length=max_target_length, padding=padding, truncation=True)
+
+     # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
+     # so that padding is ignored in the loss.
+     if padding == "max_length":
+         labels["input_ids"] = [
+             [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+         ]
+
+     model_inputs["labels"] = labels["input_ids"]
+     return model_inputs
+
+ tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["dialogue", "summary"])
+ print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")
+
+
+ from transformers import AutoModelForSeq2SeqLM
+
+ # Hugging Face Hub model id
+ model_id = "google/flan-t5-base"
+
+ # load model from the Hub
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
+
+ import evaluate
+ import nltk
+ import numpy as np
+ from nltk.tokenize import sent_tokenize
+
+ nltk.download("punkt", quiet=True)  # sent_tokenize needs the punkt sentence tokenizer data
+
+ # Metric
+ metric = evaluate.load("rouge")
+
+ # helper function to postprocess text
+ def postprocess_text(preds, labels):
+     preds = [pred.strip() for pred in preds]
+     labels = [label.strip() for label in labels]
+
+     # rougeLSum expects a newline after each sentence
+     preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
+     labels = ["\n".join(sent_tokenize(label)) for label in labels]
+
+     return preds, labels
+
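+ # Compute ROUGE on decoded predictions vs. references; gen_len tracks the average generated length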
+ def compute_metrics(eval_preds):
+     preds, labels = eval_preds
+     if isinstance(preds, tuple):
+         preds = preds[0]
+     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+     # Replace -100 in the labels as we can't decode them.
+     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+     # Some simple post-processing
+     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+
+     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+     result = {k: round(v * 100, 4) for k, v in result.items()}
+     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
+     result["gen_len"] = np.mean(prediction_lens)
+     return result
+
+
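+ # The collator pads each batch dynamically and, given the model, also prepares decoder_input_ids from the labels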
+ from transformers import DataCollatorForSeq2Seq
+
+ # we want to ignore the tokenizer pad token in the loss
+ label_pad_token_id = -100
+ # Data collator
+ data_collator = DataCollatorForSeq2Seq(
+     tokenizer,
+     model=model,
+     label_pad_token_id=label_pad_token_id,
+     pad_to_multiple_of=8
+ )
+
+ import torch
+
+ if torch.cuda.is_available():
+     torch.cuda.set_device(0)
+     print(torch.cuda.current_device())
+
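+ # predict_with_generate=True makes evaluation run generation, so compute_metrics receives token ids it can decode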
+ from huggingface_hub import HfFolder
+ from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
+
+ repository_id = "flan-tuning"
+
+ # Define training args
+ training_args = Seq2SeqTrainingArguments(
+     overwrite_output_dir=True,
+     output_dir=repository_id,
+     per_device_train_batch_size=8,
+     per_device_eval_batch_size=8,
+     predict_with_generate=True,
+     fp16=False,  # Overflows with fp16
+     learning_rate=5e-5,
+     num_train_epochs=1,
+     # logging & evaluation strategies
+     logging_dir=f"{repository_id}/logs",
+     logging_strategy="steps",
+     logging_steps=500,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     save_total_limit=2,
+     load_best_model_at_end=True,
+     # metric_for_best_model="overall_f1",
+     # push to hub parameters
+     report_to="tensorboard",
+     push_to_hub=False,
+     hub_strategy="every_save",
+     hub_model_id=repository_id,
+     hub_token=HfFolder.get_token(),
+ )
+
+ # Create Trainer instance
+ trainer = Seq2SeqTrainer(
+     model=model,
+     args=training_args,
+     data_collator=data_collator,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["test"],
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
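+ # With load_best_model_at_end=True, the checkpoint with the best eval loss is reloaded once training finishes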
+
+ # trainer.model.save_pretrained("/home/prafull/apps_all/ChatGPT_Playground/Flan_models/flan-t5-LARGE-IDIOM-24k", from_pt=True)
+ # tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
+
+
+ # PUSH TO HUB ------------
+
+ # Save our tokenizer and create model card
+ tokenizer.save_pretrained(repository_id)
+ trainer.create_model_card()
+ # Push the results to the hub
+ trainer.push_to_hub()
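
A minimal inference sketch (not part of this commit), assuming the fine-tuned checkpoint ends up in the `flan-tuning` output directory and reusing the same instruction prefix as training:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Path is an assumption; point it at the saved or pushed checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("flan-tuning")
tokenizer = AutoTokenizer.from_pretrained("flan-tuning")

sentence = "He finally decided to bite the bullet and apologise."
prompt = "Easy to understand Sentence without idioms and jargons: " + sentence

inputs = tokenizer(prompt, return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))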