Homeskills committed on
Commit 5f732b0 · 1 Parent(s): 8606f69

add functions and helper functions from original repo

Files changed (3)
  1. lambda_function.py +42 -0
  2. simple_inference.py +29 -0
  3. utils.py +205 -0
lambda_function.py ADDED
@@ -0,0 +1,42 @@
+ from InstructABSA.utils import T5Generator
+
+ print('Mode set to: Individual sample inference')
+
+
+ # Create T5 model object along with instructions (taken from `instructions.py`)
+ model_checkpoint = "./Models/joint_task/kevinscariajoint_tk-instruct-base-def-pos-neg-neut-combined-robs_experiment"
+ t5_exp = T5Generator(model_checkpoint)
+ print("Model loaded from: ", model_checkpoint)
+ bos_instruction_id = """Definition: The output will be the aspects (both implicit and explicit) and the aspect's sentiment polarity. In cases where there are no aspects the output should be noaspectterm:none.
+ Positive example 1-
+ input: With the great variety on the menu , I eat here often and never get bored.
+ output: menu:positive
+ Positive example 2-
+ input: Great food, good size menu, great service and an unpretentious setting.
+ output: food:positive, menu:positive, service:positive, setting:positive
+ Negative example 1-
+ input: They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.
+ output: toast:negative, mayonnaise:negative, bacon:negative, ingredients:negative, plate:negative
+ Negative example 2-
+ input: The seats are uncomfortable if you are sitting against the wall on wooden benches.
+ output: seats:negative
+ Neutral example 1-
+ input: I asked for seltzer with lime, no ice.
+ output: seltzer with lime:neutral
+ Neutral example 2-
+ input: They wouldn't even let me finish my glass of wine before offering another.
+ output: glass of wine:neutral
+ Now complete the following example-
+ input: """
+ eos_instruction = ' \noutput:'
+
+ # Get input from user
+ user_input = input("Enter sentence for inference: ")
+ # format and tokenize input
+ model_input = bos_instruction_id + user_input + eos_instruction
+ input_ids = t5_exp.tokenizer(model_input, return_tensors="pt").input_ids
+ # generate output
+ outputs = t5_exp.model.generate(input_ids, max_length=128)
+ # decode output and print
+ print('Model output: ', t5_exp.tokenizer.decode(outputs[0], skip_special_tokens=True))
+
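Despite its name, lambda_function.py reads from stdin via input() rather than exposing a Lambda entry point. A minimal sketch of how the same inference logic might be wrapped as an AWS Lambda handler, reusing the module-level t5_exp, bos_instruction_id, and eos_instruction defined above; the handler and the 'sentence' event field are assumptions for illustration, not part of this commit:

    import json

    def lambda_handler(event, context):
        # Hypothetical event shape: {"body": "{\"sentence\": \"...\"}"}
        sentence = json.loads(event.get('body', '{}')).get('sentence', '')
        model_input = bos_instruction_id + sentence + eos_instruction
        input_ids = t5_exp.tokenizer(model_input, return_tensors="pt").input_ids
        outputs = t5_exp.model.generate(input_ids, max_length=128)
        prediction = t5_exp.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {'statusCode': 200, 'body': json.dumps({'aspects': prediction})}

Loading the model at module scope, as the script already does, would keep it cached across warm invocations.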
simple_inference.py ADDED
@@ -0,0 +1,29 @@
+ from InstructABSA.utils import T5Generator
+ from instructions import InstructionsHandler
+
+ # Set Global Values
+ instruct_handler = InstructionsHandler()
+
+ # Load instruction set 2 for ASPE
+ instruct_handler.load_instruction_set2()
+
+ print('Mode set to: Individual sample inference')
+
+
+ # Create T5 model object
+ model_checkpoint = "./Models/joint_task/kevinscariajoint_tk-instruct-base-def-pos-neg-neut-combined-robs_experiment"
+ t5_exp = T5Generator(model_checkpoint)
+ print("Model loaded from: ", model_checkpoint)
+ bos_instruction_id = instruct_handler.aspe['bos_instruct2']
+ eos_instruction = instruct_handler.aspe['eos_instruct']
+
+ # Get input from user
+ user_input = input("Enter sentence for inference: ")
+ # format and tokenize input
+ model_input = bos_instruction_id + user_input + eos_instruction
+ input_ids = t5_exp.tokenizer(model_input, return_tensors="pt").input_ids
+ # generate output
+ outputs = t5_exp.model.generate(input_ids, max_length=128)
+ # decode output and print
+ print('Model output: ', t5_exp.tokenizer.decode(outputs[0], skip_special_tokens=True))
+
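simple_inference.py repeats the prompt-assembly and generation steps of lambda_function.py, only sourcing the instruction text from InstructionsHandler instead of hard-coding it. A sketch of how those shared steps could be factored into one helper (extract_aspects is a hypothetical name, not part of this commit):

    def extract_aspects(sentence, generator, bos_instruction, eos_instruction, max_length=128):
        """Assemble the prompt, generate, and return the decoded aspect:sentiment string."""
        model_input = bos_instruction + sentence + eos_instruction
        input_ids = generator.tokenizer(model_input, return_tensors="pt").input_ids
        outputs = generator.model.generate(input_ids, max_length=max_length)
        return generator.tokenizer.decode(outputs[0], skip_special_tokens=True)

Given the few-shot examples in the instruction set, an input like "Great food, good size menu" should decode to something like "food:positive, menu:positive".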
utils.py ADDED
@@ -0,0 +1,205 @@
+ import numpy as np
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+ import torch
+ from torch.utils.data import DataLoader
+ from torch.nn.utils.rnn import pad_sequence
+ from tqdm import tqdm
+ from transformers import (
+     DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM,
+     Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer
+ )
+
+
+ class T5Generator:
+     def __init__(self, model_checkpoint):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
+         self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
+         # is_built() only checks compile-time support; is_available() checks for a usable device
+         self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
+
+     def tokenize_function_inputs(self, sample):
+         """
+         Tokenize a dataset sample: its 'text' input and 'labels' target.
+         """
+         model_inputs = self.tokenizer(sample['text'], max_length=512, truncation=True)
+         labels = self.tokenizer(sample["labels"], max_length=64, truncation=True)
+         model_inputs["labels"] = labels["input_ids"]
+         return model_inputs
+
+     def train(self, tokenized_datasets, **kwargs):
+         """
+         Train the generative model.
+         """
+         # Set training arguments
+         args = Seq2SeqTrainingArguments(
+             **kwargs
+         )
+
+         # Define trainer object
+         trainer = Seq2SeqTrainer(
+             self.model,
+             args,
+             train_dataset=tokenized_datasets["train"],
+             eval_dataset=tokenized_datasets["validation"] if tokenized_datasets.get("validation") is not None else None,
+             tokenizer=self.tokenizer,
+             data_collator=self.data_collator,
+         )
+         print("Trainer device:", trainer.args.device)
+
+         # Finetune the model
+         torch.cuda.empty_cache()
+         print('\nModel training started ....')
+         trainer.train()
+
+         # Save best model
+         trainer.save_model()
+         return trainer
+
+     def get_labels(self, tokenized_dataset, batch_size=4, max_length=128, sample_set='train'):
+         """
+         Get the predictions from the trained model.
+         """
+         def collate_fn(batch):
+             input_ids = [torch.tensor(example['input_ids']) for example in batch]
+             input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+             return input_ids
+
+         dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
+         predicted_output = []
+         self.model.to(self.device)
+         print('Model loaded to: ', self.device)
+
+         for batch in tqdm(dataloader):
+             batch = batch.to(self.device)
+             output_ids = self.model.generate(batch, max_length=max_length)
+             output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+             predicted_output.extend(output_texts)
+         return predicted_output
+
+     def get_metrics(self, y_true, y_pred, is_triplet_extraction=False):
+         total_pred = 0
+         total_gt = 0
+         tp = 0
+         if not is_triplet_extraction:
+             for gt, pred in zip(y_true, y_pred):
+                 gt_list = gt.split(', ')
+                 pred_list = pred.split(', ')
+                 total_pred += len(pred_list)
+                 total_gt += len(gt_list)
+                 for gt_val in gt_list:
+                     for pred_val in pred_list:
+                         # Lenient match: either string contains the other
+                         if pred_val in gt_val or gt_val in pred_val:
+                             tp += 1
+                             break
+
+         else:
+             for gt, pred in zip(y_true, y_pred):
+                 gt_list = gt.split(', ')
+                 pred_list = pred.split(', ')
+                 total_pred += len(pred_list)
+                 total_gt += len(gt_list)
+                 for gt_val in gt_list:
+                     gt_asp = gt_val.split(':')[0]
+
+                     try:
+                         gt_op = gt_val.split(':')[1]
+                     except IndexError:
+                         continue
+
+                     try:
+                         gt_sent = gt_val.split(':')[2]
+                     except IndexError:
+                         continue
+
+                     for pred_val in pred_list:
+                         pr_asp = pred_val.split(':')[0]
+
+                         try:
+                             pr_op = pred_val.split(':')[1]
+                         except IndexError:
+                             continue
+
+                         try:
+                             # Read the predicted sentiment from the prediction string
+                             pr_sent = pred_val.split(':')[2]
+                         except IndexError:
+                             continue
+
+                         if pr_asp in gt_asp and pr_op in gt_op and gt_sent == pr_sent:
+                             tp += 1
+
+         p = tp/total_pred
+         r = tp/total_gt
+         return p, r, 2*p*r/(p+r), None
+
+
+ class T5Classifier:
+     def __init__(self, model_checkpoint):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, force_download=True)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, force_download=True)
+         self.data_collator = DataCollatorForSeq2Seq(self.tokenizer)
+         # torch.has_cuda / torch.has_mps are deprecated; use the availability checks
+         self.device = 'cuda' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
+
+     def tokenize_function_inputs(self, sample):
+         """
+         Tokenize a dataset sample: its 'text' input and 'labels' target.
+         """
+         sample['input_ids'] = self.tokenizer(sample["text"], max_length=512, truncation=True).input_ids
+         sample['labels'] = self.tokenizer(sample["labels"], max_length=64, truncation=True).input_ids
+         return sample
+
+     def train(self, tokenized_datasets, **kwargs):
+         """
+         Train the generative model.
+         """
+         # Set training arguments
+         args = Seq2SeqTrainingArguments(
+             **kwargs
+         )
+
+         # Define trainer object
+         trainer = Trainer(
+             self.model,
+             args,
+             train_dataset=tokenized_datasets["train"],
+             eval_dataset=tokenized_datasets["validation"] if tokenized_datasets.get("validation") is not None else None,
+             tokenizer=self.tokenizer,
+             data_collator=self.data_collator
+         )
+         print("Trainer device:", trainer.args.device)
+
+         # Finetune the model
+         torch.cuda.empty_cache()
+         print('\nModel training started ....')
+         trainer.train()
+
+         # Save best model
+         trainer.save_model()
+         return trainer
+
+     def get_labels(self, tokenized_dataset, batch_size=4, sample_set='train'):
+         """
+         Get the predictions from the trained model.
+         """
+         def collate_fn(batch):
+             input_ids = [torch.tensor(example['input_ids']) for example in batch]
+             input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+             return input_ids
+
+         dataloader = DataLoader(tokenized_dataset[sample_set], batch_size=batch_size, collate_fn=collate_fn)
+         predicted_output = []
+         self.model.to(self.device)
+         print('Model loaded to: ', self.device)
+
+         for batch in tqdm(dataloader):
+             batch = batch.to(self.device)
+             output_ids = self.model.generate(batch)
+             output_texts = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+             predicted_output.extend(output_texts)
+         return predicted_output
+
+     def get_metrics(self, y_true, y_pred):
+         return precision_score(y_true, y_pred, average='macro'), recall_score(y_true, y_pred, average='macro'), \
+             f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)
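Taken together, T5Generator supports a tokenize-train-predict-evaluate loop. A sketch under illustrative assumptions: the one-example dataset, output_dir, and hyperparameters are placeholders, and the import assumes utils.py sits next to the script (the inference scripts above import it as InstructABSA.utils):

    from datasets import Dataset, DatasetDict
    from utils import T5Generator

    # Toy data in the shape tokenize_function_inputs expects:
    # a 'text' column (instruction + sentence) and a string 'labels' column.
    data = DatasetDict({
        'train': Dataset.from_dict({
            'text': ['input: Great food, good size menu. \noutput:'],
            'labels': ['food:positive, menu:positive'],
        })
    })

    t5_exp = T5Generator("./Models/joint_task/kevinscariajoint_tk-instruct-base-def-pos-neg-neut-combined-robs_experiment")
    tokenized = data.map(t5_exp.tokenize_function_inputs)

    # Keyword arguments are forwarded verbatim to Seq2SeqTrainingArguments.
    trainer = t5_exp.train(tokenized, output_dir='./out', num_train_epochs=1,
                           per_device_train_batch_size=4)

    # Decode predictions for the train split and score them with the
    # substring-overlap precision/recall/F1 implemented in get_metrics.
    preds = t5_exp.get_labels(tokenized, batch_size=4, sample_set='train')
    p, r, f1, _ = t5_exp.get_metrics(data['train']['labels'], preds)
    print(p, r, f1)

Note that get_metrics counts a true positive whenever prediction and ground truth contain each other as substrings, so it is a lenient span match rather than exact string equality.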