Spaces:

TraceForce
/

varun-kd-finetune

Sleeping

App Files Files Community

Varun Wadhwa committed 17 days ago

Commit

0444fba

unverified ·

1 Parent(s): 8e7d1ea

Logs

Browse files

Files changed (1) hide show

app.py +41 -39

app.py CHANGED Viewed

@@ -78,32 +78,31 @@ print(raw_dataset.column_names)
 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
-def align_labels_with_tokens(labels, word_ids, max_length):
     aligned_label_ids = []
-    for word_id in word_ids:
-        if word_id is None:
-            aligned_label_ids.append(-100)
-        else:
-            aligned_label_ids.append(label2id[labels[word_id]].replace("B-", "I-"))
-    # Pad to max length
-    aligned_label_ids += [-100] * (max_length - len(aligned_label_ids))
     return aligned_label_ids
 # create tokenize function
 def tokenize_function(examples):
     inputs = tokenizer(
         examples['mbert_tokens'],
         is_split_into_words=True,
         truncation=True,
-        max_length=512,
-        padding="max_length"
-    )
-    word_ids = inputs.word_ids()
-    inputs["labels"] = [
-        align_labels_with_tokens(labels, word_ids, tokenizer.model_max_length)
-        for labels in examples['mbert_token_classes']
-    ]
     return inputs
 # tokenize training and validation datasets
@@ -112,43 +111,46 @@ tokenized_data = raw_dataset.map(
     batched=True)
 tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 # data collator
-data_collator = DataCollatorForTokenClassification(
-    tokenizer, padding=True, truncation=True, max_length=512
-)
 st.write(tokenized_data["train"][:2]["labels"])
 # Function to evaluate model performance
 def evaluate_model(model, dataloader, device):
-    model.eval()
-    all_preds, all_labels = [], []
     with torch.no_grad():
         for batch in dataloader:
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
-            labels = batch['labels'].to(device)
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs.logits
-            preds = torch.argmax(logits, dim=-1)
-            # Mask out padding tokens (-100 in labels)
-            mask = labels != -100
-            valid_preds = preds[mask]
-            valid_labels = labels[mask]
-            all_preds.extend(valid_preds.cpu().numpy())
-            all_labels.extend(valid_labels.cpu().numpy())
-    # Convert to numpy arrays for metrics calculation
-    all_preds = np.array(all_preds)
-    all_labels = np.array(all_labels)
     accuracy = accuracy_score(all_labels, all_preds)
-    precision, recall, f1, _ = precision_recall_fscore_support(
-        all_labels, all_preds, average='micro'
-    )
     return accuracy, precision, recall, f1

 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
+def align_labels_with_tokens(labels):
+    """Convert one example's string tags into label ids framed by -100.
+
+    Args:
+        labels: sequence of string tags for one example. After a leading
+            "B-" is rewritten to "I-", each tag must be a key of the
+            module-level ``label2id`` mapping.
+
+    Returns:
+        A list ``[-100, id_1, ..., id_n, -100]``. The two -100 entries are
+        placeholders for the tokens the tokenizer adds at both ends
+        (presumably [CLS]/[SEP] -- confirm against the tokenizer in use).
+
+    Raises:
+        KeyError: if a (rewritten) tag is not present in ``label2id``.
+
+    NOTE(review): every "B-" tag is rewritten to "I-", not only the tags of
+    tokens inside a word as the comment above this function describes --
+    confirm this is intended.
+    """
     aligned_label_ids = []
+    aligned_label_ids.append(-100)
+    for label in labels:
+        if label.startswith("B-"):
+            # Rewrite only the leading prefix; str.replace would also alter
+            # any later "B-" inside the tag (e.g. "B-SUB-X" -> "I-SUI-X").
+            label = "I-" + label[2:]
+        aligned_label_ids.append(label2id[label])
+    aligned_label_ids.append(-100)
     return aligned_label_ids
 # create tokenize function
 def tokenize_function(examples):
+    """Tokenize a batch of pre-split examples and attach per-token labels.
+
+    Args:
+        examples: batch mapping with 'mbert_tokens' (one list of words per
+            example) and 'mbert_token_classes' (one list of string tags per
+            example, parallel to the words).
+
+    Returns:
+        The tokenizer output with an added "labels" entry: one list of label
+        ids per example, the same length as that example's input_ids.
+
+    Labels are aligned through the tokenizer's word ids, so they stay in step
+    with input_ids when a sequence is truncated to max_length, padded, or a
+    word is split into several sub-tokens. Positions with no source word
+    (special tokens, padding) get -100.
+    NOTE(review): word_ids() needs a fast tokenizer -- confirm.
+    """
     inputs = tokenizer(
         examples['mbert_tokens'],
         is_split_into_words=True,
+        padding=True,
         truncation=True,
+        max_length=512)
+    new_labels = []
+    for i, labels in enumerate(examples['mbert_token_classes']):
+        # align_labels_with_tokens frames the ids with one -100 at each end;
+        # strip those to get exactly one label id per input word.
+        word_label_ids = align_labels_with_tokens(labels)[1:-1]
+        new_labels.append([
+            -100 if word_id is None else word_label_ids[word_id]
+            for word_id in inputs.word_ids(batch_index=i)
+        ])
+    inputs["labels"] = new_labels
     return inputs
 # tokenize training and validation datasets
     batched=True)
+# return only the model inputs and labels, formatted as torch tensors
 tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 # data collator
+data_collator = DataCollatorForTokenClassification(tokenizer)
+# debug output: labels of the first two training examples
+# (presumably `st` is Streamlit -- confirm against the file's imports)
 st.write(tokenized_data["train"][:2]["labels"])
 # Function to evaluate model performance
 def evaluate_model(model, dataloader, device):
+    """Compute token-level accuracy, precision, recall and F1 for a model.
+
+    Args:
+        model: callable as ``model(input_ids, attention_mask=...)`` whose
+            output exposes ``.logits``.
+        dataloader: iterable of batches, each a mapping with 'input_ids',
+            'attention_mask' and 'labels' tensors.
+        device: device the batch tensors are moved to before the forward pass.
+
+    Returns:
+        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
+        are micro-averaged.
+
+    Positions labelled -100 (special tokens and padding) are excluded from
+    all metrics.
+    """
+    model.eval()  # Set model to evaluation mode
+    all_preds = []
+    all_labels = []
+    # Disable gradient calculations
     with torch.no_grad():
         for batch in dataloader:
             input_ids = batch['input_ids'].to(device)
             attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
+            # Forward pass to get logits
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs.logits
+            # Get predictions
+            preds = torch.argmax(logits, dim=-1)
+            # Keep only positions that carry a real label; -100 marks tokens
+            # that must not count towards the metrics.
+            mask = labels != -100
+            all_preds.append(preds[mask].cpu().numpy())
+            all_labels.append(labels[mask].cpu().numpy())
+    # Masking yields one 1-D array per batch, so batches with different
+    # padded lengths can be joined safely.
+    all_preds = np.concatenate(all_preds)
+    all_labels = np.concatenate(all_labels)
+    # Calculate evaluation metrics
+    print("evaluate_model sizes")
+    print("Shape of preds:", all_preds.shape)
+    print("Shape of labels:", all_labels.shape)
     accuracy = accuracy_score(all_labels, all_preds)
+    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
     return accuracy, precision, recall, f1