Varun Wadhwa committed on
Commit
bfb1e05
·
unverified ·
1 Parent(s): 933c489
Files changed (1) hide show
  1. app.py +17 -7
app.py CHANGED
@@ -78,12 +78,21 @@ print(raw_dataset.column_names)
78
  # function to align labels with tokens
79
  # --> special tokens: -100 label id (ignored by cross entropy),
80
  # --> if tokens are inside a word, replace 'B-' with 'I-'
81
- def align_labels_with_tokens(labels):
82
  aligned_label_ids = []
83
- for i, label in enumerate(labels):
84
- if label.startswith("B-"):
85
- label = label.replace("B-", "I-")
86
- aligned_label_ids.append(label2id[label])
 
 
 
 
 
 
 
 
 
87
  return aligned_label_ids
88
 
89
  # create tokenize function
@@ -97,8 +106,9 @@ def tokenize_function(examples):
97
  padding=True,
98
  truncation=True,
99
  max_length=512)
100
- for _, labels in enumerate(examples['mbert_token_classes']):
101
- new_labels.append(align_labels_with_tokens(labels))
 
102
  print("Printing partial input with tokenized output")
103
  print(inputs.tokens()[:1000])
104
  print(inputs.word_ids()[:1000])
 
78
  # function to align labels with tokens
79
  # --> special tokens: -100 label id (ignored by cross entropy),
80
  # --> if tokens are inside a word, replace 'B-' with 'I-'
81
def align_labels_with_tokens(label, word_ids):
    """Align word-level NER labels with a tokenizer's sub-word tokens.

    Args:
        label: sequence of label strings, one per word of the original
            example (presumably B-/I-/O tags — confirm against the dataset).
        word_ids: result of ``BatchEncoding.word_ids()`` — one entry per
            token, either the index of the source word or ``None`` for
            special tokens.

    Returns:
        A list with one entry per token: the word's label for the first
        sub-token of each word, and ``-100`` (ignored by cross-entropy)
        for special tokens and continuation sub-tokens.

    NOTE(review): this returns the raw label *strings* mixed with -100;
    downstream training usually needs integer ids (e.g. via ``label2id``)
    — verify against how the result is consumed.
    """
    aligned_label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens ([CLS], [SEP], padding) carry no label.
            aligned_label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First sub-token of a word: take that word's label.
            # BUG FIX: the original called label.startswith("B-") and
            # label.replace("B-", "I-") on the whole label sequence —
            # an AttributeError for a list — before indexing it. The
            # B-→I- rewrite was also dead logic here, since continuation
            # sub-tokens are assigned -100 below, never an I- tag.
            aligned_label_ids.append(label[word_idx])
        else:
            # Continuation sub-token of the same word: ignore in the loss.
            aligned_label_ids.append(-100)
        previous_word_idx = word_idx
    return aligned_label_ids
97
 
98
  # create tokenize function
 
106
  padding=True,
107
  truncation=True,
108
  max_length=512)
109
+ for i, label in enumerate(examples['mbert_token_classes']):
110
+ word_ids = inputs.word_ids(batch_index=i)
111
+ new_labels.append(align_labels_with_tokens(label, word_ids))
112
  print("Printing partial input with tokenized output")
113
  print(inputs.tokens()[:1000])
114
  print(inputs.word_ids()[:1000])