Varun Wadhwa committed
app.py
CHANGED
@@ -78,12 +78,21 @@ print(raw_dataset.column_names)
 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
-def align_labels_with_tokens(
+def align_labels_with_tokens(label, word_ids):
     aligned_label_ids = []
-
-
-
-
+    previous_word_idx = None
+    for word_idx in word_ids:  # Set the special tokens to -100.
+        if word_idx is None:
+            aligned_label_ids.append(-100)
+        elif word_idx != previous_word_idx:  # Only label the first token of a given word.
+            if label.startswith("B-"):
+                print(word_idx)
+                print(label)
+                label = label.replace("B-", "I-")
+            aligned_label_ids.append(label[word_idx])
+        else:
+            aligned_label_ids.append(-100)
+        previous_word_idx = word_idx
     return aligned_label_ids

 # create tokenize function
@@ -97,8 +106,9 @@ def tokenize_function(examples):
         padding=True,
         truncation=True,
         max_length=512)
-    for
-
+    for i, label in enumerate(examples['mbert_token_classes']):
+        word_ids = inputs.word_ids(batch_index=i)
+        new_labels.append(align_labels_with_tokens(label, word_ids))
     print("Printing partial input with tokenized output")
     print(inputs.tokens()[:1000])
     print(inputs.word_ids()[:1000])