Varun Wadhwa committed on
Commit
bfb1e05
·
unverified ·
1 Parent(s): 933c489
Files changed (1) hide show
  1. app.py +17 -7
app.py CHANGED
@@ -78,12 +78,21 @@ print(raw_dataset.column_names)
78
  # function to align labels with tokens
79
  # --> special tokens: -100 label id (ignored by cross entropy),
80
  # --> if tokens are inside a word, replace 'B-' with 'I-'
81
- def align_labels_with_tokens(labels):
82
  aligned_label_ids = []
83
- for i, label in enumerate(labels):
84
- if label.startswith("B-"):
85
- label = label.replace("B-", "I-")
86
- aligned_label_ids.append(label2id[label])
 
 
 
 
 
 
 
 
 
87
  return aligned_label_ids
88
 
89
  # create tokenize function
@@ -97,8 +106,9 @@ def tokenize_function(examples):
97
  padding=True,
98
  truncation=True,
99
  max_length=512)
100
- for _, labels in enumerate(examples['mbert_token_classes']):
101
- new_labels.append(align_labels_with_tokens(labels))
 
102
  print("Printing partial input with tokenized output")
103
  print(inputs.tokens()[:1000])
104
  print(inputs.word_ids()[:1000])
 
78
  # function to align labels with tokens
79
  # --> special tokens: -100 label id (ignored by cross entropy),
80
  # --> if tokens are inside a word, replace 'B-' with 'I-'
81
def align_labels_with_tokens(label, word_ids):
    """Align word-level NER labels with a tokenizer's sub-word tokens.

    Args:
        label: sequence of label strings, one per word of the original
            example (presumably B-/I-/O tags — confirm against the dataset).
        word_ids: result of ``BatchEncoding.word_ids()`` — one entry per
            token, either the index of the source word or ``None`` for
            special tokens.

    Returns:
        A list with one entry per token: the word's label for the first
        sub-token of each word, and ``-100`` (ignored by cross-entropy)
        for special tokens and continuation sub-tokens.

    NOTE(review): this returns the raw label *strings* mixed with -100;
    downstream training usually needs integer ids (e.g. via ``label2id``)
    — verify against how the result is consumed.
    """
    aligned_label_ids = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            # Special tokens ([CLS], [SEP], padding) carry no label.
            aligned_label_ids.append(-100)
        elif word_idx != previous_word_idx:
            # First sub-token of a word: take that word's label.
            # BUG FIX: the original called label.startswith("B-") and
            # label.replace("B-", "I-") on the whole label sequence —
            # an AttributeError for a list — before indexing it. The
            # B-→I- rewrite was also dead logic here, since continuation
            # sub-tokens are assigned -100 below, never an I- tag.
            aligned_label_ids.append(label[word_idx])
        else:
            # Continuation sub-token of the same word: ignore in the loss.
            aligned_label_ids.append(-100)
        previous_word_idx = word_idx
    return aligned_label_ids
97
 
98
  # create tokenize function
 
106
  padding=True,
107
  truncation=True,
108
  max_length=512)
109
+ for i, label in enumerate(examples['mbert_token_classes']):
110
+ word_ids = inputs.word_ids(batch_index=i)
111
+ new_labels.append(align_labels_with_tokens(label, word_ids))
112
  print("Printing partial input with tokenized output")
113
  print(inputs.tokens()[:1000])
114
  print(inputs.word_ids()[:1000])