Fix bug
tokenization_chatglm.py (+8, -0)
@@ -176,6 +176,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
         mask_token='[MASK]',
         gmask_token='[gMASK]',
         padding_side="left",
+        pad_token="<pad>",
+        unk_token="<unk>",
         num_image_tokens=20000,
         **kwargs
     ) -> None:
@@ -188,6 +190,8 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
             end_token=end_token,
             mask_token=mask_token,
             gmask_token=gmask_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
             num_image_tokens=num_image_tokens,
             **kwargs
         )
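The first two hunks declare pad_token and unk_token defaults and forward them into PreTrainedTokenizer.__init__; without that, the special tokens never register on the instance and tokenizer.pad_token comes back unset, so padding a batch fails. A minimal sanity check, assuming a checkpoint that ships this tokenizer (the repo path below is a placeholder, not taken from this commit):

from transformers import AutoTokenizer

# Placeholder path; any checkpoint carrying the fixed tokenization_chatglm.py
# should register both special tokens via the forwarded kwargs.
tokenizer = AutoTokenizer.from_pretrained("path/to/chatglm-checkpoint", trust_remote_code=True)
print(tokenizer.pad_token, tokenizer.unk_token)  # expected: <pad> <unk>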
@@ -402,6 +406,10 @@ class ChatGLMTokenizer(PreTrainedTokenizer):
                 encoded_inputs["attention_mask"] = attention_mask
 
             if "position_ids" not in encoded_inputs:
+                if bos_token_id in required_input:
+                    context_length = required_input.index(bos_token_id)
+                else:
+                    context_length = seq_length
                 position_ids = np.arange(seq_length, dtype=np.int64)
                 mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
                 if mask_token in required_input:
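The third hunk restores the context_length computation inside _pad: the lines just past the visible context presumably index position_ids with it, so before this fix the branch would hit an undefined name. A standalone sketch of the repaired logic, assuming the follow-on lines (mask_position and the position_ids overwrite, not shown in this diff) match the stock ChatGLM implementation:

import numpy as np

def build_position_ids(required_input, bos_token_id, mask_token_id, gmask_token_id):
    # Sketch only; mirrors the hunk, not the full ChatGLMTokenizer._pad.
    seq_length = len(required_input)
    # The restored block: everything before [BOS] counts as prompt context.
    if bos_token_id in required_input:
        context_length = required_input.index(bos_token_id)
    else:
        context_length = seq_length
    position_ids = np.arange(seq_length, dtype=np.int64)
    mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
    if mask_token in required_input:
        # Assumed follow-on lines: positions after the context all point
        # at the mask token, per the usual ChatGLM position-id scheme.
        mask_position = required_input.index(mask_token)
        position_ids[context_length:] = mask_position
    return position_ids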