Fix generation with latest transformers
#1
by kylesayrs
- modeling_deepseek.py +1 -1
- tokenization_moonshot.py +4 -1
modeling_deepseek.py
CHANGED
@@ -1653,7 +1653,7 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
         if isinstance(past_key_values, Cache):
             cache_length = past_key_values.get_seq_length()
             past_length = past_key_values.seen_tokens
-            max_cache_length = past_key_values.get_max_length()
+            max_cache_length = past_key_values.get_max_cache_shape()
         else:
             cache_length = past_length = past_key_values[0][0].shape[2]
             max_cache_length = None
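
Background for the one-line fix above: recent transformers releases removed Cache.get_max_length() in favor of Cache.get_max_cache_shape(), so on current versions the original call fails with an AttributeError during generate(). A minimal version-tolerant sketch, not part of this PR (the helper name _max_cache_length is hypothetical):

# Hypothetical helper, sketched for illustration; assumes past_key_values is a
# transformers Cache instance exposing one of the two accessors.
def _max_cache_length(past_key_values):
    if hasattr(past_key_values, "get_max_cache_shape"):
        # Newer transformers: replacement for the removed get_max_length().
        return past_key_values.get_max_cache_shape()
    # Older transformers still expose the original accessor.
    return past_key_values.get_max_length()

Both accessors return None for dynamically sized caches, so the surrounding max_cache_length logic behaves the same on either transformers version.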
tokenization_moonshot.py
CHANGED
@@ -12,6 +12,7 @@ from typing import (
     Union,
     Optional,
 )
+import torch
 from shutil import copyfile
 import numpy as np
 from tiktoken.load import load_tiktoken_bpe
@@ -226,8 +227,10 @@ class TikTokenTokenizer(PreTrainedTokenizer):
         if len(kwargs) > 0:
             return super().decode(token_ids, **kwargs)
 
-        if type(token_ids) is int:
+        if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.tolist()
 
         return self.model.decode(cast(List[int], token_ids))
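
The tokenizer change makes decode() accept what generate() actually returns: generated ids come back as a torch.Tensor, while tiktoken's decode() only takes a plain list of ints. A short usage sketch, assuming tok is an instance of the patched TikTokenTokenizer (the variable name is illustrative):

import torch

# With the patch, all three call styles reach tiktoken as a list of ints:
tok.decode(42)                       # bare int is wrapped into [42]
tok.decode([42, 43])                 # list of ints passes through unchanged
tok.decode(torch.tensor([42, 43]))   # tensor row from generate() is converted via .tolist()

Converting up front with .tolist() keeps these calls on the direct tiktoken path rather than routing them through the generic super().decode() fallback that is only used when extra kwargs are passed.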