Fix generation with latest transformers

#1
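Latest transformers removed Cache.get_max_length() in favor of Cache.get_max_cache_shape(), and decode() may now receive token ids as a torch.Tensor instead of a plain list; this updates both call sites so generation works again.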
Files changed (2)
  1. modeling_deepseek.py +1 -1
  2. tokenization_moonshot.py +4 -1
modeling_deepseek.py CHANGED
@@ -1653,7 +1653,7 @@ class DeepseekV3ForCausalLM(DeepseekV3PreTrainedModel):
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
                 past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                max_cache_length = past_key_values.get_max_cache_shape()
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
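Background: Cache.get_max_length() was deprecated in transformers v4.46 in favor of Cache.get_max_cache_shape() and later removed, which is what broke generation here. A minimal back-compat sketch, not part of this PR, that would keep the method working on both older and newer releases by probing for the new API first:

# Back-compat sketch (assumption: past_key_values comes from the enclosing
# prepare_inputs_for_generation, as in the file above).
from transformers.cache_utils import Cache

if isinstance(past_key_values, Cache):
    cache_length = past_key_values.get_seq_length()
    past_length = past_key_values.seen_tokens
    if hasattr(past_key_values, "get_max_cache_shape"):
        max_cache_length = past_key_values.get_max_cache_shape()  # transformers >= 4.46
    else:
        max_cache_length = past_key_values.get_max_length()       # older releases
else:
    # Legacy tuple-of-tuples cache format
    cache_length = past_length = past_key_values[0][0].shape[2]
    max_cache_length = None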
tokenization_moonshot.py CHANGED
@@ -12,6 +12,7 @@ from typing import (
     Union,
     Optional,
 )
+import torch
 from shutil import copyfile
 import numpy as np
 from tiktoken.load import load_tiktoken_bpe
@@ -226,8 +227,10 @@ class TikTokenTokenizer(PreTrainedTokenizer):
         if len(kwargs) > 0:
             return super().decode(token_ids, **kwargs)
 
-        if type(token_ids) is int:
+        if isinstance(token_ids, int):
             token_ids = [token_ids]
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.tolist()
 
         return self.model.decode(cast(List[int], token_ids))
 
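For reference, a hedged usage sketch of why the tensor branch matters (the repo id is a placeholder, not something this PR pins down): generate() returns a torch.Tensor, while tiktoken's decode() only accepts a list of ints, so the patched decode() converts tensors with .tolist() first.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "<model-repo>"  # placeholder: any checkpoint shipping this tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=16)  # torch.Tensor, shape (1, seq_len)

# All three input kinds are now accepted by decode():
print(tokenizer.decode(out[0]))           # 1-D tensor -> .tolist() -> tiktoken
print(tokenizer.decode(out[0].tolist()))  # plain list path (unchanged)
print(tokenizer.decode(int(out[0][-1])))  # single int -> wrapped in a list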