Unable to load tokenizer
#5 opened by abhinavkulkarni
import torch
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer

model_id = "togethercomputer/StripedHyena-Nous-7B"

# Config
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Tokenizer
try:
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
except:
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
I get the following error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[4], line 3
2 try:
----> 3 tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
4 except:
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/configuration_utils.py:265, in PretrainedConfig.__getattribute__(self, key)
264 key = super().__getattribute__("attribute_map")[key]
--> 265 return super().__getattribute__(key)
AttributeError: 'StripedHyenaConfig' object has no attribute 'tokenizer_name'
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
Cell In[4], line 5
3 tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, trust_remote_code=True)
4 except:
----> 5 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:787, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
783 if tokenizer_class is None:
784 raise ValueError(
785 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
786 )
--> 787 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
789 # Otherwise we have to be creative.
790 # if model is an encoder decoder, the encoder tokenizer class is used by default
791 if isinstance(config, EncoderDecoderConfig):
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2028, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, *init_inputs, **kwargs)
2025 else:
2026 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2028 return cls._from_pretrained(
2029 resolved_vocab_files,
2030 pretrained_model_name_or_path,
2031 init_configuration,
2032 *init_inputs,
2033 token=token,
2034 cache_dir=cache_dir,
2035 local_files_only=local_files_only,
2036 _commit_hash=commit_hash,
2037 _is_local=is_local,
2038 **kwargs,
2039 )
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:2260, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, *init_inputs, **kwargs)
2258 # Instantiate the tokenizer.
2259 try:
-> 2260 tokenizer = cls(*init_inputs, **init_kwargs)
2261 except OSError:
2262 raise OSError(
2263 "Unable to load vocabulary from file. "
2264 "Please check that the provided vocabulary is accessible and not corrupted."
2265 )
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/llama/tokenization_llama.py:178, in LlamaTokenizer.__init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, use_default_system_prompt, spaces_between_special_tokens, legacy, **kwargs)
176 self.add_eos_token = add_eos_token
177 self.use_default_system_prompt = use_default_system_prompt
--> 178 self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
180 super().__init__(
181 bos_token=bos_token,
182 eos_token=eos_token,
(...)
192 **kwargs,
193 )
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/transformers/models/llama/tokenization_llama.py:203, in LlamaTokenizer.get_spm_processor(self, from_slow)
201 tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
202 if self.legacy or from_slow: # no dependency on protobuf
--> 203 tokenizer.Load(self.vocab_file)
204 return tokenizer
206 with open(self.vocab_file, "rb") as f:
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/sentencepiece/__init__.py:905, in SentencePieceProcessor.Load(self, model_file, model_proto)
903 if model_proto:
904 return self.LoadFromSerializedProto(model_proto)
--> 905 return self.LoadFromFile(model_file)
File ~/miniconda3/envs/transformers/lib/python3.10/site-packages/sentencepiece/__init__.py:310, in SentencePieceProcessor.LoadFromFile(self, arg)
309 def LoadFromFile(self, arg):
--> 310 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
TypeError: not a string
Thanks for the model update!

When loading the config with AutoConfig, the attribute config.tokenizer_name does not exist.
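The TypeError at the bottom of the trace ("not a string") also suggests the slow LlamaTokenizer is being constructed with a vocab_file of None, presumably because the repo ships a fast tokenizer.json but no SentencePiece tokenizer.model (an assumption; the repo contents are easy to verify). A quick sketch for checking which tokenizer files the repo actually contains, using huggingface_hub:

from huggingface_hub import list_repo_files

# List the repo's files and keep only tokenizer-related artifacts,
# e.g. tokenizer.json (fast) vs. tokenizer.model (SentencePiece).
files = list_repo_files("togethercomputer/StripedHyena-Nous-7B")
print([f for f in files if "token" in f.lower()])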
Also, is there a reason why you pass use_fast=False in the except block but not in the initial try block? The tokenizer loads as expected once use_fast=False is removed from the except block, where you load it from the model_id.
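For reference, a minimal sketch of the working load as suggested above, with both the config.tokenizer_name lookup and use_fast=False dropped:

from transformers import AutoTokenizer

model_id = "togethercomputer/StripedHyena-Nous-7B"

# The fast tokenizer resolves directly from the model repo,
# so neither config.tokenizer_name nor use_fast=False is needed.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

print(tokenizer("Hello, world!").input_ids)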
abhinavkulkarni changed discussion status to closed