tokenizer error
Code

from transformers import AutoTokenizer

MODEL_NAME = "yandex/YandexGPT-5-Lite-8B-pretrain"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, legacy=False)

raises the error: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted.
transformers 4.49.0
Hi! Please post the full error log.
It raises the error: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted.
Is protobuf installed?
https://github.com/protocolbuffers/protobuf/tree/master/python#installation
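For example, a quick way to check the installed version from Python (any recent protobuf release exposes it this way):

import google.protobuf
print(google.protobuf.__version__)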
protobuf is installed
OSError Traceback (most recent call last)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\tokenization_utils_base.py:2292, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2291 try:
-> 2292 tokenizer = cls(*init_inputs, **init_kwargs)
2293 except import_protobuf_decode_error():
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\models\llama\tokenization_llama_fast.py:157, in LlamaTokenizerFast.__init__(self, vocab_file, tokenizer_file, clean_up_tokenization_spaces, unk_token, bos_token, eos_token, add_bos_token, add_eos_token, use_default_system_prompt, legacy, add_prefix_space, **kwargs)
155 kwargs["from_slow"] = True
--> 157 super().__init__(
158 vocab_file=vocab_file,
159 tokenizer_file=tokenizer_file,
160 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
161 unk_token=unk_token,
162 bos_token=bos_token,
163 eos_token=eos_token,
164 add_bos_token=add_bos_token,
165 add_eos_token=add_eos_token,
166 use_default_system_prompt=use_default_system_prompt,
167 add_prefix_space=add_prefix_space,
168 legacy=legacy,
169 **kwargs,
170 )
171 self._add_bos_token = add_bos_token
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\tokenization_utils_fast.py:120, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
118 elif slow_tokenizer:
119 # We need to convert a slow tokenizer to build the backend
--> 120 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
121 elif gguf_file is not None:
122 # We need to convert a slow tokenizer to build the backend
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\convert_slow_tokenizer.py:1717, in convert_slow_tokenizer(transformer_tokenizer, from_tiktoken)
1716 converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
-> 1717 return converter_class(transformer_tokenizer).converted()
1719 else:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\convert_slow_tokenizer.py:647, in SpmConverter.converted(self)
646 def converted(self) -> Tokenizer:
--> 647 tokenizer = self.tokenizer(self.proto)
649 # Tokenizer assemble
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\convert_slow_tokenizer.py:588, in SpmConverter.tokenizer(self, proto)
587 elif model_type == 2:
--> 588 _, merges = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
589 bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\convert_slow_tokenizer.py:92, in SentencePieceExtractor.__init__(self, model)
91 self.sp = SentencePieceProcessor()
---> 92 self.sp.Load(model)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sentencepiece\__init__.py:961, in SentencePieceProcessor.Load(self, model_file, model_proto)
960 return self.LoadFromSerializedProto(model_proto)
--> 961 return self.LoadFromFile(model_file)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sentencepiece\__init__.py:316, in SentencePieceProcessor.LoadFromFile(self, arg)
315 def LoadFromFile(self, arg):
--> 316 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
OSError: Not found: "C:\Users\Сергей\.cache\huggingface\hub\models--yandex--YandexGPT-5-Lite-8B-pretrain\snapshots\e080b39f663a0503d8607e4ecba0a4cfae429a5a\tokenizer.model": No such file or directory Error #2
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
Cell In[3], line 3
1 MODEL_NAME = "yandex/YandexGPT-5-Lite-8B-pretrain"
----> 3 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, legacy=False)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\models\auto\tokenization_auto.py:944, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
940 if tokenizer_class is None:
941 raise ValueError(
942 f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
943 )
--> 944 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
946 # Otherwise we have to be creative.
947 # if model is an encoder decoder, the encoder tokenizer class is used by default
948 if isinstance(config, EncoderDecoderConfig):
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\tokenization_utils_base.py:2052, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2049 else:
2050 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2052 return cls._from_pretrained(
2053 resolved_vocab_files,
2054 pretrained_model_name_or_path,
2055 init_configuration,
2056 *init_inputs,
2057 token=token,
2058 cache_dir=cache_dir,
2059 local_files_only=local_files_only,
2060 _commit_hash=commit_hash,
2061 _is_local=is_local,
2062 trust_remote_code=trust_remote_code,
2063 **kwargs,
2064 )
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\tokenization_utils_base.py:2307, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2305 return False
2306 except OSError:
-> 2307 raise OSError(
2308 "Unable to load vocabulary from file. "
2309 "Please check that the provided vocabulary is accessible and not corrupted."
2310 )
2312 if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size:
2313 logger.info(
2314 "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
2315 " fine-tuned or trained."
2316 )
OSError: Unable to load vocabulary from file. Please check that the provided vocabulary is accessible and not corrupted.
OSError: Not found: "C:\Users\Сергей\.cache\huggingface\hub\models--yandex--YandexGPT-5-Lite-8B-pretrain\snapshots\e080b39f663a0503d8607e4ecba0a4cfae429a5a\tokenizer.model": No such file or directory Error #2
It's hard to tell whether the error comes from the file being missing or from the file itself. Try the following options (a rough code sketch follows the list):
- Check that C:\Users\Сергей\.cache\huggingface\hub\models--yandex--YandexGPT-5-Lite-8B-pretrain\snapshots\e080b39f663a0503d8607e4ecba0a4cfae429a5a\tokenizer.model was actually downloaded.
- Try cloning the repository locally (via git-lfs) and loading the tokenizer with the path to the model folder passed as MODEL_NAME.
- Try LlamaTokenizer("path_to_repo/tokenizer.model", legacy=True, use_fast=False)
- Try sentencepiece (there is an example in the model card)
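A minimal sketch of these options ("path_to_repo" is a placeholder for wherever the repository was cloned, not a path from this thread; the encode call at the end is just a smoke test):

import os
from transformers import AutoTokenizer, LlamaTokenizer
import sentencepiece as spm

# 1. Check that the cached tokenizer.model actually exists
cached = (r"C:\Users\Сергей\.cache\huggingface\hub"
          r"\models--yandex--YandexGPT-5-Lite-8B-pretrain"
          r"\snapshots\e080b39f663a0503d8607e4ecba0a4cfae429a5a\tokenizer.model")
print(os.path.exists(cached))

# 2. Load the tokenizer from a locally cloned repository folder
tokenizer = AutoTokenizer.from_pretrained("path_to_repo")

# 3. Build the slow SentencePiece-based tokenizer straight from tokenizer.model
tokenizer = LlamaTokenizer("path_to_repo/tokenizer.model", legacy=True)

# 4. Load the raw SentencePiece model directly
sp = spm.SentencePieceProcessor(model_file="path_to_repo/tokenizer.model")
print(sp.encode("hello", out_type=str))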