Why this checkpoint doesn't have a nemo file?
#3
by
w332323
- opened
How can I load the checkpoint file? I downloaded the files to a local directory, but I don't know how to load them.
My script is as follows:
python /opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py \
rm_model_file=/workspace/llama3_rm_hp2/ \
trainer.num_nodes=1 \
trainer.devices=8 \
++model.tensor_model_parallel_size=8 \
++model.pipeline_model_parallel_size=1 \
inference.micro_batch_size=2 \
inference.port=1424
The error is:
[NeMo I 2024-08-27 12:21:11 tokenizer_utils:182] Getting HuggingFace AutoTokenizer with pretrained_model_name: meta-llama/Meta-Llama-3-70B
Error executing job with overrides: ['rm_model_file=/workspace/llama3_rm_hp2/', 'trainer.num_nodes=1', 'trainer.devices=8', '++model.tensor_model_parallel_size=8', '++model.pipeline_model_parallel_size=1', 'inference.micro_batch_size=2', 'inference.port=1424']
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 198, in _new_conn
sock = connection.create_connection(
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
raise err
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
sock.connect(sa)
OSError: [Errno 101] Network is unreachable
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 793, in urlopen
response = self._make_request(
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 491, in _make_request
raise new_e
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 467, in _make_request
self._validate_conn(conn)
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 1099, in _validate_conn
conn.connect()
File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 616, in connect
self.sock = sock = self._new_conn()
File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 213, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x7fdb75febeb0>: Failed to establish a new connection: [Errno 101] Network is unreachable
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/requests/adapters.py", line 486, in send
resp = conn.urlopen(
File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 847, in urlopen
retries = retries.increment(
File "/usr/local/lib/python3.10/dist-packages/urllib3/util/retry.py", line 515, in increment
raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /meta-llama/Meta-Llama-3-70B/resolve/main/config.json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fdb75febeb0>: Failed to establish a new connection: [Errno 101] Network is unreachable'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 1238, in hf_hub_download
metadata = get_hf_file_metadata(
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 1631, in get_hf_file_metadata
r = _request_wrapper(
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 385, in _request_wrapper
response = _request_wrapper(
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 408, in _request_wrapper
response = get_session().request(method=method, url=url, **params)
File "/usr/local/lib/python3.10/dist-packages/requests/sessions.py", line 589, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.10/dist-packages/requests/sessions.py", line 703, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_http.py", line 67, in send
return super().send(request, *args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/requests/adapters.py", line 519, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /meta-llama/Meta-Llama-3-70B/resolve/main/config.json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fdb75febeb0>: Failed to establish a new connection: [Errno 101] Network is unreachable'))"), '(Request ID: afc672eb-ad18-40fe-b317-56ff8023d552)')
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py", line 385, in cached_file
resolved_file = hf_hub_download(
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_validators.py", line 118, in _inner_fn
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py", line 1371, in hf_hub_download
raise LocalEntryNotFoundError(
huggingface_hub.utils._errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/opt/NeMo/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py", line 67, in __init__
self.tokenizer = AUTOTOKENIZER.from_pretrained(
File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/tokenization_auto.py", line 773, in from_pretrained
config = AutoConfig.from_pretrained(
File "/usr/local/lib/python3.10/dist-packages/transformers/models/auto/configuration_auto.py", line 1100, in from_pretrained
config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/configuration_utils.py", line 634, in get_config_dict
config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/transformers/configuration_utils.py", line 689, in _get_config_dict
resolved_config_file = cached_file(
File "/usr/local/lib/python3.10/dist-packages/transformers/utils/hub.py", line 425, in cached_file
raise EnvironmentError(
OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Meta-Llama-3-70B is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/NeMo-Aligner/examples/nlp/gpt/serve_reward_model.py", line 53, in main
ptl_model = load_from_nemo(reward_model_cls, cfg.model, trainer, strict=True, restore_path=cfg.rm_model_file,)
File "/opt/NeMo-Aligner/nemo_aligner/utils/utils.py", line 93, in load_from_nemo
model = cls.restore_from(
File "/opt/NeMo/nemo/collections/nlp/models/nlp_model.py", line 465, in restore_from
return super().restore_from(
File "/opt/NeMo/nemo/core/classes/modelPT.py", line 449, in restore_from
instance = cls._save_restore_connector.restore_from(
File "/opt/NeMo-Aligner/nemo_aligner/utils/utils.py", line 46, in restore_from
return super().restore_from(*args, **kwargs)
File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 1056, in restore_from
loaded_params = super().load_config_and_state_dict(
File "/opt/NeMo/nemo/core/connectors/save_restore_connector.py", line 164, in load_config_and_state_dict
instance = calling_cls.from_config_dict(config=conf, trainer=trainer)
File "/opt/NeMo/nemo/core/classes/common.py", line 518, in from_config_dict
raise e
File "/opt/NeMo/nemo/core/classes/common.py", line 510, in from_config_dict
instance = cls(cfg=config, trainer=trainer)
File "/opt/NeMo-Aligner/nemo_aligner/models/nlp/gpt/megatron_gpt_regression_reward_model.py", line 39, in __init__
super().__init__(cfg, trainer=trainer)
File "/opt/NeMo-Aligner/nemo_aligner/models/nlp/gpt/megatron_gpt_reward_model.py", line 57, in __init__
super().__init__(cfg, trainer=trainer)
File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 243, in __init__
super().__init__(cfg, trainer=trainer, no_lm_init=True)
File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 213, in __init__
self._build_tokenizer()
File "/opt/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 382, in _build_tokenizer
self.tokenizer = get_nmt_tokenizer(
File "/opt/NeMo/nemo/collections/nlp/modules/common/tokenizer_utils.py", line 183, in get_nmt_tokenizer
return AutoTokenizer(
File "/opt/NeMo/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py", line 82, in __init__
raise ValueError(
ValueError: Unable to instantiate HuggingFace AUTOTOKENIZER for meta-llama/Meta-Llama-3-70B. Exception: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like meta-llama/Meta-Llama-3-70B is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
w332323
changed discussion title from
Why this checkpoint does't have a nemo file?
to Why this checkpoint doesn't have a nemo file?