How much VRAM does this model need?
I have an Nvidia A10 (24GB of VRAM) but I'm getting out of memory errors.
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
def load_model(model_name: str):
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
with torch.device("cuda:0"):
model = transformers.AutoModelForCausalLM.from_pretrained(model_name).eval()
return tokenizer, model
tokenizer, model = load_model(model_name)
OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 13.00 MiB is free. Including non-PyTorch memory, this process has 21.96 GiB memory in use. Of the allocated memory 21.58 GiB is allocated by PyTorch, and 99.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF...
I assumed 24GB would be enough for a 7B model. How much VRAM do I need to run this model?
You're likely loading it in fp32. In fp32 it needs 28GB; in fp16/bf16, 14GB; in 8-bit, 7GB; and in 4-bit, ~4GB. Add about 1GB on top of each of those for CUDA kernels.
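The numbers are just parameter count times bytes per parameter. A rough back-of-the-envelope check (Mistral-7B is ~7.24B parameters; these figures cover the weights only, not the KV cache used during generation):

# Approximate VRAM needed just for the weights
params = 7.24e9
print(f"fp32 : {params * 4 / 1e9:.0f} GB")    # ~29 GB
print(f"bf16 : {params * 2 / 1e9:.0f} GB")    # ~14 GB
print(f"int8 : {params * 1 / 1e9:.0f} GB")    # ~7 GB
print(f"4-bit: {params * 0.5 / 1e9:.0f} GB")  # ~4 GB
# plus roughly 1 GB for CUDA kernels/context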
@teknium Thanks for the response. This may be a naive question, but how do I load in 16/8-bit?
I've tried loading in bf16:
model_name = "teknium/OpenHermes-2.5-Mistral-7B"
def load_model(model_name: str):
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
with torch.device("cuda:0"):
model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
return tokenizer, model
tokenizer, model = load_model(model_name)
which gave:
Loading checkpoint shards: 0%| | 0/2 [00:49<?, ?it/s]
---------------------------------------------------------------------------
OutOfMemoryError Traceback (most recent call last)
Cell In[3], line 19
15 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
17 return model
---> 19 model = load_model(model_name)
Cell In[3], line 15
11 def load_model(model_name: str):
12 #tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
14 with torch.device("cuda:0"):
---> 15 model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
17 return model
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:566, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
564 elif type(config) in cls._model_mapping.keys():
565 model_class = _get_model_class(config, cls._model_mapping)
--> 566 return model_class.from_pretrained(
567 pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
568 )
569 raise ValueError(
570 f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
571 f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
572 )
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:3706, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
3697 if dtype_orig is not None:
3698 torch.set_default_dtype(dtype_orig)
3699 (
3700 model,
3701 missing_keys,
3702 unexpected_keys,
3703 mismatched_keys,
3704 offload_index,
3705 error_msgs,
-> 3706 ) = cls._load_pretrained_model(
3707 model,
3708 state_dict,
3709 loaded_state_dict_keys, # XXX: rename?
3710 resolved_archive_file,
3711 pretrained_model_name_or_path,
3712 ignore_mismatched_sizes=ignore_mismatched_sizes,
3713 sharded_metadata=sharded_metadata,
3714 _fast_init=_fast_init,
3715 low_cpu_mem_usage=low_cpu_mem_usage,
3716 device_map=device_map,
3717 offload_folder=offload_folder,
3718 offload_state_dict=offload_state_dict,
3719 dtype=torch_dtype,
3720 is_quantized=(getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES),
3721 keep_in_fp32_modules=keep_in_fp32_modules,
3722 )
3724 model.is_loaded_in_4bit = load_in_4bit
3725 model.is_loaded_in_8bit = load_in_8bit
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:4091, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, is_quantized, keep_in_fp32_modules)
4089 if shard_file in disk_only_shard_files:
4090 continue
-> 4091 state_dict = load_state_dict(shard_file)
4093 # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
4094 # matching the weights in the model.
4095 mismatched_keys += _find_mismatched_keys(
4096 state_dict,
4097 model_state_dict,
(...)
4101 ignore_mismatched_sizes,
4102 )
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:510, in load_state_dict(checkpoint_file)
505 if metadata.get("format") not in ["pt", "tf", "flax"]:
506 raise OSError(
507 f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
508 "you save your model with the `save_pretrained` method."
509 )
--> 510 return safe_load_file(checkpoint_file)
511 try:
512 if (
513 is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0
514 ) or (is_fsdp_enabled() and not is_local_dist_rank_0()):
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/safetensors/torch.py:310, in load_file(filename, device)
308 with safe_open(filename, framework="pt", device=device) as f:
309 for k in f.keys():
--> 310 result[k] = f.get_tensor(k)
311 return result
File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/utils/_device.py:77, in DeviceContext.__torch_function__(self, func, types, args, kwargs)
75 if func in _device_constructors() and kwargs.get('device') is None:
76 kwargs['device'] = self.device
---> 77 return func(*args, **kwargs)
OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 59.00 MiB is free. Including non-PyTorch memory, this process has 21.92 GiB memory in use. Of the allocated memory 21.44 GiB is allocated by PyTorch, and 203.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
For reference/context, I can load the quantised GGUF version of this model without issue.
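One thing I notice in the traceback: because of the `with torch.device("cuda:0")` context, safetensors puts each checkpoint shard straight onto the GPU while the freshly initialised bf16 model is already sitting there, so peak usage is roughly model + one full shard rather than just the model. A sketch of what I plan to try instead (assuming accelerate is installed; not tested yet):

import torch
import transformers

model_name = "teknium/OpenHermes-2.5-Mistral-7B"

def load_model(model_name: str):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    # No torch.device context: low_cpu_mem_usage + device_map lets transformers
    # place weights shard by shard instead of holding the initialised model and
    # a whole checkpoint shard on the GPU at the same time.
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        device_map={"": 0},      # everything on GPU 0 (requires accelerate)
        # load_in_8bit=True,     # alternative: ~7GB of weights (requires bitsandbytes)
    ).eval()
    return tokenizer, model

tokenizer, model = load_model(model_name)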
Use this inference code from the repo
https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/transformers_inference.py
Let me know if it still has issues 🤗
I'm using the inference script above with the following prompt:
prompts = [
    """<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Explain how viruses casue diseases<|im_end|>
<|im_start|>assistant""",
]

for chat in prompts:
    print(chat)
    input_ids = tokenizer(chat, return_tensors="pt").input_ids.to("cuda")
    generated_ids = model.generate(input_ids, max_new_tokens=1000, repetition_penalty=1.1, do_sample=False, eos_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True, clean_up_tokenization_space=True)
    print(f"Response: {response}")
After about 13 minutes it generates this error: "IndexError: piece id is out of range."
I've tried 4-bit as well, but still can't get a response. I have an RTX 3080 with 16GB.
Any suggestions on how to make it work? Thank you.
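For context, by "4-bit" I mean loading with a bitsandbytes quantization config roughly along these lines (a sketch of the approach, not the exact cell I ran):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "teknium/OpenHermes-2.5-Mistral-7B"

# 4-bit NF4 quantization: ~4GB of weights, which should fit comfortably in 16GB
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",   # requires accelerate
).eval()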
Can you share the full log
Thank you @teknium for responding so fast. Here's the traceback:
{
"name": "IndexError",
"message": "piece id is out of range.",
"stack": "---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[6], line 14
12 #generated_ids = model.generate(input_ids, max_new_tokens=1000, temperature=0.8, repetition_penalty=1.1, do_sample=True, eos_token_id=tokenizer.eos_token_id)
13 generated_ids = model.generate(input_ids, max_new_tokens=100,repetition_penalty=1.1, do_sample=False, eos_token_id=tokenizer.eos_token_id)
---> 14 response = tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True, clean_up_tokenization_space=True)
15 print(f"Response: {response}")
File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils_base.py:3750, in PreTrainedTokenizerBase.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3747 # Convert inputs to python lists
3748 token_ids = to_py_obj(token_ids)
-> 3750 return self._decode(
3751 token_ids=token_ids,
3752 skip_special_tokens=skip_special_tokens,
3753 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3754 **kwargs,
3755 )
File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils.py:1001, in PreTrainedTokenizer._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
991 def _decode(
992 self,
993 token_ids: List[int],
(...)
997 **kwargs,
998 ) -> str:
999 self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-> 1001 filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
1002 legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
1003 token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
1004 }
1005 # To avoid mixing byte-level and unicode for byte-level BPT
1006 # we need to build string separately for added tokens and byte-level tokens
1007 # cf. https://github.com/huggingface/transformers/issues/1133
File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils.py:982, in PreTrainedTokenizer.convert_ids_to_tokens(self, ids, skip_special_tokens)
980 tokens.append(self._added_tokens_decoder[index].content)
981 else:
--> 982 tokens.append(self._convert_id_to_token(index))
983 return tokens
File /usr/local/lib/python3.9/dist-packages/transformers/models/llama/tokenization_llama.py:280, in LlamaTokenizer._convert_id_to_token(self, index)
278 def _convert_id_to_token(self, index):
279 """Converts an index (integer) in a token (str) using the vocab."""
--> 280 token = self.sp_model.IdToPiece(index)
281 return token
File /usr/local/lib/python3.9/dist-packages/sentencepiece/__init__.py:1045, in _batchnize.<locals>._batched_func(self, arg)
1043 return [_func(self, n) for n in arg]
1044 else:
-> 1045 return _func(self, arg)
File /usr/local/lib/python3.9/dist-packages/sentencepiece/__init__.py:1038, in _batchnize.<locals>._func(v, n)
1036 def _func(v, n):
1037 if type(n) is int and (n < 0 or n >= v.piece_size()):
-> 1038 raise IndexError('piece id is out of range.')
1039 return func(v, n)
IndexError: piece id is out of range."
}
Thank you.