Update README.md
README.md CHANGED
@@ -58,17 +58,22 @@ from hqq.core.quantize import *
 from hqq.utils.patching import *
 from hqq.utils.generation_hf import HFGenerator
 
-
+#Settings
+###################################################
+backend = "bitblas" #bitblas or gemlite for 2-bit runtime
+compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+device = 'cuda:0'
+cache_dir = '.'
 
 #Load the model
 ###################################################
 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
-model = AutoHQQHFModel.from_quantized(model_id, cache_dir=
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device, adapter='adapter_v0.1.lora').eval();
+tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
 
 #Use optimized inference kernels
 ###################################################
-prepare_for_inference(model, backend=backend
+prepare_for_inference(model, backend=backend) #It takes a while...
 
 #Generate
 ###################################################
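The hunk ends at the #Generate header, so the generation step itself is not visible in this diff. As context, here is a minimal sketch of how the HFGenerator imported above is typically wired to the quantized model and tokenizer from the snippet; the constructor arguments (max_new_tokens, do_sample, compile), the .warmup() call, and the prompt are assumptions rather than part of this commit.

gen = HFGenerator(
    model,                # quantized model patched by prepare_for_inference above
    tokenizer,
    max_new_tokens=1000,  # assumed generation budget
    do_sample=True,       # assumed sampling setting
    compile="partial"     # assumed: partially compile the decoding step for speed
).warmup()                # assumed warm-up pass before the first generation

out = gen.generate("Write an essay about the Great Barrier Reef.", print_tokens=True)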