Update README.md
README.md (changed)
@@ -45,7 +45,7 @@ While this is significantly better than the best 2-bit Llama3-8B model reported
 ## Usage
 First, install the dependencies:
 ```
-pip install hqq
+pip install git+https://github.com/mobiusml/hqq
 pip install bitblas
 ```

@@ -58,23 +58,17 @@ from hqq.core.quantize import *
 from hqq.utils.patching import *
 from hqq.utils.generation_hf import HFGenerator

+backend = 'bitblas' #"bitblas" #None
+
 #Load the model
 ###################################################
 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
-model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
+model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora').eval();
 tokenizer = AutoTokenizer.from_pretrained(model_id)

-patch_linearlayers(model, patch_add_quant_config,
-                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))
-
-model.eval();
-cleanup()
-
 #Use optimized inference kernels
 ###################################################
-
-#prepare_for_inference(model) #default backend
-prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
+prepare_for_inference(model, backend=backend, allow_merge=False) #It takes a while...

 #Generate
 ###################################################