Update README.md
README.md (changed)
@@ -45,7 +45,7 @@ While this is significantly better than the best 2-bit Llama3-8B model reported
 ## Usage
 First, install the dependencies:
 ```
-pip install hqq
+pip install git+https://github.com/mobiusml/hqq
 pip install bitblas
 ```

@@ -58,23 +58,17 @@ from hqq.core.quantize import *
 from hqq.utils.patching import *
 from hqq.utils.generation_hf import HFGenerator

+backend = 'bitblas' #"bitblas" #None
+
 #Load the model
 ###################################################
 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
-model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
+model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora').eval();
 tokenizer = AutoTokenizer.from_pretrained(model_id)

-patch_linearlayers(model, patch_add_quant_config,
-                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))
-
-model.eval();
-cleanup()
-
 #Use optimized inference kernels
 ###################################################
-
-#prepare_for_inference(model) #default backend
-prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
+prepare_for_inference(model, backend=backend, allow_merge=False) #It takes a while...

 #Generate
 ###################################################