Update README.md
README.md CHANGED
@@ -58,17 +58,22 @@ from hqq.core.quantize import *
 from hqq.utils.patching import *
 from hqq.utils.generation_hf import HFGenerator
 
-
+#Settings
+###################################################
+backend = "bitblas" #bitblas or gemlite for 2-bit runtime
+compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+device = 'cuda:0'
+cache_dir = '.'
 
 #Load the model
 ###################################################
 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
-model = AutoHQQHFModel.from_quantized(model_id, cache_dir=
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device, adapter='adapter_v0.1.lora').eval();
+tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
 
 #Use optimized inference kernels
 ###################################################
-prepare_for_inference(model, backend=backend
+prepare_for_inference(model, backend=backend) #It takes a while...
 
 #Generate
 ###################################################
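The hunk ends at the #Generate header, so the generation step itself is not visible in this diff. As context, here is a minimal sketch of how the HFGenerator imported above is typically wired to the quantized model and tokenizer from the snippet; the constructor arguments (max_new_tokens, do_sample, compile), the .warmup() call, and the prompt are assumptions rather than part of this commit.

gen = HFGenerator(
    model,                # quantized model patched by prepare_for_inference above
    tokenizer,
    max_new_tokens=1000,  # assumed generation budget
    do_sample=True,       # assumed sampling setting
    compile="partial"     # assumed: partially compile the decoding step for speed
).warmup()                # assumed warm-up pass before the first generation

out = gen.generate("Write an essay about the Great Barrier Reef.", print_tokens=True)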