mobicham committed · commit 195ffb9 (verified) · parent: ddd3622

Update README.md

Files changed (1): README.md (+5, -11)
README.md CHANGED
@@ -45,7 +45,7 @@ While this is significantly better than the best 2-bit Llama3-8B model reported
  ## Usage
  First, install the dependencies:
  ```
- pip install hqq==0.1.8
+ pip install git+https://github.com/mobiusml/hqq
  pip install bitblas
  ```
 
@@ -58,23 +58,17 @@ from hqq.core.quantize import *
  from hqq.utils.patching import *
  from hqq.utils.generation_hf import HFGenerator
 
+ backend = 'bitblas' #"bitblas" #None
+
  #Load the model
  ###################################################
  model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
- model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
+ model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora').eval();
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
- patch_linearlayers(model, patch_add_quant_config,
-                    BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))
-
- model.eval();
- cleanup()
-
  #Use optimized inference kernels
  ###################################################
- HQQLinear.set_backend(HQQBackend.PYTORCH)
- #prepare_for_inference(model) #default backend
- prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
+ prepare_for_inference(model, backend=backend, allow_merge=False) #It takes a while...
 
  #Generate
  ###################################################
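
After this commit, the manual setup from the old README (the explicit `patch_linearlayers` quant-config patch, `HQQLinear.set_backend`, and `cleanup()`) is gone: `from_quantized(...)` plus a single `prepare_for_inference(...)` call is enough. The hunk stops at the `#Generate` header, so as a rough end-to-end sketch the snippet below fills in that step with hqq's `HFGenerator` helper; the `AutoHQQHFModel` import path, the `HFGenerator` arguments (`max_new_tokens`, `do_sample`, `compile`, `print_tokens`), the `.warmup()` call, and the prompt are assumptions, not part of the commit.

```python
# End-to-end sketch of the updated usage (assumes a CUDA GPU and the git install of hqq above).
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel   # assumed import path; not shown in the diffed hunk
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

backend  = 'bitblas'   # or None to keep the default backend
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'

# Load the 2-bit (group-size 64) quantized weights plus the low-rank adapter, in eval mode
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16,
                                          adapter='adapter_v0.1.lora').eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Swap in the optimized inference kernels (compiling the BitBLAS kernels can take a while)
prepare_for_inference(model, backend=backend, allow_merge=False)

# Generate (argument names here are assumed from hqq's HFGenerator helper, not from this commit)
gen = HFGenerator(model, tokenizer, max_new_tokens=512, do_sample=True, compile="partial").warmup()
gen.generate("Write a short poem about model quantization.", print_tokens=True)
```

If BitBLAS is not available, the comment in the old README (`#prepare_for_inference(model) #default backend`) suggests calling `prepare_for_inference(model)` without a backend to stay on the default kernels.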