mobicham committed · Commit e4b888a · verified · 1 Parent(s): 195ffb9

Update README.md

Files changed (1): README.md +9 -4
README.md CHANGED
@@ -58,17 +58,22 @@ from hqq.core.quantize import *
 from hqq.utils.patching import *
 from hqq.utils.generation_hf import HFGenerator
 
-backend = 'bitblas' #"bitblas" #None
+#Settings
+###################################################
+backend = "bitblas" #bitblas or gemlite for 2-bit runtime
+compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+device = 'cuda:0'
+cache_dir = '.'
 
 #Load the model
 ###################################################
 model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
-model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora').eval();
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoHQQHFModel.from_quantized(model_id, cache_dir=cache_dir, compute_dtype=compute_dtype, device=device, adapter='adapter_v0.1.lora').eval();
+tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
 
 #Use optimized inference kernels
 ###################################################
-prepare_for_inference(model, backend=backend) #It takes a while...
+prepare_for_inference(model, backend=backend) #It takes a while...
 
 #Generate
 ###################################################
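
The hunk cuts off at the #Generate header, so the generation step itself is outside this diff. For context, a minimal sketch of how the HFGenerator imported above is typically driven in HQQ model cards; the prompt and the max_new_tokens/do_sample/compile/warmup settings here are illustrative assumptions, not content of this commit:

#Generate (sketch, not part of this commit)
###################################################
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)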