appoose committed on
Commit 9ed79d7
1 Parent(s): 48d4273

adding streaming to the example provided

Files changed (1)
  1. README.md +33 -5
README.md CHANGED
@@ -33,6 +33,9 @@ The difference between this model and https://huggingface.co/mobiuslabsgmbh/Mixt
  ### Basic Usage
  To run the model, install the HQQ library from https://github.com/mobiusml/hqq and use it as follows:
  ``` Python
+ import transformers
+ from threading import Thread
+
  model_id = 'mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-2bit-metaoffload-HQQ'
  #Load the model
  from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
@@ -46,11 +49,35 @@ model = HQQModelForCausalLM.from_quantized(model_id)
  from hqq.core.quantize import *
  HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP)
 
- #Text Generation
- prompt = "<s> [INST] How do I build a car? [/INST] "
- inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
- outputs = model.generate(**(inputs.to('cuda')), max_new_tokens=1000)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ def chat_processor(chat, max_new_tokens=100, do_sample=True):
+     tokenizer.use_default_system_prompt = False
+     streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+     generate_params = dict(
+         tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda'),
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=do_sample,
+         top_p=0.90,
+         top_k=50,
+         temperature=0.6,
+         num_beams=1,
+         repetition_penalty=1.2,
+     )
+
+     t = Thread(target=model.generate, kwargs=generate_params)
+     t.start()
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         print(text, end="", flush=True)
+
+     return outputs
+
+ ################################################################################################
+ #Generation
+ outputs = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)
  ```
 
 
@@ -60,6 +87,7 @@ You can reproduce the model using the following quant configs:
 
  ``` Python
  from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
+
  model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  model = HQQModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
 
 
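A note on the streaming pattern this commit introduces: `transformers.TextIteratorStreamer` is a blocking iterator that `model.generate` fills with decoded text as tokens are produced (it raises an exception if nothing arrives within `timeout` seconds), which is why `generate` is dispatched to a worker `Thread` while the main thread consumes the streamer and prints each chunk. A minimal caller-side sketch follows; joining the returned chunks into a single string is an assumed usage, not part of the commit:

``` Python
# chat_processor streams the reply to stdout and returns the chunks it saw;
# joining them reconstructs the full response (hypothetical caller-side usage).
chunks = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)
response = "".join(chunks)
```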