appoose committed on
Commit 9ed79d7
1 Parent(s): 48d4273

adding streaming to the example provided

Files changed (1)
  1. README.md +33 -5
README.md CHANGED
@@ -33,6 +33,9 @@ The difference between this model and https://huggingface.co/mobiuslabsgmbh/Mixt
  ### Basic Usage
  To run the model, install the HQQ library from https://github.com/mobiusml/hqq and use it as follows:
  ``` Python
+ import transformers
+ from threading import Thread
+
  model_id = 'mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-2bit-metaoffload-HQQ'
  #Load the model
  from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
@@ -46,11 +49,35 @@ model = HQQModelForCausalLM.from_quantized(model_id)
  from hqq.core.quantize import *
  HQQLinear.set_backend(HQQBackend.ATEN_BACKPROP)
 
- #Text Generation
- prompt = "<s> [INST] How do I build a car? [/INST] "
- inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
- outputs = model.generate(**(inputs.to('cuda')), max_new_tokens=1000)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+ def chat_processor(chat, max_new_tokens=100, do_sample=True):
+     tokenizer.use_default_system_prompt = False
+     streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+     generate_params = dict(
+         tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda'),
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=do_sample,
+         top_p=0.90,
+         top_k=50,
+         temperature=0.6,
+         num_beams=1,
+         repetition_penalty=1.2,
+     )
+
+     t = Thread(target=model.generate, kwargs=generate_params)
+     t.start()
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         print(text, end="", flush=True)
+
+     return outputs
+
+ ################################################################################################
+ #Generation
+ outputs = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)
  ```
 
 
@@ -60,6 +87,7 @@ You can reproduce the model using the following quant configs:
 
  ``` Python
  from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
+
  model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
  model = HQQModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth, cache_dir=cache_path)
 
 
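A note on the streaming pattern this commit introduces: `transformers.TextIteratorStreamer` is a blocking iterator that `model.generate` fills with decoded text as tokens are produced (it raises an exception if nothing arrives within `timeout` seconds), which is why `generate` is dispatched to a worker `Thread` while the main thread consumes the streamer and prints each chunk. A minimal caller-side sketch follows; joining the returned chunks into a single string is an assumed usage, not part of the commit:

``` Python
# chat_processor streams the reply to stdout and returns the chunks it saw;
# joining them reconstructs the full response (hypothetical caller-side usage).
chunks = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)
response = "".join(chunks)
```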