---
license: apache-2.0
train: false
inference: false
pipeline_tag: text-generation
---

This is an <a href="https://github.com/mobiusml/hqq/">HQQ</a> all 4-bit (group-size=64) quantized <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x7B-Instruct-v0.1</a> model.
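
If you would rather quantize the original model on the fly than download this pre-quantized checkpoint, the sketch below shows roughly how the same settings (4-bit, group-size=64) map to transformers' `HqqConfig`. It illustrates the settings only, not necessarily the exact pipeline used to produce this repository.

```Python
#Hedged sketch: on-the-fly HQQ quantization with the same nbits/group-size as this checkpoint.
#Requires recent transformers and hqq installs; the exact export settings of this repo may differ.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

model_id     = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quant_config = HqqConfig(nbits=4, group_size=64)

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda", quantization_config=quant_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```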

## Usage

First, install the dependencies:

```
pip install git+https://github.com/mobiusml/hqq.git
pip install git+https://github.com/mobiusml/gemlite.git  # to use the gemlite backend
pip install bitblas                                      # to use the bitblas backend
```
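
Note that gemlite and bitblas are optional: install them only if you plan to use the corresponding backend.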

Then you can use the sample code below:

## Transformers 🤗

```Python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import patch_model_for_compiled_runtime

#Settings
###################################################
backend       = "gemlite" #"torchao_int4" (4-bit only), "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'
model_id      = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, cache_dir=cache_dir, device_map=device, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Use optimized inference kernels
########################################################################
prepare_for_inference(model, backend=backend, verbose=True)

#Load the gemlite cache for a faster warm-up
if backend == 'gemlite':
    import gemlite
    gemlite.core.GemLiteLinear.load_config('gemlite_config.json')

#Generate
########################################################################
from hqq.utils.generation_hf import HFGenerator

#Mixtral doesn't support CUDA graphs with HF unfortunately...
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None)

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial",
                  compile_options={"mode": "max-autotune-no-cudagraphs"}
                  ) #.enable_cuda_graph()

gen.generate("Write an essay about large language models", print_tokens=True)

########################################################################
# #Alternative: inference with model.generate()
# patch_model_for_compiled_runtime(model, tokenizer, pre_compile=False)
#
# prompt  = "Write an essay about large language models."
# inputs  = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
# outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_implementation="static", pad_token_id=tokenizer.pad_token_id)
# print(tokenizer.decode(outputs[0]))

########################################################################
#Save the gemlite cache (same path as load_config above, so the next run can reuse it)
if backend == 'gemlite':
    gemlite.core.GemLiteLinear.cache_config('gemlite_config.json')
```
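
On the first run, the gemlite backend auto-tunes its kernels, which makes the warm-up slow; saving the resulting configuration (last lines of the snippet above) and loading it via `load_config` at startup lets subsequent runs skip most of that warm-up.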

## vLLM

Run with <a href="https://github.com/vllm-project/vllm/">vLLM</a>:

```Python
##################################################################
import torch
import torch.nn as nn
from typing import Optional

from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig


class MixtralMLPRowParallel(nn.Module):

    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.ffn_dim     = intermediate_size
        self.hidden_dim  = hidden_size

        self.w1 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                    self.hidden_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w3 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)

        # TODO: Use vllm's SiluAndMul
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        w1_out, _ = self.w1(hidden_states)
        w1_out    = self.act_fn(w1_out)
        w3_out, _ = self.w3(hidden_states)
        current_hidden_states    = w1_out * w3_out
        current_hidden_states, _ = self.w2(current_hidden_states)
        return current_hidden_states

#Patch vllm's quantized Mixtral MLP to use the RowParallelLinear version above
import vllm.model_executor.models.mixtral_quant as mixtral_quant
mixtral_quant.MixtralMLP = MixtralMLPRowParallel
##################################################################

from vllm import LLM
from vllm.sampling_params import SamplingParams

model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

llm = LLM(model=model_id, gpu_memory_utilization=0.80)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
```
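
Mixtral-8x7B-Instruct expects the `[INST]` chat format, so instruction-formatted prompts generally give better results than the raw string above. A minimal sketch (not part of the original example) that builds the prompt with the tokenizer's chat template:

```Python
#Hedged sketch: format the request with the model's chat template before passing it to vLLM.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt    = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```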