This is a version of the Mixtral-8x7B-Instruct-v0.1 model with all linear layers quantized to 4-bit (group-size=64) using HQQ (Half-Quadratic Quantization).
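For reference, a comparable 4-bit (group-size=64) HQQ quantization of the base model can be produced on the fly through the transformers integration. The sketch below is illustrative only; the exact settings used to create this checkpoint may differ.
#Illustrative only: quantize the base model with HQQ (4-bit, group-size 64) at load time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig
base_model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quant_config  = HqqConfig(nbits=4, group_size=64)
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)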
Usage
First, install the dependencies:
pip install git+https://github.com/mobiusml/hqq.git;
pip install git+https://github.com/mobiusml/gemlite.git; #to use the gemlite backend
pip install bitblas #to use the bitblas backend
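If you are not sure which optional backends ended up installed in your environment, a quick import check like the one below (illustrative, not part of the original instructions) can help:
#Illustrative check: list which optional inference backends are importable
import importlib.util
for pkg in ("hqq", "gemlite", "bitblas"):
    status = "available" if importlib.util.find_spec(pkg) is not None else "not installed"
    print(f"{pkg}: {status}")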
Then you can use the sample code below:
Transformers 🤗
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import patch_model_for_compiled_runtime
#Settings
###################################################
backend = "gemlite" #"torchao_int4" (4-bit only) or "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
device = 'cuda:0'
cache_dir = '.'
model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, cache_dir=cache_dir, device_map=device, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
#Use optimized inference kernels
########################################################################
prepare_for_inference(model, backend=backend, verbose=True)
#Load gemlite cache for faster warm-up
if(backend == 'gemlite'):
    import gemlite
    gemlite.core.GemLiteLinear.load_config('gemlite_config.json')
#Generate
########################################################################
from hqq.utils.generation_hf import HFGenerator
#Mixtral doesn't support cuda graphs with HF unfortunately...
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None)
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial",
                  compile_options={"mode": "max-autotune-no-cudagraphs"}
                  ) #.enable_cuda_graph()
gen.generate("Write an essay about large language models", print_tokens=True)
########################################################################
# #Inference with model.generate()
# from hqq.utils.generation_hf import patch_model_for_compiled_runtime
# patch_model_for_compiled_runtime(model, tokenizer, pre_compile=False)
# prompt = "Write an essay about large language models."
# inputs = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
# outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_implementation="static", pad_token_id=tokenizer.pad_token_id)
# #print(tokenizer.decode(outputs[0]))
########################################################################
#Save gemlite cache
if(backend == 'gemlite'):
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json')
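To sanity-check the memory savings from 4-bit quantization, you can inspect the loaded model's weight footprint (a rough estimate; actual VRAM usage also includes the KV cache and activations):
#Rough check of the quantized weight footprint (excludes KV cache and activations)
print(f"Model weight footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Allocated CUDA memory : {torch.cuda.memory_allocated() / 1e9:.2f} GB")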
vLLM
Run with vLLM:
##################################################################
import torch
import torch.nn as nn
from typing import Optional
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
class MixtralMLPRowParallel(nn.Module):

    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.ffn_dim = intermediate_size
        self.hidden_dim = hidden_size

        self.w1 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                    self.hidden_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w3 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)

        # TODO: Use vllm's SiluAndMul
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        w1_out, _ = self.w1(hidden_states)
        w1_out = self.act_fn(w1_out)
        w3_out, _ = self.w3(hidden_states)
        current_hidden_states = w1_out * w3_out
        current_hidden_states, _ = self.w2(current_hidden_states)
        return current_hidden_states
import vllm.model_executor.models.mixtral_quant as mixtral_quant
mixtral_quant.MixtralMLP = MixtralMLPRowParallel
##################################################################
from vllm import LLM
from vllm.sampling_params import SamplingParams
model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"
llm = LLM(model=model_id, gpu_memory_utilization=0.80)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
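The call above passes a raw string; for instruction-following behavior you usually want the Mixtral chat template applied first. A minimal sketch using the tokenizer's chat template with the same llm and sampling_params:
#Apply the Mixtral-Instruct chat template before generating (illustrative sketch)
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)
outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)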