This is Mixtral-8x7B-Instruct-v0.1 quantized with HQQ, with all layers in 4-bit (group-size=64).
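
For reference, a similar quantization can be produced on the fly with transformers' HqqConfig. The snippet below is only a minimal sketch assuming 4-bit / group-size 64 across all layers; the exact per-layer settings of this repo may differ and the remaining HQQ options are left at their defaults:

import torch
from transformers import AutoModelForCausalLM, HqqConfig

#4-bit, group-size 64 -- assumed to mirror this model's settings
quant_config = HqqConfig(nbits=4, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)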

Usage

First, install the dependencies:

pip install git+https://github.com/mobiusml/hqq.git;
pip install git+https://github.com/mobiusml/gemlite.git; #to use the gemlite backend
pip install bitblas #to use the bitblas backend

Then you can use the sample code below:

Transformers 🤗

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import patch_model_for_compiled_runtime

#Settings
###################################################
backend       = "gemlite" #"torchao_int4" (4-bit only) or "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'
model_id      = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, cache_dir=cache_dir, device_map=device, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
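
#Optional: rough check of the VRAM held by the quantized weights (an added sanity check, not part of the original snippet)
print(f"Allocated VRAM: {torch.cuda.memory_allocated(device) / 1e9:.2f} GB")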

#Use optimized inference kernels
########################################################################
prepare_for_inference(model, backend=backend, verbose=True) 

#Load gemlite cache for faster warm-up
if(backend == 'gemlite'):
    import gemlite
    gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json') #same path as the cache_config() call below

#Generate
########################################################################
from hqq.utils.generation_hf import HFGenerator
#Mixtral doesn't support cuda graphs with HF unfortunately...
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None)

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial", 
                                    compile_options={"mode": "max-autotune-no-cudagraphs"} 
                                    )#.enable_cuda_graph()

gen.generate("Write an essay about large language models", print_tokens=True)

########################################################################
# #Inference with model.generate()
# from hqq.utils.generation_hf import patch_model_for_compiled_runtime

# patch_model_for_compiled_runtime(model, tokenizer, pre_compile=False) 

# prompt  = "Write an essay about large language models."
# inputs  = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
# outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_implementation="static", pad_token_id=tokenizer.pad_token_id) 
# print(tokenizer.decode(outputs[0]))

########################################################################
#Save gemlite cache
if(backend == 'gemlite'):
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json') 

vLLM

Run with vLLM. The snippet below first patches vLLM's mixtral_quant model definition so that each expert MLP is built from RowParallelLinear layers (which accept a quant_config), then loads and runs the model as usual:

##################################################################
import torch
import torch.nn as nn
from typing import Optional
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
class MixtralMLPRowParallel(nn.Module):

    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.ffn_dim = intermediate_size
        self.hidden_dim = hidden_size

        self.w1 = RowParallelLinear(self.hidden_dim,
                                   self.ffn_dim,
                                   bias=False,
                                   quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                   self.hidden_dim,
                                   bias=False,
                                   quant_config=quant_config)
        self.w3 = RowParallelLinear(self.hidden_dim,
                                   self.ffn_dim,
                                   bias=False,
                                   quant_config=quant_config)

        # TODO: Use vllm's SiluAndMul
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        w1_out, _ = self.w1(hidden_states)
        w1_out = self.act_fn(w1_out)
        w3_out, _ = self.w3(hidden_states)
        current_hidden_states = w1_out * w3_out
        current_hidden_states, _ = self.w2(current_hidden_states)
        return current_hidden_states

import vllm.model_executor.models.mixtral_quant as mixtral_quant
mixtral_quant.MixtralMLP = MixtralMLPRowParallel
##################################################################

from vllm import LLM
from vllm.sampling_params import SamplingParams
model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

llm = LLM(model=model_id, gpu_memory_utilization=0.80)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
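
The prompt above is passed as raw text. For chat-style use, you can format it with the tokenizer's chat template first so the instruct model sees the expected [INST] ... [/INST] markup. This is a minimal sketch (an addition, not from the original card) that reuses the llm, model_id and sampling_params objects defined above:

from transformers import AutoTokenizer

tokenizer   = AutoTokenizer.from_pretrained(model_id)
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)
outputs = llm.generate([chat_prompt], sampling_params)
print(outputs[0].outputs[0].text)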