This is Mixtral-8x7B-Instruct-v0.1 quantized with HQQ, with all layers in 4-bit (group-size=64).
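
For reference, a similar quantization can be produced on the fly with transformers' HqqConfig. The snippet below is only a minimal sketch assuming 4-bit / group-size 64 across all layers; the exact per-layer settings of this repo may differ and the remaining HQQ options are left at their defaults:

import torch
from transformers import AutoModelForCausalLM, HqqConfig

#4-bit, group-size 64 -- assumed to mirror this model's settings
quant_config = HqqConfig(nbits=4, group_size=64)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    torch_dtype=torch.float16,
    device_map="cuda",
    quantization_config=quant_config,
)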

Usage

First, install the dependencies:

pip install git+https://github.com/mobiusml/hqq.git;
pip install git+https://github.com/mobiusml/gemlite.git; #to use the gemlite backend
pip install bitblas #to use the bitblas backend

Then you can use the sample code below:

Transformers 🤗

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import patch_model_for_compiled_runtime

#Settings
###################################################
backend       = "gemlite" #"torchao_int4" (4-bit only) or "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'
model_id      = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, cache_dir=cache_dir, device_map=device, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)
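
#Optional: rough check of the VRAM held by the quantized weights (an added sanity check, not part of the original snippet)
print(f"Allocated VRAM: {torch.cuda.memory_allocated(device) / 1e9:.2f} GB")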

#Use optimized inference kernels
########################################################################
prepare_for_inference(model, backend=backend, verbose=True) 

#Load gemlite cache for faster warm-up
if(backend == 'gemlite'):
    import gemlite
    gemlite.core.GemLiteLinear.load_config('/tmp/gemlite_config.json') #same path as the cache_config() call below

#Generate
########################################################################
from hqq.utils.generation_hf import HFGenerator
#Mixtral doesn't support cuda graphs with HF unfortunately...
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None)

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial", 
                                    compile_options={"mode": "max-autotune-no-cudagraphs"} 
                                    )#.enable_cuda_graph()

gen.generate("Write an essay about large language models", print_tokens=True)

########################################################################
# #Inference with model.generate()
# from hqq.utils.generation_hf import patch_model_for_compiled_runtime

# patch_model_for_compiled_runtime(model, tokenizer, pre_compile=False) 

# prompt  = "Write an essay about large language models."
# inputs  = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
# outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_implementation="static", pad_token_id=tokenizer.pad_token_id) 
# print(tokenizer.decode(outputs[0]))

########################################################################
#Save gemlite cache
if(backend == 'gemlite'):
    gemlite.core.GemLiteLinear.cache_config('/tmp/gemlite_config.json') 

vLLM

Run with vLLM. The snippet below first patches vLLM's mixtral_quant model definition so that each expert MLP is built from RowParallelLinear layers (which accept a quant_config), then loads and runs the model as usual:

##################################################################
import torch
import torch.nn as nn
from typing import Optional
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
class MixtralMLPRowParallel(nn.Module):

    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.ffn_dim = intermediate_size
        self.hidden_dim = hidden_size

        self.w1 = RowParallelLinear(self.hidden_dim,
                                   self.ffn_dim,
                                   bias=False,
                                   quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                   self.hidden_dim,
                                   bias=False,
                                   quant_config=quant_config)
        self.w3 = RowParallelLinear(self.hidden_dim,
                                   self.ffn_dim,
                                   bias=False,
                                   quant_config=quant_config)

        # TODO: Use vllm's SiluAndMul
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        w1_out, _ = self.w1(hidden_states)
        w1_out = self.act_fn(w1_out)
        w3_out, _ = self.w3(hidden_states)
        current_hidden_states = w1_out * w3_out
        current_hidden_states, _ = self.w2(current_hidden_states)
        return current_hidden_states

import vllm.model_executor.models.mixtral_quant as mixtral_quant
mixtral_quant.MixtralMLP = MixtralMLPRowParallel
##################################################################

from vllm import LLM
from vllm.sampling_params import SamplingParams
model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

llm = LLM(model=model_id, gpu_memory_utilization=0.80)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
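
The prompt above is passed as raw text. For chat-style use, you can format it with the tokenizer's chat template first so the instruct model sees the expected [INST] ... [/INST] markup. This is a minimal sketch (an addition, not from the original card) that reuses the llm, model_id and sampling_params objects defined above:

from transformers import AutoTokenizer

tokenizer   = AutoTokenizer.from_pretrained(model_id)
chat_prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)
outputs = llm.generate([chat_prompt], sampling_params)
print(outputs[0].outputs[0].text)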