---
license: apache-2.0
train: false
inference: false
pipeline_tag: text-generation
---

This is an <a href="https://github.com/mobiusml/hqq/">HQQ</a> all 4-bit (group-size=64) quantized <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1">Mixtral-8x7B-Instruct-v0.1</a> model.
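
If you would rather quantize the original model on the fly than download this pre-quantized checkpoint, the sketch below shows roughly how the same settings (4-bit, group-size=64) map to transformers' `HqqConfig`. It illustrates the settings only, not necessarily the exact pipeline used to produce this repository.

```Python
#Hedged sketch: on-the-fly HQQ quantization with the same nbits/group-size as this checkpoint.
#Requires recent transformers and hqq installs; the exact export settings of this repo may differ.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

model_id     = "mistralai/Mixtral-8x7B-Instruct-v0.1"
quant_config = HqqConfig(nbits=4, group_size=64)

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda", quantization_config=quant_config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
```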

## Usage

First, install the dependencies:

```
pip install git+https://github.com/mobiusml/hqq.git
pip install git+https://github.com/mobiusml/gemlite.git  # to use the gemlite backend
pip install bitblas                                      # to use the bitblas backend
```
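
Note that gemlite and bitblas are optional: install them only if you plan to use the corresponding backend.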

Then you can use the sample code below:

## Transformers 🤗

```Python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from hqq.utils.patching import *
from hqq.core.quantize import *
from hqq.utils.generation_hf import patch_model_for_compiled_runtime

#Settings
###################################################
backend       = "gemlite" #"torchao_int4" (4-bit only), "bitblas" (4-bit + 2-bit) or "gemlite" (8-bit, 4-bit, 2-bit, 1-bit)
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
device        = 'cuda:0'
cache_dir     = '.'
model_id      = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

model     = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=compute_dtype, cache_dir=cache_dir, device_map=device, attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir)

#Use optimized inference kernels
########################################################################
prepare_for_inference(model, backend=backend, verbose=True)

#Load the gemlite cache for a faster warm-up
if backend == 'gemlite':
    import gemlite
    gemlite.core.GemLiteLinear.load_config('gemlite_config.json')

#Generate
########################################################################
from hqq.utils.generation_hf import HFGenerator

#Mixtral doesn't support CUDA graphs with HF unfortunately...
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None)

gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial",
                  compile_options={"mode": "max-autotune-no-cudagraphs"}
                  ) #.enable_cuda_graph()

gen.generate("Write an essay about large language models", print_tokens=True)

########################################################################
# #Alternative: inference with model.generate()
# patch_model_for_compiled_runtime(model, tokenizer, pre_compile=False)
#
# prompt  = "Write an essay about large language models."
# inputs  = tokenizer.apply_chat_template([{"role":"user", "content":prompt}], tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
# outputs = model.generate(**inputs.to(model.device), max_new_tokens=1000, cache_implementation="static", pad_token_id=tokenizer.pad_token_id)
# print(tokenizer.decode(outputs[0]))

########################################################################
#Save the gemlite cache (same path as load_config above, so the next run can reuse it)
if backend == 'gemlite':
    gemlite.core.GemLiteLinear.cache_config('gemlite_config.json')
```
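
On the first run, the gemlite backend auto-tunes its kernels, which makes the warm-up slow; saving the resulting configuration (last lines of the snippet above) and loading it via `load_config` at startup lets subsequent runs skip most of that warm-up.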

## vLLM

Run with <a href="https://github.com/vllm-project/vllm/">vLLM</a>:

```Python
##################################################################
import torch
import torch.nn as nn
from typing import Optional

from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.model_executor.layers.quantization.base_config import QuantizationConfig


class MixtralMLPRowParallel(nn.Module):

    def __init__(
        self,
        num_experts: int,
        hidden_size: int,
        intermediate_size: int,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.num_experts = num_experts
        self.ffn_dim     = intermediate_size
        self.hidden_dim  = hidden_size

        self.w1 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w2 = RowParallelLinear(self.ffn_dim,
                                    self.hidden_dim,
                                    bias=False,
                                    quant_config=quant_config)
        self.w3 = RowParallelLinear(self.hidden_dim,
                                    self.ffn_dim,
                                    bias=False,
                                    quant_config=quant_config)

        # TODO: Use vllm's SiluAndMul
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        w1_out, _ = self.w1(hidden_states)
        w1_out    = self.act_fn(w1_out)
        w3_out, _ = self.w3(hidden_states)
        current_hidden_states    = w1_out * w3_out
        current_hidden_states, _ = self.w2(current_hidden_states)
        return current_hidden_states

#Patch vllm's quantized Mixtral MLP to use the RowParallelLinear version above
import vllm.model_executor.models.mixtral_quant as mixtral_quant
mixtral_quant.MixtralMLP = MixtralMLPRowParallel
##################################################################

from vllm import LLM
from vllm.sampling_params import SamplingParams

model_id = "mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1_4bitgs64_hqq_hf"

llm = LLM(model=model_id, gpu_memory_utilization=0.80)

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=1024)
outputs = llm.generate(["What is the capital of Germany?"], sampling_params)
print(outputs[0].outputs[0].text)
```
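
Mixtral-8x7B-Instruct expects the `[INST]` chat format, so instruction-formatted prompts generally give better results than the raw string above. A minimal sketch (not part of the original example) that builds the prompt with the tokenizer's chat template:

```Python
#Hedged sketch: format the request with the model's chat template before passing it to vLLM.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt    = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of Germany?"}],
    tokenize=False,
    add_generation_prompt=True,
)

outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```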