from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with nested (double) quantization to reduce VRAM use
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the dequantized matmuls in bf16
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",  # let accelerate place layers on the available GPU(s)/CPU
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,  # supersedes the older load_in_4bit=True flag
)
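
# Optional sanity check (an addition, not part of the original snippet):
# get_memory_footprint() is a standard transformers helper; with NF4 plus
# double quantization, the 7B weights should come in at roughly 4 GB.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")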
def generate_text(messages):
    # Render the chat template and move the token ids onto the model's device
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    output = model.generate(
        encodeds,
        max_new_tokens=200,  # cap new tokens rather than total length (prompt included)
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # Mistral has no pad token; reuse EOS
    )

    # Decode only the newly generated tokens so the prompt is not echoed back
    output_text = tokenizer.decode(output[0][encodeds.shape[-1]:], skip_special_tokens=True)
    return output_text.strip()
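
# Example usage (a minimal sketch, assuming a single-turn chat; the
# Mistral-Instruct template alternates "user"/"assistant" roles):
if __name__ == "__main__":
    messages = [
        {"role": "user", "content": "Explain NF4 quantization in one sentence."},
    ]
    print(generate_text(messages))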