from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with double quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    # load_in_4bit=True,  # superseded by quantization_config below
    quantization_config=bnb_config,
)


def generate_text(messages):
    # Tokenize the conversation with the model's chat template and move it
    # to the same device as the model
    encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    # The same prompt rendered as plain text, with the BOS/EOS markers
    # (<s>, </s>) stripped, so the prompt echo can be sliced off the
    # decoded output below
    no_token_encodeds = (
        tokenizer.apply_chat_template(messages, tokenize=False)
        .replace("<s>", "")
        .replace("</s>", "")
    )

    output = model.generate(
        encodeds,
        max_length=200,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Remove the prompt echo from the generated text
    return output_text[len(no_token_encodeds) + 2:]
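
# A minimal usage sketch: the messages list below is an assumed example,
# following the standard transformers chat-template convention of
# role/content dictionaries expected by apply_chat_template.
messages = [
    {"role": "user", "content": "Explain 4-bit quantization in one sentence."},
]
print(generate_text(messages))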