from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import gradio as gr

base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the base model in 4-bit NF4 quantization so it fits on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id, add_bos_token=True, trust_remote_code=True
)

# Attach the fine-tuned LoRA adapter on top of the quantized base model
ft_model = PeftModel.from_pretrained(
    base_model,
    "minhcrafters/Mistral-7B-Instruct-v0.2-mental-health-finetuned",
)


def respond(query):
    # Wrap the user's message in the same prompt format used during fine-tuning
    eval_prompt = "Patient's Query:\n\n {} ###\n\n".format(query)
    model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

    output = ft_model.generate(
        input_ids=model_input["input_ids"],
        attention_mask=model_input["attention_mask"],
        max_new_tokens=125,
        repetition_penalty=1.15,
    )

    # Decode and strip the prompt so only the model's reply is returned
    result = tokenizer.decode(output[0], skip_special_tokens=True).replace(
        eval_prompt, ""
    )
    return result


def chat_response(message, history):
    return respond(message)


demo = gr.ChatInterface(chat_response)
demo.launch()