import re

import gradio as gr
from unsloth import FastLanguageModel
qa_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

max_seq_length = 2048  # Any length works; Unsloth applies RoPE scaling internally.
dtype = None  # None for auto-detection; float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # 4-bit quantization reduces memory usage; set to False for full precision.
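
# Rough sizing note (back-of-the-envelope, not measured here): at 4 bits per
# weight, a 7B-parameter model needs roughly 7e9 * 0.5 bytes ~= 3.5 GB for its
# weights, versus ~14 GB in float16, which is what lets Mistral-7B fit
# comfortably on a 16 GB GPU such as a T4.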

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text
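
# Example of what preprocess_text produces (illustrative input, not from the
# original app): preprocess_text("What is  Over-fitting?") -> "what is overfitting".
# Note that generate_answer below does not currently call this helper; it is
# presumably available for normalizing questions before comparison or lookup.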

_model_cache = {}


def generate_answer(model_name, question, load_in_4bit=True):
    # Load each fine-tuned checkpoint once and reuse it across requests;
    # reloading a 7B model on every call would dominate the request latency.
    cache_key = (model_name, load_in_4bit)
    if cache_key not in _model_cache:
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,  # the checkpoint produced by fine-tuning
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            device_map={"": 0},  # place the whole model on GPU 0
        )
        FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode
        _model_cache[cache_key] = (model, tokenizer)
    model, tokenizer = _model_cache[cache_key]
    inputs = tokenizer(
        [
            qa_prompt.format(
                "Please provide the answer for the question",  # instruction
                question,  # input
                "",  # response - left blank so the model generates it
            )
        ],
        return_tensors="pt",
    ).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # The decoded string contains the full prompt; keep only the text that
    # follows the "### Response:" marker. skip_special_tokens already strips
    # the </s> end-of-sequence token.
    response = decoded[0].split("### Response:")[-1].strip()
    return response
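
# Smoke test (sketch): exercises the pipeline without the UI. Assumes a CUDA
# GPU; the question string is an illustrative placeholder.
# print(generate_answer(
#     "GSridhar1982/AIML_QA_Mistral7B_FineTuned_Unsloth",
#     "What is overfitting in machine learning?",
# ))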

iface = gr.Interface(
    fn=generate_answer,
    inputs=[
        gr.Dropdown(
            choices=["GSridhar1982/AIML_QA_Mistral7B_FineTuned_Unsloth"],
            label="Select Model",
        ),
        gr.Textbox(lines=5, label="Question"),
    ],
    outputs=gr.Textbox(label="Answer"),
)
iface.launch(debug=True)
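
# Note: gr.Interface.launch also accepts share=True, which (when running
# outside a hosted Space) creates a temporary public link:
# iface.launch(debug=True, share=True)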