# Spaces: Sleeping
import gradio as gr | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import torch | |
import spaces | |
# Checkpoint that backs the chatbot.
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"

# Use the GPU with bfloat16 weights when torch can see one; otherwise run on
# the CPU in float32.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.bfloat16
else:
    device = "cpu"
    dtype = torch.float32

# Load the tokenizer and the weights, place the model on the chosen device,
# and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
model = model.to(device)
model.eval()

# System prompt placed at the start of every conversation.
system_prompt = " ".join(
    [
        "You are Dark-Hermes, a helpful and intelligent chatbot.",
        "You always provide concise, accurate, and polite responses to user questions.",
        "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'",
    ]
)

# Number of most recent (user, bot) turns kept, to bound the context size.
MAX_HISTORY_LENGTH = 5
# Upper bound on prompt tokens sent to the model; when the prompt is longer,
# the oldest tokens are dropped so the newest message is always kept.
MAX_PROMPT_TOKENS = 1024
# Upper bound on tokens generated for a single reply.
MAX_NEW_TOKENS = 512


def _build_prompt(message, history):
    """Render the system prompt, the prior turns, and the new message as text."""
    turns = "".join(f"User: {msg}\nBot: {resp}\n" for msg, resp in history)
    return f"{system_prompt}\n{turns}User: {message}\nBot:"


def _extract_reply(generated_text):
    """Return the bot's first reply, dropping any further turns the model wrote."""
    return generated_text.split("\nUser:", 1)[0].strip()


def chatbot(message, history):
    """Generate the bot's reply to ``message`` and append the turn to the history.

    Args:
        message: The text the user just submitted.
        history: List of ``(user_message, bot_response)`` pairs from earlier
            turns. Only the last ``MAX_HISTORY_LENGTH`` pairs are kept.

    Returns:
        A ``(history, history)`` tuple. This function is wired to the outputs
        ``[chatbot_component, state]``, so the first element refreshes the
        chat display and the second is stored as the next call's ``history``.
        Returning anything other than the list in the second slot would
        replace the stored history and break the following turn.
    """
    # Copy before trimming so the caller's list is never mutated.
    history = list(history or [])[-MAX_HISTORY_LENGTH:]
    prompt = _build_prompt(message, history)

    # Tokenize without truncation and keep the tail ourselves: cutting from the
    # right would discard the newest user message and the trailing "Bot:" cue.
    # NOTE: when the prompt overflows, the dropped leading tokens include
    # whatever the tokenizer placed at the very start of the sequence.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"][:, -MAX_PROMPT_TOKENS:].to(device)
    # The mask stays in the integer dtype the tokenizer produced; only the
    # device changes.
    attention_mask = encoded["attention_mask"][:, -MAX_PROMPT_TOKENS:].to(device)

    # max_new_tokens bounds the reply alone, so a long prompt can no longer
    # use up the whole generation budget.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt; decode only what follows
    # it, so a "Bot:" inside the conversation cannot confuse reply extraction.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    response = _extract_reply(tokenizer.decode(new_tokens, skip_special_tokens=True))

    history.append((message, response))
    return history, history
# Build the Gradio interface: a title, a chat display, a per-session history
# state, and a textbox with a submit button.
with gr.Blocks() as demo:
    gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
    gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")
    chatbot_component = gr.Chatbot([], elem_id="chatbot")
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            submit_btn=True,
        )
    # The handler's two return values go to the chat display and the history
    # state, in that order.
    submit_event = txt.submit(chatbot, [txt, state], [chatbot_component, state])
    # Clear the textbox in a separate chained step, and only after the handler
    # succeeded, so a failed request does not discard what the user typed.
    submit_event.success(lambda: "", None, txt)

# Launch the interface when the file is run as a script.
if __name__ == "__main__":
    demo.launch()