"""Gradio chat UI for the Dark-Hermes3-Llama3.2-3B causal language model."""

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer once at import time.
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 weights on GPU, full precision on CPU.
dtype = torch.bfloat16 if device == "cuda" else torch.float32

# Truncate from the left so that an over-long prompt loses its oldest text
# rather than the newest user message and the trailing "Bot:" cue.
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side="left")
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device)
model.eval()

# Instructions prepended to every prompt.
system_prompt = (
    "You are Dark-Hermes, a helpful and intelligent chatbot. "
    "You always provide concise, accurate, and polite responses to user questions. "
    "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'"
)

# Keep only the last 5 turns to prevent excessive context size.
MAX_HISTORY_LENGTH = 5
# Upper bound on prompt tokens fed to the model.
MAX_PROMPT_TOKENS = 1024
# Upper bound on tokens generated per reply (independent of prompt length).
MAX_NEW_TOKENS = 512


def _render_history(history):
    """Render the most recent turns of *history* as ``User:/Bot:`` text.

    Accepts either format Gradio may pass to a chat callback:
    a list of ``(user_message, bot_reply)`` pairs, or a list of
    ``{"role": ..., "content": ...}`` message dicts.
    """
    if not history:
        return ""

    if isinstance(history[0], dict):
        # One dict per message, so a turn is two entries.
        recent = history[-2 * MAX_HISTORY_LENGTH:]
        return "".join(
            f"{'User' if entry.get('role') == 'user' else 'Bot'}: "
            f"{entry.get('content')}\n"
            for entry in recent
        )

    recent = history[-MAX_HISTORY_LENGTH:]
    return "".join(
        f"User: {user_msg}\nBot: {bot_msg}\n" for user_msg, bot_msg in recent
    )


@spaces.GPU  # For ZeroGPU compatibility
def chatbot(message, history):
    """Generate the bot's reply to *message* given the prior *history*.

    Args:
        message: The latest user message.
        history: Prior conversation as supplied by ``gr.ChatInterface``
            (pairs or message dicts; see ``_render_history``).

    Returns:
        The reply text. ``gr.ChatInterface`` appends it to the conversation
        itself, so the history is neither mutated nor returned here.
    """
    conversation = (
        f"{system_prompt}\n{_render_history(history)}User: {message}\nBot:"
    )

    # Only move the tensors to the device: token ids and the attention mask
    # are integer tensors and must not be cast to the model's float dtype.
    inputs = tokenizer(
        conversation,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    ).to(device)

    # max_new_tokens (not max_length) so a long prompt still leaves room
    # for a reply.
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; the output sequence starts
    # with a copy of the prompt.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # The plain-text prompt format lets the model run on into an invented
    # next user turn; keep only the bot's own reply.
    return response.split("\nUser:", 1)[0].strip()


# Create the Gradio interface
iface = gr.ChatInterface(
    fn=chatbot,
    title="Dark-Hermes3-Llama3.2-3B Chatbot",
    description="A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!",
    examples=["Hello!", "How are you?", "Tell me a joke.", "What is AI?"],
)

# Launch the interface
if __name__ == "__main__":
    iface.launch()