"""Chat backend for the Dark-Hermes3-Llama3.2-3B Gradio demo."""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"  # Detect GPU or default to CPU
dtype = torch.bfloat16 if device == "cuda" else torch.float32  # bfloat16 weights on GPU, float32 on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Over-long prompts are truncated from the left so that the newest user
# message and the trailing "Bot:" cue are never the part that is cut off.
tokenizer.truncation_side = "left"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device)
model.eval()  # Inference only: disable dropout and other training-time behaviour

# Define the system prompt
system_prompt = (
    "You are Dark-Hermes, a helpful and intelligent chatbot. "
    "You always provide concise, accurate, and polite responses to user questions. "
    "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'"
)

# Number of most recent (user, bot) turns that are fed back to the model.
MAX_HISTORY_LENGTH = 5
# Token budget for the prompt (system prompt + recent turns + new message).
MAX_PROMPT_TOKENS = 1024
# Token budget for the generated reply, independent of the prompt length.
MAX_NEW_TOKENS = 512


def _build_prompt(message, turns):
    """Render the system prompt, prior turns and the new message as plain text."""
    past = "".join(f"User: {msg}\nBot: {resp}\n" for msg, resp in turns)
    return f"{system_prompt}\n{past}User: {message}\nBot:"


def _extract_reply(generated_text):
    """Return the bot's reply, dropping any follow-up turn the model invented."""
    # The prompt format is plain text, so nothing stops the model from
    # continuing with a made-up "User:" line; keep only what precedes it.
    return generated_text.split("\nUser:", 1)[0].strip()


@spaces.GPU
def chatbot(message, history):
    """Generate a reply to ``message`` and append the exchange to the history.

    Args:
        message: The text the user just submitted.
        history: Previous exchanges as a sequence of ``(user, bot)`` pairs.
            Any other value (for example ``None`` or an empty string) is
            treated as an empty history instead of raising.

    Returns:
        A ``(history, "")`` tuple: the history as a new list including the
        latest exchange, and an empty string intended for clearing the input
        textbox.
    """
    # Work on a copy so the caller's list is never mutated, and tolerate a
    # history value that is not a list of pairs.
    turns = list(history) if isinstance(history, (list, tuple)) else []

    # Nothing to answer: skip the (expensive) generation entirely.
    if not message or not message.strip():
        return turns, ""

    # Only the most recent turns go into the prompt to bound the context
    # size; the returned history itself is kept intact.
    prompt = _build_prompt(message, turns[-MAX_HISTORY_LENGTH:])

    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_PROMPT_TOKENS,
    )
    input_ids = encoded["input_ids"].to(device)
    # The attention mask stays in the integer dtype produced by the tokenizer;
    # only the device changes.
    attention_mask = encoded["attention_mask"].to(device)

    # max_new_tokens (unlike max_length) does not count the prompt, so a long
    # conversation still leaves room for an answer.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output sequence starts with the prompt tokens; decode only what was
    # generated after them so text in the prompt (e.g. a user typing "Bot:")
    # cannot be mistaken for the reply.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    response = _extract_reply(tokenizer.decode(new_tokens, skip_special_tokens=True))

    # Update chat history
    turns.append((message, response))
    return turns, ""
# Create the Gradio interface
def _submit_message(message, history):
    """Run ``chatbot`` and fan its result out to the three UI outputs.

    ``chatbot`` returns ``(updated_history, "")``. The updated history must
    reach both the visible chat window and the stored state, while the empty
    string belongs in the textbox so it is cleared after each submission.
    """
    updated_history, cleared_text = chatbot(message, history)
    return updated_history, updated_history, cleared_text


with gr.Blocks() as demo:
    gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
    gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")

    chatbot_component = gr.Chatbot([], elem_id="chatbot")
    state = gr.State([])  # Conversation so far, as (user, bot) pairs

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            submit_btn=True,
        )

    # Outputs are listed in the same order as the adapter's return values:
    # chat window, stored history, textbox.
    txt.submit(_submit_message, [txt, state], [chatbot_component, state, txt])

# Launch the interface
if __name__ == "__main__":
    demo.launch()