Spaces:

mrcuddle
/

Dark-Hermes3.2

Sleeping

App Files Files Community

Dark-Hermes3.2 / app.py

mrcuddle

Update app.py

8dd8689 verified 24 days ago

raw

history blame contribute delete

2.73 kB

	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch
	import spaces

	# Checkpoint served by this app.
	model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"

	# Use the GPU with bfloat16 when CUDA is available; otherwise fall back to float32 on the CPU.
	if torch.cuda.is_available():
	    device, dtype = "cuda", torch.bfloat16
	else:
	    device, dtype = "cpu", torch.float32

	# Load the tokenizer and the weights once, at import time.
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
	model = model.to(device)  # place the weights on the selected device
	model.eval()  # switch the model to evaluation mode

	# Persona text placed at the very start of every prompt sent to the model.
	system_prompt = "".join(
	    [
	        "You are Dark-Hermes, a helpful and intelligent chatbot. ",
	        "You always provide concise, accurate, and polite responses to user questions. ",
	        "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'",
	    ]
	)

	# Number of most recent chat turns kept, so the context does not grow without bound.
	MAX_HISTORY_LENGTH = 5

	@spaces.GPU
	def chatbot(message, history):
	    """Generate the bot's reply to ``message`` and return the updated history.

	    Args:
	        message: Text the user just submitted.
	        history: Earlier ``(user_message, bot_reply)`` pairs for this session.
	            Any falsy value (``None``, ``[]``, ``""``) is treated as an empty
	            conversation.

	    Returns:
	        A 2-tuple ``(history, history)``: the updated list of pairs, once for
	        the chat display and once for the session state, matching the two
	        outputs this handler is wired to. Returning a non-list as the second
	        value would overwrite the stored history and make the next call fail
	        on ``history.append``.
	    """
	    max_prompt_tokens = 1024  # upper bound on tokens fed to the model
	    max_new_tokens = 512  # upper bound on tokens generated per reply

	    # Work on a copy so the caller's list is never mutated in place.
	    history = list(history) if history else []

	    # Nothing to answer: leave the conversation untouched.
	    if not message or not message.strip():
	        return history, history

	    # Only the most recent turns go into the prompt; the complete history is
	    # still returned so older messages stay visible in the chat window.
	    recent_turns = history[-MAX_HISTORY_LENGTH:]
	    conversation = system_prompt + "\n"
	    conversation += "".join(f"User: {msg}\nBot: {resp}\n" for msg, resp in recent_turns)
	    conversation += f"User: {message}\nBot:"

	    # Tokenize without truncation, then keep the tail of the sequence. The
	    # tokenizer's default truncation drops the END of the text, which would
	    # remove the newest message and the trailing "Bot:" cue.
	    inputs = tokenizer(conversation, return_tensors="pt")
	    input_ids = inputs["input_ids"][:, -max_prompt_tokens:].to(device)
	    # The mask keeps the integer dtype produced by the tokenizer.
	    attention_mask = inputs["attention_mask"][:, -max_prompt_tokens:].to(device)

	    # max_new_tokens bounds only the reply; max_length would also count the
	    # prompt and could leave no room to generate anything.
	    outputs = model.generate(
	        input_ids=input_ids,
	        attention_mask=attention_mask,
	        max_new_tokens=max_new_tokens,
	        num_return_sequences=1,
	        pad_token_id=tokenizer.eos_token_id,
	    )

	    # Decode only the newly generated tokens so that "Bot:" appearing inside
	    # the prompt or the reply cannot confuse the extraction, and stop at the
	    # first follow-up "User:" turn the model may have invented.
	    new_tokens = outputs[0][input_ids.shape[-1]:]
	    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
	    response = response.split("\nUser:")[0].strip()

	    history.append((message, response))
	    return history, history

	# Build the Gradio interface: a title, the chat window, per-session history
	# state, and a text box whose submit event runs the chatbot handler.
	with gr.Blocks() as demo:
	    gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
	    gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")

	    chatbot_component = gr.Chatbot([], elem_id="chatbot")
	    state = gr.State([])  # conversation history for the current session

	    with gr.Row():
	        txt = gr.Textbox(
	            show_label=False,
	            placeholder="Type your message here...",
	            submit_btn=True,
	        )

	    # Run the handler on submit, then clear the input box in a separate step
	    # so the clearing does not depend on what the handler returns.
	    txt.submit(chatbot, [txt, state], [chatbot_component, state]).then(
	        lambda: "", None, txt
	    )

	# Launch the interface only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()