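"""Gradio chat demo for mrcuddle/Dark-Hermes3-Llama3.2-3B.

Loads the model with transformers, keeps a short rolling chat history,
and serves a minimal gr.Blocks interface. Written for Hugging Face
Spaces (see the @spaces.GPU decorator below) but also runs locally.
"""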
import spaces  # import first so ZeroGPU can patch CUDA before torch touches it
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the model and tokenizer
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"  # Prefer the GPU when available
dtype = torch.bfloat16 if device == "cuda" else torch.float32  # bfloat16 halves memory on GPU; keep full float32 on CPU
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device) # Ensure model is on the correct device
model.eval() # Ensure the model is in evaluation mode
# Define the system prompt
system_prompt = (
    "You are Dark-Hermes, a helpful and intelligent chatbot. "
    "You always provide concise, accurate, and polite responses to user questions. "
    "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'"
)
# Limit chat history length
MAX_HISTORY_LENGTH = 5 # Keep only the last 5 turns to prevent excessive context size
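# On Hugging Face ZeroGPU Spaces, @spaces.GPU attaches a GPU for the duration
# of each decorated call; elsewhere the decorator is a no-op.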
@spaces.GPU
def chatbot(message, history):
    # Keep only the most recent turns to bound the prompt size
    history = history[-MAX_HISTORY_LENGTH:]
    # Build the conversation prompt: system prompt, prior turns, new message
    conversation = system_prompt + "\n"
    conversation += "".join(f"User: {msg}\nBot: {resp}\n" for msg, resp in history)
    conversation += f"User: {message}\nBot:"
    # Tokenize and move inputs to the model's device; input_ids and
    # attention_mask are integer tensors, so neither is cast to the float dtype
    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=1024)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    # Generate the response; max_new_tokens caps the reply length on its own,
    # whereas max_length would also count the prompt tokens (256 is an
    # arbitrary but reasonable cap)
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode the full sequence and keep only the text after the last "Bot:"
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Bot:")[-1].strip()
    # Record the new turn, then return the updated chat display, the updated
    # state, and an empty string to clear the textbox
    history.append((message, response))
    return history, history, ""
# Create the Gradio interface
with gr.Blocks() as demo:
gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")
chatbot_component = gr.Chatbot([], elem_id="chatbot")
state = gr.State([])
with gr.Row():
txt = gr.Textbox(
show_label=False,
placeholder="Type your message here...",
submit_btn=True
)
txt.submit(chatbot, [txt, state], [chatbot_component, state])
# Launch the interface
if __name__ == "__main__":
    demo.launch()