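"""Gradio chat app for mrcuddle/Dark-Hermes3-Llama3.2-3B.

Builds a plain-text prompt from a fixed system prompt plus the recent chat
history, generates a reply with transformers, and serves it via gr.Blocks.
Written for a Hugging Face Space (see the spaces.GPU decorator below).
"""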
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"
device = "cuda" if torch.cuda.is_available() else "cpu"  # Detect GPU or default to CPU
dtype = torch.bfloat16 if device == "cuda" else torch.float32  # bfloat16 halves memory/bandwidth on GPU; CPU inference needs float32
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device)  # Ensure model is on the correct device
model.eval()  # Ensure the model is in evaluation mode

# Define the system prompt
system_prompt = (
    "You are Dark-Hermes, a helpful and intelligent chatbot. "
    "You always provide concise, accurate, and polite responses to user questions. "
    "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'"
)

# Limit chat history length
MAX_HISTORY_LENGTH = 5  # Keep only the last 5 turns to prevent excessive context size

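# spaces.GPU requests a GPU slot per call on Hugging Face ZeroGPU Spaces;
# outside Spaces the decorator is effectively a no-op.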
@spaces.GPU
def chatbot(message, history):
    # Trim to the most recent turns
    history = history[-MAX_HISTORY_LENGTH:]

    # Prepare the conversation prompt
    conversation = system_prompt + "\n"
    conversation += "".join([f"User: {msg}\nBot: {resp}\n" for msg, resp in history])
    conversation += f"User: {message}\nBot:"

    # Tokenize and move inputs to the correct device
    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=1024)
    input_ids = inputs["input_ids"].to(device)  # input_ids must stay a Long tensor
    attention_mask = inputs["attention_mask"].to(device)  # The attention mask must also stay integer-typed; do not cast it to the model dtype

    # Generate the response; bound the number of *new* tokens (max_length would
    # count the prompt too, and a near-limit prompt would leave no room to answer)
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=256,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the full sequence and keep only the text after the final "Bot:",
    # i.e. the newly generated reply
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Bot:")[-1].strip()

    # Append the new turn and return the history twice (once for the Chatbot
    # display, once for the State), plus an empty string to clear the textbox
    history.append((message, response))
    return history, history, ""

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
    gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")

    chatbot_component = gr.Chatbot([], elem_id="chatbot")
    state = gr.State([])

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            submit_btn=True
        )

    txt.submit(chatbot, [txt, state], [chatbot_component, state, txt])

# Launch the interface
if __name__ == "__main__":
    demo.launch()
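# Running locally is typically just `python app.py`; demo.launch() prints a
# local URL (Gradio defaults to http://127.0.0.1:7860).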