import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Checkpoint served by this app.
model_name = "mrcuddle/Dark-Hermes3-Llama3.2-3B"

# Pick the execution device and the matching weight precision together:
# bfloat16 when CUDA is available, float32 on CPU.
if torch.cuda.is_available():
    device = "cuda"
    dtype = torch.bfloat16
else:
    device = "cpu"
    dtype = torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights in the chosen precision, then place them on the device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
model = model.to(device)
model.eval()  # inference only

# Instruction text placed at the top of every prompt sent to the model.
system_prompt = "".join(
    [
        "You are Dark-Hermes, a helpful and intelligent chatbot. ",
        "You always provide concise, accurate, and polite responses to user questions. ",
        "If you don't know an answer, say 'I'm not sure about that, but I can try to help further!'",
    ]
)

# Number of most recent (user, bot) turns kept when building a prompt,
# so the context does not grow without bound.
MAX_HISTORY_LENGTH: int = 5

# Token budgets. The prompt cap and the generation cap are separate so that a
# long conversation can never use up the room needed for the reply.
_MAX_PROMPT_TOKENS = 1024
_MAX_NEW_TOKENS = 512


def _build_prompt(message, turns):
    """Render the system prompt, prior turns and the new message as plain text."""
    transcript = "".join(f"User: {msg}\nBot: {resp}\n" for msg, resp in turns)
    return f"{system_prompt}\n{transcript}User: {message}\nBot:"


@spaces.GPU
def chatbot(message, history):
    """Generate the bot's reply to ``message`` and append the turn to the history.

    Args:
        message: The text the user just submitted.
        history: Sequence of ``(user_message, bot_response)`` pairs from
            earlier turns. ``None`` or any other empty value is treated as
            an empty history.

    Returns:
        A ``(history, "")`` tuple: the history limited to the last
        ``MAX_HISTORY_LENGTH`` earlier turns plus the new turn, and an empty
        string. A blank ``message`` returns the trimmed history unchanged.
    """
    # Copy into a fresh list so the caller's object is never mutated and a
    # falsy non-list value (None, "") cannot break the append below.
    history = list(history or [])[-MAX_HISTORY_LENGTH:]

    # Nothing to answer: skip generation entirely.
    if not message or not message.strip():
        return history, ""

    # Fit the prompt into the budget by dropping the OLDEST turns first.
    # Plain right-side truncation would cut off the end of the prompt, which
    # is where the newest user message and the "Bot:" cue live.
    context = list(history)
    while True:
        inputs = tokenizer(_build_prompt(message, context), return_tensors="pt")
        if inputs["input_ids"].shape[-1] <= _MAX_PROMPT_TOKENS or not context:
            break
        context = context[1:]

    # Last resort when the system prompt plus a single message is still too
    # long: keep the tail of the prompt. The mask stays in the tokenizer's
    # integer dtype; it is not a floating-point model input.
    input_ids = inputs["input_ids"][:, -_MAX_PROMPT_TOKENS:].to(device)
    attention_mask = inputs["attention_mask"][:, -_MAX_PROMPT_TOKENS:].to(device)

    # max_new_tokens bounds only the continuation; max_length would also count
    # the prompt and could leave no room to generate.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=_MAX_NEW_TOKENS,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; decode only what
    # was generated after them so text inside the prompt cannot be mistaken
    # for the reply.
    generated = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)

    # The plain-text transcript format has no end-of-turn token, so the model
    # may keep writing a made-up next turn; keep only the first reply.
    response = response.split("\nUser:")[0].strip()

    history.append((message, response))
    return history, ""

def _handle_submit(message, history):
    """Run one chat turn and route its results to the three UI outputs.

    ``chatbot`` returns ``(updated_history, "")``. The history has to reach
    both the visible chat window and the session state, and the empty string
    belongs in the textbox (to clear it), so the pair is fanned out into
    three values here.
    """
    updated_history, cleared_text = chatbot(message, history)
    return updated_history, updated_history, cleared_text


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Dark-Hermes3-Llama3.2-3B Chatbot")
    gr.Markdown("A chatbot interface powered by the Dark-Hermes3-Llama3.2-3B model. Ask me anything!")

    chatbot_component = gr.Chatbot([], elem_id="chatbot")
    # Per-session list of (user_message, bot_response) pairs.
    state = gr.State([])

    with gr.Row():
        txt = gr.Textbox(
            show_label=False,
            placeholder="Type your message here...",
            submit_btn=True
        )

    # Return values are matched to outputs by position: history -> chat
    # window, history -> state, "" -> textbox. Sending the "" to `state`
    # instead would replace the stored history with a string and break the
    # next turn.
    txt.submit(_handle_submit, [txt, state], [chatbot_component, state, txt])

# Launch the interface
if __name__ == "__main__":
    demo.launch()