"""Gradio chat interface for the Nephra v1.0 GGUF model, served locally with llama-cpp-python."""

import os
import re

import gradio as gr
import huggingface_hub
import torch
from llama_cpp import Llama

# Constants
MODEL_REPO = "Marcus-Arcadius/nephra_v1.0-Q8_0-GGUF"
MODEL_FILENAME = "nephra_v1.0-q8_0.gguf"
MODEL_PATH = os.path.join(os.path.dirname(__file__), MODEL_FILENAME)


def clean_response(text):
    """Clean up a model response by removing artifacts and duplicates."""
    # Remove [INST], [/INST], and similar tags
    text = re.sub(r'\[/?INST\]', '', text)
    # Remove code blocks that contain 'greet()'
    text = re.sub(r'```python.*?greet\(\).*?```', '', text, flags=re.DOTALL)
    # Remove remaining code blocks
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove "A:" prefixes
    text = re.sub(r'A:', '', text)
    # Remove duplicate responses
    text = re.sub(r'(Hello!.*?about\?)\s*\1', r'\1', text, flags=re.DOTALL)
    # Remove <> and ~~ markers
    text = re.sub(r'[<>~]', '', text)
    # Clean up extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def format_prompt(message, system_prompt, context=""):
    """Format the prompt in the way the model expects, optionally including prior turns."""
    return f"<|system|>{system_prompt}{context}<|user|>{message}<|assistant|>"


def download_model():
    if not os.path.exists(MODEL_PATH):
        print(f"Downloading model from {MODEL_REPO}...")
        try:
            huggingface_hub.hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILENAME,
                local_dir=os.path.dirname(__file__),
                local_dir_use_symlinks=False
            )
            print("Model downloaded successfully!")
        except Exception as e:
            print(f"Error downloading model: {str(e)}")
            raise


def initialize_model():
    download_model()
    return Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        # Offload all layers to the GPU when CUDA is available; otherwise run on CPU
        n_gpu_layers=-1 if torch.cuda.is_available() else 0
    )


print("Starting model initialization...")
model = initialize_model()
print("Model initialized successfully!")


def generate_response(message, system_prompt, temperature, max_tokens, top_p,
                      presence_penalty, frequency_penalty, history):
    """Generate a response using the Nephra model."""
    # Format all previous conversation turns into context
    context = ""
    for human, assistant in history:
        context += f"<|user|>{human}<|assistant|>{assistant}"

    # Build the full prompt: system prompt, prior turns, then the current message
    full_prompt = format_prompt(message, system_prompt, context)

    # Generate response
    response = model.create_completion(
        prompt=full_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        # Stop when the model starts a new user turn; the EOS token ends generation automatically
        stop=["<|user|>"]
    )

    # Clean up the response
    assistant_message = clean_response(response["choices"][0]["text"])

    history.append((message, assistant_message))
    return "", history


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Nephra v1.0 Chatbot (Average Time: 35s)")

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=600)
            msg = gr.Textbox(label="Message", placeholder="Type your message here...")
            clear = gr.Button("Clear")

        with gr.Column(scale=1):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant.",
                lines=3
            )
            with gr.Accordion("Model Parameters", open=False):
                temperature = gr.Slider(0.0, 2.0, value=0.7, label="Temperature")
                max_tokens = gr.Slider(50, 4096, value=1024, step=1, label="Max Tokens")
                top_p = gr.Slider(0.0, 1.0, value=0.9, label="Top P")
                presence_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Presence Penalty")
                frequency_penalty = gr.Slider(-2.0, 2.0, value=0.0, label="Frequency Penalty")

    # Set up event handlers
    msg.submit(
        generate_response,
        [msg, system_prompt, temperature, max_tokens, top_p,
         presence_penalty, frequency_penalty, chatbot],
        [msg, chatbot]
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.launch()