import gradio as gr
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048  # Choose any! RoPE scaling is supported automatically.
dtype = None  # None for auto detection. Float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="sayeed99/meta-llama3-8b-xtherapy-bnb-4bit",  # the fine-tuned model
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# System prompt, prefixed in the Llama-3 chat format.
formatted_string = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
    "You are Anna, a helpful AI assistant for mental therapy assistance developed by "
    "a team of developers at xnetics. If you do not know the user's name, start by "
    "asking for it. If you do not know details about the user, ask for them."
)


def format_chat_data(message):
    """Format a single {"role", "content"} message as a Llama-3 chat turn."""
    role = "assistant" if message["role"] == "assistant" else "user"
    return f"<|eot_id|><|start_header_id|>{role}<|end_header_id|>" + message["content"]


def formatting_prompts_funcV2(messages):
    """Concatenate the system prompt and every conversation turn into one prompt string."""
    text = formatted_string
    for message in messages:
        text = text + format_chat_data(message)
    return text


def cleanup(text):
    """Strip a trailing <|eot_id|> token, if present."""
    if text.endswith("<|eot_id|>"):
        return text[: -len("<|eot_id|>")]
    return text


def get_last_assistant_message(text):
    """Return the last assistant turn from the decoded model output."""
    # Split on the assistant header so the final part is the newest assistant message.
    parts = text.split("<|start_header_id|>assistant<|end_header_id|>")
    last_message = parts[-1].strip()
    return cleanup(last_message)


def handle_conversation(messages):
    """Build the prompt from the conversation history and generate the next assistant reply."""
    history_prompt = formatting_prompts_funcV2(messages)
    # Open a fresh assistant turn so the model continues as the assistant.
    history_prompt = history_prompt + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    inputs = tokenizer([history_prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)
    decoded_outputs = tokenizer.batch_decode(outputs)[0]
    return get_last_assistant_message(decoded_outputs)


def complete(messages):
    return handle_conversation(messages)


def predict(message, history):
    # Convert Gradio's (user, assistant) history pairs into OpenAI-style messages.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = complete(history_openai_format)
    print(response)

    # Yield the reply one character at a time so the ChatInterface renders it as a stream.
    partial_message = ""
    for chunk in response:
        if chunk is not None:
            partial_message = partial_message + chunk
            yield partial_message


# For information on how to customize the ChatInterface, see the Gradio docs:
# https://www.gradio.app/docs/chatinterface
demo = gr.ChatInterface(predict)

if __name__ == "__main__":
    demo.launch()