from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr
import os

# Hugging Face checkpoint shared by the tokenizer and the model.
MODEL_ID = 'stabilityai/stablelm-zephyr-3b'

# Pick the preferred torch device: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer first, then the model weights; device_map="auto"
# delegates weight placement to the library.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
)

class ChatBot:
    """Single-turn chat wrapper around the module-level ``tokenizer`` and ``model``."""

    def __init__(self):
        # Placeholder for conversation turns. predict() neither reads nor
        # writes it, so every call is answered independently.
        self.history = []

    def predict(self, user_input, system_prompt="You are an expert analyst and provide assessment:"):
        """Generate a reply to ``user_input``.

        Args:
            user_input: The user's message.
            system_prompt: Instruction text appended to the user message
                (after a newline, followed by a colon) inside a single
                ``user`` chat turn.

        Returns:
            The decoded text of the newly generated tokens only; the prompt
            and special tokens are not included.
        """
        prompt = [{'role': 'user', 'content': user_input + "\n" + system_prompt + ":"}]
        inputs = tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            return_tensors='pt'
        )

        # Generate a response using the model. The encoded prompt must be
        # passed explicitly and must live on the same device as the model.
        # Decoding is greedy (do_sample=False), so no temperature is given:
        # it would be ignored and only trigger a warning.
        tokens = model.generate(
            inputs.to(model.device),
            max_new_tokens=250,
            do_sample=False
        )

        # Decode only what the model produced: generate() returns the prompt
        # tokens followed by the continuation, so skip the prompt prefix.
        prompt_length = inputs.shape[-1]
        response_text = tokenizer.decode(
            tokens[0][prompt_length:],
            skip_special_tokens=True
        )

        # Free up memory
        del inputs, tokens
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return response_text

# Single shared bot instance backing the Gradio interface.
bot = ChatBot()

title = "👋🏻Welcome to 🌟Tonic's🗽Stable🌟LM 3B🚀Chat"

# Rendered as Markdown under the title; the link targets the model card.
description = """
You can use this Space to test out the current model [stabilityai/stablelm-zephyr-3b](https://huggingface.co/stabilityai/stablelm-zephyr-3b).
"""

# Each example row supplies one value per input: (user_input, system_prompt).
examples = [["What is the proper treatment for buccal herpes?", "Please provide information on the most effective antiviral medications and home remedies for treating buccal herpes."]]

# Two text inputs map positionally onto predict(user_input, system_prompt).
iface = gr.Interface(
    fn=bot.predict,
    title=title,
    description=description,
    examples=examples,
    inputs=["text", "text"],
    outputs="text",
    theme="ParityError/Anime"
)

iface.launch()