import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio

# Requires: gradio, transformers (plus torch for the ASR pipeline), huggingface_hub, edge-tts

# Initialize the inference client with your Hugging Face token
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# Initialize the ASR pipeline
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")


def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]


def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False


def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Streams a response from the LLM and returns early once a mood is classified."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    output = ""
    for response in stream:
        output += response.token.text
        mood, is_classified = classify_mood(output)
        if is_classified:
            playlist_message = f"Playing {mood.capitalize()} playlist for you!"
            return playlist_message
    return output


def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
    You are a smart mood analyser who determines the user's mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you find it difficult to classify the input into one of these four moods, keep the conversation going until you can classify the user's mood.
    Return a single-word reply from one of the options once you have classified the mood. For example, if you classify a sentence as happy, just respond with "happy".
    Note: Do not write anything other than the classified mood, once classified.
    Note: If a question or any user text cannot be classified, follow up with a question to learn the user's mood until you can classify it.
    Note: The mood should be classified only into one of these 4 classes {Happy, Sad, Instrumental, Party}; if it is none of these 4, continue with follow-up questions until you classify the mood.
    Note: If the user says something like "I need a coffee", do not classify the mood directly; ask more follow-up questions as shown in the examples.
    [Examples omitted for brevity]
    """
    prompt = f"{fixed_prompt}"
    for user_prompt, bot_response in history:
        prompt += f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
    prompt += f"\nUser: {message}\nLLM Response:"
    return prompt


async def text_to_speech(text):
    """Synthesizes the text with edge-tts and returns the path to the audio file."""
    communicate = edge_tts.Communicate(text)
    # edge-tts writes MP3 audio by default, so use an .mp3 suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


# The handlers are named differently from the UI components below so that the
# component variables (chatbot, text_input) do not shadow the callback functions.
def voice_chat(audio, history):
    """Transcribes a voice message, generates a reply, and updates the history."""
    if audio is None:
        return None, history
    text = speech_to_text(audio)
    response = generate(text, history)
    history.append((text, response))
    # Return None to clear the microphone input after each turn
    return None, history


def text_chat(text, history):
    """Generates a reply to a typed message and updates the history."""
    response = generate(text, history)
    history.append((text, response))
    return "", history


async def generate_audio(history):
    """Converts the latest bot response to speech."""
    if len(history) > 0:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")
    chatbot = gr.Chatbot()
    # Gradio 3.x API; Gradio 4.x uses sources=["microphone"] instead of source=
    audio_input = gr.Audio(source="microphone", type="filepath")
    text_input = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    audio_input.change(voice_chat, inputs=[audio_input, chatbot], outputs=[audio_input, chatbot])
    text_input.submit(text_chat, inputs=[text_input, chatbot], outputs=[text_input, chatbot])
    chatbot.change(generate_audio, inputs=[chatbot], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()