import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio

# Requires: gradio, transformers (plus torch for the ASR pipeline), huggingface_hub, edge-tts

# Initialize the inference client with your Hugging Face token
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# Initialize the ASR pipeline
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")


def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]


def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False


def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Streams a response from the LLM and returns early once a mood is classified."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    output = ""
    for response in stream:
        output += response.token.text
        mood, is_classified = classify_mood(output)
        if is_classified:
            playlist_message = f"Playing {mood.capitalize()} playlist for you!"
            return playlist_message
    return output


def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
    You are a smart mood analyser who determines the user's mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you find it difficult to classify the input into one of these four moods, keep the conversation going until you can classify the user's mood.
    Return a single-word reply from one of the options once you have classified the mood. For example, if you classify a sentence as happy, just respond with "happy".
    Note: Do not write anything other than the classified mood, once classified.
    Note: If a question or any user text cannot be classified, follow up with a question to learn the user's mood until you can classify it.
    Note: The mood should be classified only into one of these 4 classes {Happy, Sad, Instrumental, Party}; if it is none of these 4, continue with follow-up questions until you classify the mood.
    Note: If the user says something like "I need a coffee", do not classify the mood directly; ask more follow-up questions as shown in the examples.
    [Examples omitted for brevity]
    """
    prompt = f"{fixed_prompt}"
    for user_prompt, bot_response in history:
        prompt += f"\nUser: {user_prompt}\nLLM Response: {bot_response}"
    prompt += f"\nUser: {message}\nLLM Response:"
    return prompt


async def text_to_speech(text):
    """Synthesizes the text with edge-tts and returns the path to the audio file."""
    communicate = edge_tts.Communicate(text)
    # edge-tts writes MP3 audio by default, so use an .mp3 suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


# The handlers are named differently from the UI components below so that the
# component variables (chatbot, text_input) do not shadow the callback functions.
def voice_chat(audio, history):
    """Transcribes a voice message, generates a reply, and updates the history."""
    if audio is None:
        return None, history
    text = speech_to_text(audio)
    response = generate(text, history)
    history.append((text, response))
    # Return None to clear the microphone input after each turn
    return None, history


def text_chat(text, history):
    """Generates a reply to a typed message and updates the history."""
    response = generate(text, history)
    history.append((text, response))
    return "", history


async def generate_audio(history):
    """Converts the latest bot response to speech."""
    if len(history) > 0:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Voice Chat")
    chatbot = gr.Chatbot()
    # Gradio 3.x API; Gradio 4.x uses sources=["microphone"] instead of source=
    audio_input = gr.Audio(source="microphone", type="filepath")
    text_input = gr.Textbox(placeholder="Type your message here...")
    audio_output = gr.Audio(label="AI Response")

    audio_input.change(voice_chat, inputs=[audio_input, chatbot], outputs=[audio_input, chatbot])
    text_input.submit(text_chat, inputs=[text_input, chatbot], outputs=[text_input, chatbot])
    chatbot.change(generate_audio, inputs=[chatbot], outputs=[audio_output])

if __name__ == "__main__":
    demo.launch()