import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Use the GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
classifier = pipeline("zero-shot-classification", model=nlp_model, device=device)

sampling_rate = pipe.feature_extractor.sampling_rate
chunk_length_s = 10  # how often the transcribed text is returned
stream_chunk_s = 1   # how often the microphone is checked for new audio

candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]


def listen_print_loop(responses):
    """Print partial transcriptions and return the text once the chunk is final."""
    for response in responses:
        if response["text"]:
            print(response["text"], end="\r")
        if not response["partial"][0]:
            print("")
            return response["text"]


def run_voice_control():
    """Continuously transcribe the microphone and classify each utterance."""
    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )
    while True:
        context = listen_print_loop(pipe(mic))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        top_label = output["labels"][0]
        top_score = output["scores"][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")


def transcribe(audio_path):
    """Transcribe a clip recorded through the Gradio microphone widget."""
    return pipe(audio_path)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),  # Gradio 4+ API
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)

if __name__ == "__main__":
    # Either run the terminal voice-control loop or launch the Gradio demo.
    # run_voice_control()
    iface.launch()
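
# --- Hypothetical extension (not in the original script) ---
# A minimal sketch of acting on the classifier's top prediction instead of only
# printing it. The set_light() helper, the brightness values, and the 0.5
# confidence threshold are illustrative assumptions, not part of any library API;
# in practice this would be called from run_voice_control() after each prediction.
def set_light(brightness: float) -> None:
    """Placeholder light controller; swap in a call to real smart-light hardware."""
    print(f"[light] brightness -> {brightness:.0%}")


def handle_command(label: str, score: float, threshold: float = 0.5) -> None:
    """Dispatch the predicted label to a light action when the score is high enough."""
    if score < threshold or label == "nothing about light":
        return  # ignore low-confidence or irrelevant utterances
    brightness = {
        "dim the light": 0.3,
        "turn on light fully": 1.0,
        "turn off light fully": 0.0,
        "raise the light": 0.8,
    }[label]
    set_light(brightness)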