import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Use the GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
classifier = pipeline("zero-shot-classification", model=nlp_model, device=device)

sampling_rate = pipe.feature_extractor.sampling_rate
chunk_length_s = 10  # how often the transcribed text is returned
stream_chunk_s = 1   # how often the microphone is checked for new audio

candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]


def listen_print_loop(responses):
    """Print partial transcriptions and return the text once the chunk is final."""
    for response in responses:
        if response["text"]:
            print(response["text"], end="\r")
        if not response["partial"][0]:
            print("")
            return response["text"]


def run_voice_control():
    """Continuously transcribe the microphone and classify each utterance."""
    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )
    while True:
        context = listen_print_loop(pipe(mic))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        top_label = output["labels"][0]
        top_score = output["scores"][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")


def transcribe(audio_path):
    """Transcribe a clip recorded through the Gradio microphone widget."""
    return pipe(audio_path)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),  # Gradio 4+ API
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)

if __name__ == "__main__":
    # Either run the terminal voice-control loop or launch the Gradio demo.
    # run_voice_control()
    iface.launch()
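
# --- Hypothetical extension (not in the original script) ---
# A minimal sketch of acting on the classifier's top prediction instead of only
# printing it. The set_light() helper, the brightness values, and the 0.5
# confidence threshold are illustrative assumptions, not part of any library API;
# in practice this would be called from run_voice_control() after each prediction.
def set_light(brightness: float) -> None:
    """Placeholder light controller; swap in a call to real smart-light hardware."""
    print(f"[light] brightness -> {brightness:.0%}")


def handle_command(label: str, score: float, threshold: float = 0.5) -> None:
    """Dispatch the predicted label to a light action when the score is high enough."""
    if score < threshold or label == "nothing about light":
        return  # ignore low-confidence or irrelevant utterances
    brightness = {
        "dim the light": 0.3,
        "turn on light fully": 1.0,
        "turn off light fully": 0.0,
        "raise the light": 0.8,
    }[label]
    set_light(brightness)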