from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import torch
import gradio as gr
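
# Note: ffmpeg_microphone_live relies on the ffmpeg binary, so ffmpeg must be
# installed and on the PATH for the microphone streaming below to work.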

# Model checkpoints: a small English-only Whisper model for speech
# recognition and an NLI model for zero-shot intent classification.
asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Run the ASR model on the GPU when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
sampling_rate = pipe.feature_extractor.sampling_rate

chunk_length_s = 10  # seconds of audio that make up one complete command
stream_chunk_s = 1  # how often a new partial transcription is yielded


def create_mic():
    # Transcribing one command consumes the stream up to its final chunk,
    # so a fresh microphone generator is created for every command.
    return ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )


def listen_print_loop(responses):
    # Print partial transcriptions in place, then return the final text once
    # the pipeline marks the current chunk as complete.
    for response in responses:
        if response["text"]:
            print(response["text"], end="\r")
        if not response["partial"][0]:
            break
    print("")
    return response["text"]


classifier = pipeline("zero-shot-classification", model=nlp_model)
candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]
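
# With multi_label=False the classifier returns labels sorted by score, e.g.
# for a hypothetical transcript "please dim the lights" something like:
# {"sequence": "please dim the lights", "labels": ["dim the light", ...],
#  "scores": [0.91, ...]}
# so index 0 below is always the top prediction.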


def run_console_demo():
    # Continuously listen for a command, transcribe it, and classify it
    # against the candidate intents.
    while True:
        context = listen_print_loop(pipe(create_mic()))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        top_label = output["labels"][0]
        top_score = output["scores"][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")


def transcribe(audio_path):
    # Gradio passes the recording as a file path (type="filepath"); the ASR
    # pipeline accepts a path directly.
    if audio_path is None:
        return ""
    return pipe(audio_path)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)


if __name__ == "__main__":
    # run_console_demo() never returns, so run either the terminal loop or
    # the web demo, not both:
    # run_console_demo()
    iface.launch()