from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import torch
import gradio as gr
asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Pick the GPU when available, otherwise fall back to CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
sampling_rate = pipe.feature_extractor.sampling_rate
chunk_length_s = 10  # how often a full transcription is returned
stream_chunk_s = 1 # how often the microphone is checked for new audio
mic = ffmpeg_microphone_live(
    sampling_rate=sampling_rate,
    chunk_length_s=chunk_length_s,
    stream_chunk_s=stream_chunk_s,
)
def listen_print_loop(responses):
    # Stream interim results in place; return the transcription once the
    # chunk is final (response["partial"][0] is False)
    for response in responses:
        if response["text"]:
            print(response["text"], end="\r")
        if not response["partial"][0]:
            break
    return response["text"]
classifier = pipeline("zero-shot-classification", model=nlp_model)
candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]
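# Illustrative only: classifier("please dim the lights", candidate_labels)
# returns a dict whose "labels" are sorted by descending "scores", so the
# top intent is output["labels"][0].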
def intent_loop():
    # Terminal demo: transcribe live speech, then classify the spoken intent.
    # Runs until interrupted; call intent_loop() instead of launching the
    # Gradio app below.
    while True:
        context = listen_print_loop(pipe(mic))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        top_label = output["labels"][0]
        top_score = output["scores"][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")
iface = gr.Interface(
    fn=transcribe,
    # gr.inputs.Audio(source=...) is the removed pre-4.x API; current Gradio
    # takes the input sources as a list on gr.Audio
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)
iface.launch()
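# To run the terminal demo instead of the web UI, replace iface.launch()
# with intent_loop().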