# Real-time speech recognition with zero-shot intent classification demo
# (Whisper ASR on live microphone audio + DeBERTa NLI for light-control intents).
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import torch
import gradio as gr
asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# `device` was previously undefined here (NameError at startup).
# Use the first CUDA GPU when available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Streaming speech-to-text pipeline; the feature extractor dictates the
# sampling rate the microphone capture must match.
pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
sampling_rate = pipe.feature_extractor.sampling_rate

chunk_length_s = 10  # how often a chunk of transcribed text is returned
stream_chunk_s = 1   # how often the microphone is checked for new audio

# Generator yielding live microphone audio chunks for the ASR pipeline.
mic = ffmpeg_microphone_live(
    sampling_rate=sampling_rate,
    chunk_length_s=chunk_length_s,
    stream_chunk_s=stream_chunk_s,
)
def listen_print_loop(responses):
    """Consume streaming ASR responses and return the first non-empty transcript.

    Parameters
    ----------
    responses : iterable of dict
        Streaming pipeline outputs. Each dict carries a ``"text"`` key with the
        current transcript and a ``"partial"`` flag that is falsy once a chunk
        is finalized.

    Returns
    -------
    str
        The first non-empty transcript seen, or ``""`` if the stream ends
        without producing one (previously this path returned ``None``, which
        crashed the downstream classifier).
    """
    for response in responses:
        if response["text"]:
            # Overwrite the current console line with the live transcript.
            print(response["text"], end="\r")
            return response["text"]
        if not response["partial"]:
            # Chunk finalized with no text: move the cursor to a fresh line.
            print("")
    return ""
# Zero-shot classifier mapping a transcript onto light-control intents.
classifier = pipeline("zero-shot-classification", model=nlp_model)
candidate_labels = [
    "dim the light",
    "turn on light fully",
    "turn off light fully",
    "raise the light",
    "nothing about light",
]

# Transcribe-then-classify loop. Ctrl-C exits cleanly so the Gradio UI
# defined below can start; previously `while True` never terminated,
# leaving everything after it unreachable.
try:
    while True:
        context = listen_print_loop(pipe(mic))
        print(context)
        output = classifier(context, candidate_labels, multi_label=False)
        top_label = output['labels'][0]
        top_score = output['scores'][0]
        print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")
except KeyboardInterrupt:
    print("\nStopping live classification loop.")
def transcribe(audio_path):
    """Transcribe a recorded audio file with the Whisper pipeline.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path handed over by the Gradio audio component
        (``type="filepath"``); ``None`` when no recording was made.

    Returns
    -------
    str
        The transcribed text, or ``""`` for an empty recording.
    """
    # `transcribe` was previously undefined — gr.Interface(fn=transcribe)
    # raised a NameError before the app could even launch.
    if audio_path is None:
        return ""
    return pipe(audio_path)["text"]


iface = gr.Interface(
    fn=transcribe,
    # gr.inputs.Audio(source=...) was removed in Gradio 4.x; the modern
    # component takes a `sources` list instead.
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)
iface.launch()