|
import gradio as gr |
|
from transformers import pipeline |
|
import numpy as np |
|
import time |
|
|
|
|
|
# Speech-to-text model (tiny English-only Whisper) and zero-shot text
# classifier used to map transcripts onto lighting-control intents.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
classifier = pipeline("zero-shot-classification", model="MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli")

# Candidate intents for the zero-shot classifier; the last label is the
# "none of the above" bucket.
candidate_labels = ["dim the light", "turn on light fully", "turn off light fully", "raise the light", "not about lighting"]

# Rolling state shared with transcribe_and_classify via `global`.
# NOTE(review): last_update_time is declared global in the handler but is
# never read or written there — it looks like leftover throttling state;
# kept for backward compatibility. Confirm before removing.
last_update_time = time.time() - 5

last_transcription = ""
last_classification = ""
|
|
|
def transcribe_and_classify(stream, new_chunk):
    """Accumulate a microphone chunk, transcribe the stream, classify intent.

    Parameters
    ----------
    stream : np.ndarray | None
        Audio accumulated over previous calls (float32 mono), or None on
        the first invocation (Gradio's initial state).
    new_chunk : tuple[int, np.ndarray]
        ``(sampling_rate, samples)`` pair as delivered by ``gr.Audio``.

    Returns
    -------
    tuple[np.ndarray, str, str]
        The updated audio stream (fed back into state), the latest
        transcription, and the latest classification summary string.
    """
    global last_update_time, last_transcription, last_classification

    sr, y = new_chunk
    y = y.astype(np.float32)

    # Peak-normalize to [-1, 1]. Guard against an all-zero (silent) chunk:
    # the original unconditional division produced NaNs via 0/0, which then
    # poisoned the whole accumulated stream.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Grow the rolling buffer so Whisper always sees the full utterance.
    stream = y if stream is None else np.concatenate([stream, y])

    transcription = transcriber(
        {"sampling_rate": sr, "task": "transcribe", "language": "english", "raw": stream}
    )["text"]
    last_transcription = transcription

    # Only reclassify when there is actual text; on an empty transcript the
    # previous classification is deliberately kept (best-effort display).
    if transcription.strip():
        output = classifier(transcription, candidate_labels, multi_label=False)
        # Pipeline returns labels/scores sorted best-first.
        top_label = output['labels'][0]
        top_score = output['scores'][0]
        last_classification = f"{top_label.upper()}, score: {top_score:.2f}"

    return stream, last_transcription, last_classification
|
|
|
|
|
# UI wiring: the "state" input/output pair carries the accumulated audio
# buffer between successive microphone submissions.
# NOTE(review): the handler accumulates chunks as if streaming, but
# gr.Audio is created without streaming=True, so each submission is a
# discrete recording — confirm whether live streaming was intended.
demo = gr.Interface(
    fn=transcribe_and_classify,
    inputs=[
        "state",
        gr.Audio(sources=["microphone"]),
    ],
    outputs=[
        "state",
        "text",  # transcription
        "text",  # classification summary
    ],
)

# debug=True surfaces server-side tracebacks in the console while testing.
demo.launch(debug=True)