import gradio as gr
import numpy as np
from transformers import pipeline

# Hugging Face Hub id of the speech-recognition checkpoint used by the demo.
asr_model = "distil-whisper/distil-medium.en"

# Built once at import time so every transcription call reuses the same
# pipeline object.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=asr_model,
)

def transcribe(stream, new_chunk):
    """Append a microphone chunk to the running recording and transcribe it.

    Args:
        stream: Audio accumulated by earlier calls (a NumPy array of
            normalised float32 samples), or ``None`` on the first call.
        new_chunk: ``(sampling_rate, samples)`` pair for the newest chunk,
            where ``samples`` is a NumPy array.

    Returns:
        A ``(stream, text)`` tuple: the updated recording, and the ``"text"``
        entry of the result ``asr_pipe`` produces for the whole recording.
    """
    sr, y = new_chunk

    # Down-mix multi-channel audio so that every chunk is 1-D before it is
    # concatenated and handed to the pipeline.
    # NOTE(review): assumes 2-D input is laid out (samples, channels) -- confirm
    # against the Gradio Audio component in use.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Peak-normalise the chunk. An empty chunk has no peak (np.max would
    # raise) and a silent chunk has peak 0 (dividing would fill the buffer
    # with NaN), so both are left untouched.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])

    # The whole recording so far is re-transcribed on every call.
    return stream, asr_pipe({"sampling_rate": sr, "raw": stream})["text"]

# Streaming microphone UI. The "state" input/output pair carries the audio
# accumulated by ``transcribe`` from one call to the next, and ``live=True``
# asks Gradio to re-run the function as the inputs change.
#
# The legacy ``layout="horizontal"`` argument is no longer passed.
# NOTE(review): it is not a documented ``gr.Interface`` parameter in the
# Gradio releases that accept ``gr.Audio(sources=...)`` -- confirm against the
# installed version.
mic = gr.Interface(
    fn=transcribe,
    inputs=["state", gr.Audio(sources=["microphone"], streaming=True)],
    outputs=["state", "text"],
    # NOTE(review): "huggingface" is a legacy theme name -- confirm it still
    # resolves to a theme in the installed Gradio version.
    theme="huggingface",
    title="Whisper & BERT demo - Intent Classification",
    description=(
        "Transcribe audio inputs with Whisper ASR model and detect intention "
        "from the text. Use BERT NLP model to classify the intention "
        "as one of the commands to command a light."
    ),
    allow_flagging="never",
    live=True,
)

# ``demo`` is the object the entry point launches. It previously was an empty
# ``gr.Blocks()`` with no components, so the launched app showed nothing;
# bind it to the interface that actually holds the UI.
demo = mic

if __name__ == "__main__":
    # Launch the app only when this file is executed as a script, so that
    # importing the module does not start it.
    demo.launch()