"""Gradio demo: streaming speech translation with an ESPnet ST model.

Microphone audio arrives in chunks, is accumulated per utterance, and is
decoded by a ``Speech2TextStreaming`` model.  An utterance is finalised once
``silence_duration`` seconds of consecutive silent chunks have been observed.
"""

import torch
import torchaudio
import numpy as np
from espnet2.bin.st_inference_streaming import Speech2TextStreaming
import gradio as gr
import soundfile as sf
import librosa

# Sample rate (Hz) that every chunk is resampled to before decoding.
TARGET_SAMPLE_RATE = 16000

# Load your custom model
model = Speech2TextStreaming(
    # path to your model weights
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",
    # path to your config file
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",
    device="cuda",  # change to "cpu" if no GPU is available
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,
)

# Mean absolute amplitude (audio scaled to [-1, 1]) below which a chunk counts
# as silence.  Adjust this threshold based on your audio levels.
silence_threshold = 0.01
# Duration of consecutive silence (in seconds) that ends an utterance.
silence_duration = 1.0


def is_silence(audio_chunk, sr, threshold=silence_threshold):
    """Return True when the mean absolute amplitude is below ``threshold``.

    Args:
        audio_chunk: Array of audio samples.
        sr: Sample rate of the chunk; unused, kept for call compatibility.
        threshold: Mean absolute amplitude below which the chunk is silent.

    Returns:
        Whether the chunk is considered silent.  An empty chunk is silent.
    """
    if np.size(audio_chunk) == 0:
        return True
    return bool(np.mean(np.abs(audio_chunk)) < threshold)


def _to_mono_float(samples):
    """Return ``samples`` as a 1-D float32 array scaled to roughly [-1, 1]."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM: divide by the dtype's full-scale value.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Down-mix (samples, channels) to mono so resampling and
        # concatenation operate on a single time axis.
        samples = samples.mean(axis=1)
    return samples


def _best_text(output):
    """Return the text of the first hypothesis, or "" when there is none."""
    return output[0][0] if output else ""


def transcribe(state, new_chunk):
    """Decode one streamed audio chunk and update the per-utterance state.

    Args:
        state: ``(stream, silence_time)`` where ``stream`` is the audio
            accumulated for the current utterance (or None) and
            ``silence_time`` is the number of seconds of trailing silence.
        new_chunk: ``(sample_rate, samples)`` from the audio component, or
            None when no audio is available.

    Returns:
        ``(new_state, text)``.  After a finalised utterance, or when
        ``new_chunk`` is None, the state is reset to ``(None, 0)``.
    """
    stream, silence_time = state
    # Tolerate a state whose silence counter was left as None.
    silence_time = silence_time or 0

    if new_chunk is None:
        # Reset with a numeric counter so later "+=" cannot hit None.
        return (None, 0), ""

    sr, y = new_chunk
    y = _to_mono_float(y)
    if y.size == 0:
        # Nothing to decode; keep the utterance state untouched.
        return (stream, silence_time), ""

    # Measure duration and silence on the chunk as captured: the duration
    # needs the original sample rate, and peak normalisation below would
    # hide the chunk's true level.
    chunk_seconds = len(y) / sr
    chunk_is_silent = is_silence(y, sr)

    if sr != TARGET_SAMPLE_RATE:
        y = librosa.resample(y=y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)

    peak = np.max(np.abs(y))
    if peak > 0:
        # Skip normalisation for an all-zero chunk to avoid NaNs.
        y = y / peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
        # Start of a new utterance: a zero-filled final call is issued before
        # decoding, apparently to flush any decoder state left over.
        model(np.zeros(stream.shape), is_final=True)

    if chunk_is_silent:
        silence_time += chunk_seconds
    else:
        silence_time = 0

    if silence_time >= silence_duration:
        output = model(stream, is_final=True)
        return (None, 0), _best_text(output)

    # NOTE(review): the whole accumulated stream is passed on every call; if
    # the streaming model buffers audio across calls this re-feeds earlier
    # samples -- confirm against the Speech2TextStreaming API.
    output = model(stream)
    return (stream, silence_time), _best_text(output)


def clear_transcription():
    """Reset the utterance state and clear the displayed text."""
    return (None, 0), ""


with gr.Blocks() as demo:
    state = gr.State((None, 0))
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()
    clear_button = gr.Button("Clear")

    audio.stream(transcribe, inputs=[state, audio], outputs=[state, text])
    clear_button.click(clear_transcription, inputs=[], outputs=[state, text])

demo.launch()