ankita-01 committed on
Commit
5f570d8
·
1 Parent(s): 2ebe705

Add app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -0
app.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio
3
+ import numpy as np
4
+ from espnet2.bin.st_inference_streaming import Speech2TextStreaming
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ import librosa
8
+
9
# Load your custom model (ESPnet Speech2TextStreaming).
model = Speech2TextStreaming(
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # path to your model weights
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # path to your config file
    # Run on the GPU when one is available; otherwise fall back to the CPU so
    # the app still starts on hosts without CUDA.
    device="cuda" if torch.cuda.is_available() else "cpu",
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,
)
18
+
19
+
20
+
21
silence_threshold = 0.01  # Adjust this threshold based on your audio levels
silence_duration = 1.0  # Duration of silence to detect (in seconds)


def is_silence(audio_chunk, sr, threshold=silence_threshold):
    """Return True when the chunk's mean absolute amplitude is below *threshold*.

    Args:
        audio_chunk: Array-like of audio samples.
        sr: Sample rate of the chunk. Kept for interface compatibility; the
            level computation does not use it.
        threshold: Mean-absolute-amplitude level below which the chunk is
            considered silent. Defaults to ``silence_threshold``.

    Returns:
        bool: True if the chunk is silent (or contains no samples at all).
    """
    samples = np.asarray(audio_chunk)
    if samples.size == 0:
        # np.mean of an empty array is NaN (with a RuntimeWarning); a chunk
        # with no samples carries no signal, so report it as silent.
        return True
    return bool(np.mean(np.abs(samples)) < threshold)
26
+
27
def transcribe(state, new_chunk):
    """Process one streamed microphone chunk and return updated state and text.

    Args:
        state: ``(stream, silence_time)`` tuple. ``stream`` is the audio
            accumulated so far (``None`` when no utterance is in progress) and
            ``silence_time`` is the number of seconds of trailing silence.
        new_chunk: ``(sample_rate, samples)`` tuple from the audio component,
            or ``None`` when no audio was delivered.

    Returns:
        tuple: ``(new_state, text)`` where ``new_state`` has the same layout as
        ``state`` and ``text`` is the first hypothesis returned by the model
        (empty string when the model returns nothing).
    """
    target_sr = 16000
    stream, silence_time = state
    if silence_time is None:
        # Tolerate a state produced by older code that stored None here.
        silence_time = 0

    if new_chunk is None:
        # Reset with a numeric silence counter so the next chunk can add to it.
        return (None, 0), ""

    sr, y = new_chunk
    y = np.asarray(y)
    if y.size == 0:
        # Nothing to process; np.max below would raise on an empty array.
        return (stream, silence_time), ""

    # Integer PCM is rescaled to [-1, 1] for the silence check only, so that
    # ``silence_threshold`` is compared against a bounded amplitude.
    # Float input is assumed to already be in [-1, 1] -- TODO confirm.
    if np.issubdtype(y.dtype, np.integer):
        full_scale = float(np.iinfo(y.dtype).max)
    else:
        full_scale = 1.0
    y = y.astype(np.float32)

    if y.ndim > 1:
        # Downmix to mono; assumes a (samples, channels) layout -- TODO confirm.
        y = y.mean(axis=1)

    # Measure duration and loudness on the chunk as recorded: after resampling
    # ``len(y) / sr`` is no longer the real duration, and after peak
    # normalisation every chunk has the same peak level.
    chunk_seconds = len(y) / sr
    chunk_is_silent = is_silence(y / full_scale, sr)

    if sr != target_sr:
        y = librosa.resample(y=y, orig_sr=sr, target_sr=target_sr)

    peak = np.max(np.abs(y))
    if peak > 0:
        # Skip the division for an all-zero chunk to avoid NaNs.
        y = y / peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
        # First chunk of a new utterance: run a finalising pass over zeros.
        # NOTE(review): presumably this clears decoder state left over from a
        # previous utterance -- confirm against the model's streaming API.
        model(np.zeros(stream.shape), is_final=True)

    if chunk_is_silent:
        silence_time += chunk_seconds
    else:
        silence_time = 0

    # NOTE(review): the whole accumulated stream is passed on every call;
    # verify whether the streaming model expects only the new chunk instead.
    if silence_time >= silence_duration:
        # Enough trailing silence: finalise the utterance and start afresh.
        output = model(stream, is_final=True)
        return (None, 0), output[0][0] if output else ""

    output = model(stream)
    return (stream, silence_time), output[0][0] if output else ""
56
+
57
def clear_transcription():
    """Return a fresh ``(state, text)`` pair for resetting the interface.

    The state holds no buffered audio and a zero silence counter; the text is
    an empty string.
    """
    fresh_state = (None, 0)
    empty_text = ""
    return fresh_state, empty_text
59
+
60
# Build the Gradio interface: a streaming microphone input, a textbox for the
# model output, and a button that resets both the state and the textbox.
with gr.Blocks() as demo:
    # State layout: (accumulated audio or None, seconds of trailing silence).
    state = gr.State(value=(None, 0))
    audio = gr.Audio(type="numpy", streaming=True, sources=["microphone"])
    text = gr.Textbox()
    clear_button = gr.Button(value="Clear")

    # Each streamed chunk goes through transcribe together with the state.
    audio.stream(
        fn=transcribe,
        inputs=[state, audio],
        outputs=[state, text],
    )
    clear_button.click(
        fn=clear_transcription,
        inputs=[],
        outputs=[state, text],
    )

demo.launch()