manuth committed on
Commit
98b12fe
1 Parent(s): 1bf04d4

Add application file

Files changed (1)
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
+import time
+
+import gradio as gr
+import numpy as np
+import torch
+from faster_whisper import WhisperModel
+
+
+def model_init():
+    # Pick the GPU if one is available, otherwise fall back to the CPU.
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    model_size = "large-v3"
+    if device == "cuda:0":
+        # Run on GPU with FP16
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+        print("--------------")
+        print("Model runs on GPU")
+        print("--------------")
+        # or run on GPU with INT8:
+        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+    else:
+        # Run on CPU with INT8
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    return model
+
+
+model = model_init()
+
+
+def transcribe_moon(stream, new_chunk):
+    start_time = time.time()  # Start timing
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    # Normalize to [-1, 1]; guard against an all-zero (silent) chunk.
+    peak = np.max(np.abs(y))
+    if peak > 0:
+        y /= peak
+
+    # Accumulate audio across streamed chunks.
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+
+    # Transcribe everything received so far. Note that faster-whisper
+    # expects 16 kHz mono float32 when given a raw numpy array.
+    segments, info = model.transcribe(
+        stream,
+        # beam_size=5,
+        # vad_filter=True,
+        # vad_parameters={'min_silence_duration_ms': 500},
+    )
+
+    # Compile the transcript with timestamps.
+    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
+    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
+
+    end_time = time.time()  # End timing
+    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
+
+    return stream, transcript, language_info, execution_time
+
+
+def transcribe(audio_file):
+    start_time = time.time()  # Start timing
+
+    # Transcribe the uploaded file with beam search and voice-activity filtering.
+    segments, info = model.transcribe(
+        audio_file,
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters={'min_silence_duration_ms': 500},
+    )
+
+    # Compile the transcript with timestamps.
+    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
+    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
+
+    end_time = time.time()  # End timing
+    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
+
+    return transcript, language_info, execution_time
+
+
+# Interface for file upload (or a one-shot microphone recording).
+input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
+
+file_upload_interface = gr.Interface(
+    fn=transcribe,
+    inputs=input_audio,
+    outputs=["text", "text", "text"],
+    title="Whisper Model Transcription",
+    description="Upload an audio file to transcribe and detect the spoken language."
+)
+
+# Interface for live transcription streamed from the microphone.
+input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
+streaming_interface = gr.Interface(
+    fn=transcribe_moon,
+    inputs=["state", input_audio_mic],
+    outputs=["state", "text", "text", "text"],
+    live=True,
+)
+
+# Combine both interfaces in a single Gradio app using tabs.
+tabbed_interface = gr.TabbedInterface(
+    interface_list=[file_upload_interface, streaming_interface],
+    tab_names=["Upload File", "Live Stream"]
+)
+
+tabbed_interface.launch(debug=True)
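
For local testing outside the Gradio UI, a minimal sketch of calling the file-based transcribe function directly; sample.wav is a hypothetical test recording (faster-whisper decodes common audio formats via its bundled loader when given a file path):

    transcript, language_info, execution_time = transcribe("sample.wav")  # hypothetical file
    print(language_info)
    print(execution_time)
    print(transcript)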