Add application file
app.py
ADDED
@@ -0,0 +1,106 @@
+import gradio as gr
+from faster_whisper import WhisperModel
+import torch
+import numpy as np
+
+import os
+import wave
+
+def model_init():
+    # get device
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    model_size = "large-v3"
+    if device == "cuda:0":
+        # Run on GPU with FP16
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+        print("--------------")
+        print("Model runs on GPU")
+        print("--------------")
+        # or Run on GPU with INT8
+        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+    else:
+        # Run on CPU with INT8
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    return model
+
+model = model_init()
+
+import time
+
+def transcribe_moon(stream, new_chunk):
+    start_time = time.time()  # Start timing
+    sr, y = new_chunk
+    y = y.astype(np.float32)
+    y /= np.max(np.abs(y))
+
+    if stream is not None:
+        stream = np.concatenate([stream, y])
+    else:
+        stream = y
+
+    # Perform the transcription using the specified model and settings
+    segments, info = model.transcribe(
+        stream,
+    )
+    # beam_size=5,
+    # vad_filter=True,
+    # vad_parameters={'min_silence_duration_ms': 500}
+    # return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
+    # Compile the transcript with timestamps
+    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
+    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
+
+    end_time = time.time()  # End timing
+    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"  # Calculate execution time
+
+    return stream, transcript, language_info, execution_time
+
+def transcribe(audio_file):
+    start_time = time.time()  # Start timing
+
+    # Perform the transcription using the specified model and settings
+    segments, info = model.transcribe(
+        audio_file,
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters={'min_silence_duration_ms': 500}
+    )
+
+    # Compile the transcript with timestamps
+    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
+    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
+
+    end_time = time.time()  # End timing
+    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"  # Calculate execution time
+
+    return transcript, language_info, execution_time
+
+
+# Input and Interface setup for file upload
+# , "microphone"
+input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
+
+file_upload_interface = gr.Interface(
+    fn=transcribe,
+    inputs=input_audio,
+    outputs=["text", "text", "text"],
+    title="Whisper Model Transcription",
+    description="Upload an MP3 file to transcribe and detect the spoken language."
+)
+
+input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
+streaming_interface = gr.Interface(
+    transcribe_moon,
+    ["state", gr.Audio(sources=["microphone"], streaming=True)],
+    ["state", "text", "text", "text"],
+    live=True,
+)
+
+# Combine both interfaces in a single Gradio app using Tabs
+tabbed_interface = gr.TabbedInterface(
+    interface_list=[file_upload_interface, streaming_interface],
+    tab_names=["Upload File", "Live Stream"]
+)
+
+tabbed_interface.launch(debug=True)
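Note: for the Space to build, a requirements.txt would normally sit next to app.py listing the packages imported above. That file is not part of this commit, so the following is only a plausible sketch covering the imports used (left unpinned deliberately):

gradio
faster-whisper
torch
numpy

A further caveat on the "Live Stream" tab: when transcribe_moon hands a raw NumPy array to model.transcribe, faster-whisper expects 16 kHz mono float32 audio, while Gradio microphone chunks typically arrive at 44.1 or 48 kHz (the unpacked sr is currently unused), and y /= np.max(np.abs(y)) divides by zero on a silent chunk. A minimal helper along these lines could prepare each chunk first; to_whisper_audio is a hypothetical name and is not part of app.py:

import numpy as np

def to_whisper_audio(sr, y, target_sr=16000):
    """Hypothetical helper: convert a Gradio (sr, samples) chunk to 16 kHz mono float32."""
    y = y.astype(np.float32)
    if y.ndim > 1:
        y = y.mean(axis=1)  # downmix stereo to mono
    peak = np.max(np.abs(y))
    if peak > 0:
        y = y / peak  # normalize, skipping silent chunks to avoid division by zero
    if sr != target_sr:
        # crude linear-interpolation resample; librosa or torchaudio would do this more accurately
        duration = y.shape[0] / sr
        n_out = int(round(duration * target_sr))
        t_out = np.linspace(0.0, duration, num=n_out, endpoint=False)
        t_in = np.linspace(0.0, duration, num=y.shape[0], endpoint=False)
        y = np.interp(t_out, t_in, y).astype(np.float32)
    return y

Inside transcribe_moon this would replace the manual astype/normalize lines, e.g. y = to_whisper_audio(sr, y) before concatenating onto stream.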