import time

import gradio as gr
import numpy as np
import torch
from faster_whisper import WhisperModel
def model_init():
    # get device
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_size = "large-v3"
    if device == "cuda:0":
        # Run on GPU with FP16
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
        print("--------------")
        print("Model runs on GPU")
        print("--------------")
        # or run on GPU with INT8:
        # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
    else:
        # Run on CPU with INT8
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
    return model
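# Instantiate the model once at import time. Note: the first run downloads the
# CTranslate2 weights for "large-v3" from the Hugging Face Hub (several GB).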
model = model_init()
def transcribe_moon(stream, new_chunk):
    start_time = time.time()  # Start timing
    sr, y = new_chunk
    # Normalize the incoming PCM chunk to float32 in [-1, 1]
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # guard against all-silent chunks (division by zero)
        y /= peak
    # Accumulate the chunk onto the running stream buffer kept in session state
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
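    # Assumption: when given a raw numpy array, faster-whisper expects 16 kHz
    # mono float32 audio, while Gradio streams at the device's native rate
    # (often 48 kHz). A minimal linear-interpolation resample is sketched here;
    # a proper resampler (e.g. librosa or torchaudio) would give better quality.
    if sr != 16000:
        target_len = int(len(stream) * 16000 / sr)
        stream_16k = np.interp(
            np.linspace(0, len(stream), num=target_len, endpoint=False),
            np.arange(len(stream)),
            stream,
        ).astype(np.float32)
    else:
        stream_16k = stream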
    # Transcribe the accumulated buffer. Note: the whole stream is re-transcribed
    # on every new chunk, so per-chunk latency grows with recording length.
    segments, info = model.transcribe(
        stream_16k,
        # beam_size=5,
        # vad_filter=True,
        # vad_parameters={'min_silence_duration_ms': 500},
    )
    # Compile the transcript with timestamps
    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
    return stream, transcript, language_info, execution_time
def transcribe(audio_file):
    start_time = time.time()  # Start timing
    # Transcribe the uploaded file with beam search and VAD-based silence filtering
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={'min_silence_duration_ms': 500},
    )
    # Compile the transcript with timestamps
    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"
    return transcript, language_info, execution_time
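# Quick sanity check outside Gradio (hypothetical file name; adjust to a real path):
# transcript, lang, timing = transcribe("sample.mp3")
# print(lang, timing, transcript, sep="\n")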
# Input and interface setup for file upload / one-shot recording
input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    title="Whisper Model Transcription",
    description="Upload or record audio to transcribe and detect the spoken language."
)
# Streaming interface: the "state" slot carries the accumulated audio buffer
# between chunks, matching transcribe_moon's (stream, new_chunk) signature
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    fn=transcribe_moon,
    inputs=["state", input_audio_mic],
    outputs=["state", "text", "text", "text"],
    live=True,
)
# Combine both interfaces in a single Gradio app using tabs
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"]
)
tabbed_interface.launch(debug=True)
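# When running locally, launch(share=True) could be used instead to expose a
# temporary public URL (assumption: unnecessary on Hugging Face Spaces, which
# serves the app directly).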