# speech_to_text / app.py
# Author: manuth
# Commit: 98b12fe ("Add application file")
import gradio as gr
from faster_whisper import WhisperModel
import torch
import numpy as np
import os
import wave
def model_init():
    """Load the faster-whisper large-v3 model, preferring GPU FP16 over CPU INT8.

    Returns:
        WhisperModel: ready-to-use transcription model.
    """
    model_size = "large-v3"
    if torch.cuda.is_available():
        # GPU available: FP16 gives the best speed/quality trade-off.
        whisper = WhisperModel(model_size, device="cuda", compute_type="float16")
        print("--------------")
        print("Model runs on GPU")
        print("--------------")
        # Alternative: INT8 on GPU
        # whisper = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
    else:
        # No GPU: INT8 quantization keeps CPU inference tractable.
        whisper = WhisperModel(model_size, device="cpu", compute_type="int8")
    return whisper
model = model_init()
import time
def transcribe_moon(stream, new_chunk):
    """Streaming transcription callback: accumulate audio chunks and re-transcribe.

    Args:
        stream: float32 np.ndarray of audio accumulated so far, or None on the
            first chunk (Gradio state input).
        new_chunk: (sample_rate, np.ndarray) tuple from a streaming gr.Audio.

    Returns:
        (stream, transcript, language_info, execution_time) — the updated state
        plus three display strings.
    """
    start_time = time.time()  # Start timing
    sr, y = new_chunk
    y = y.astype(np.float32)
    # Normalize to [-1, 1]. Guard the peak: an all-zero (silent) chunk would
    # otherwise divide by zero and fill the buffer with NaNs, breaking every
    # subsequent transcription of the accumulated stream.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    # Append the new chunk to the running buffer (or start a new one).
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    # Re-transcribe the entire accumulated buffer on every new chunk.
    segments, info = model.transcribe(
        stream,
    )
    # Compile the transcript with timestamps
    transcript = "\n".join([f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments])
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"  # Calculate execution time
    return stream, transcript, language_info, execution_time
def transcribe(audio_file):
    """Transcribe a complete audio file with VAD filtering and beam search.

    Args:
        audio_file: path to the uploaded/recorded audio file.

    Returns:
        (transcript, language_info, execution_time) display strings.
    """
    started = time.time()  # Start timing
    # Perform the transcription using the specified model and settings
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={'min_silence_duration_ms': 500}
    )
    # Build one timestamped line per segment, then join into the transcript.
    lines = [f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments]
    transcript = "\n".join(lines)
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"
    execution_time = f"Execution time: {time.time() - started:.2f} seconds"  # Calculate execution time
    return transcript, language_info, execution_time
# Input and Interface setup for file upload
input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")
file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    title="Whisper Model Transcription",
    description="Upload an MP3 file to transcribe and detect the spoken language."
)
# Streaming interface: gr.State carries the accumulated audio between chunks.
# Reuse the labeled streaming component instead of constructing a second,
# unlabeled gr.Audio inline (the original left input_audio_mic unused).
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    transcribe_moon,
    ["state", input_audio_mic],
    ["state", "text", "text", "text"],
    live=True,
)
# Combine both interfaces in a single Gradio app using Tabs
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"]
)
tabbed_interface.launch(debug=True)