"""Gradio app that transcribes uploaded or live-streamed audio with faster-whisper.

Two tabs are exposed: one for a complete file (upload or recording) and one
that re-transcribes a growing microphone stream as new chunks arrive.
"""

import os
import time
import wave

import gradio as gr
from faster_whisper import WhisperModel
import torch
import numpy as np

# Sample rate faster-whisper assumes when it is handed a raw waveform instead
# of a file path (file inputs are decoded and resampled by the library).
WHISPER_SAMPLE_RATE = 16000


def model_init(model_size: str = "large-v3") -> WhisperModel:
    """Load a faster-whisper model on the GPU if CUDA is available, else on the CPU.

    Args:
        model_size: Name of the Whisper checkpoint to load.

    Returns:
        A ``WhisperModel`` using FP16 on GPU or INT8 on CPU.
    """
    if torch.cuda.is_available():
        # FP16 on GPU; compute_type="int8_float16" is a lower-memory alternative.
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
        print("--------------")
        print("Model runs on GPU")
        print("--------------")
        return model
    # INT8 on CPU.
    return WhisperModel(model_size, device="cpu", compute_type="int8")


model = model_init()


def _prepare_chunk(sr: int, samples: np.ndarray) -> np.ndarray:
    """Turn one microphone chunk into mono float32 audio at WHISPER_SAMPLE_RATE."""
    audio = np.asarray(samples, dtype=np.float32)
    if audio.ndim > 1:
        # Assumes (samples, channels) layout, as Gradio delivers it; downmix to mono.
        audio = audio.mean(axis=1)
    if audio.size == 0:
        return audio

    # Peak-normalise, but leave a silent chunk untouched: dividing by a zero
    # peak would fill the stream with NaN.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak

    if sr != WHISPER_SAMPLE_RATE:
        # Linear-interpolation resampling: crude (no anti-alias filter) but
        # dependency-free, and far better than feeding audio at the wrong rate.
        n_out = max(1, int(round(audio.size * WHISPER_SAMPLE_RATE / sr)))
        src_pos = np.linspace(0.0, audio.size - 1, num=n_out)
        audio = np.interp(src_pos, np.arange(audio.size), audio).astype(np.float32)
    return audio


def _summarise(segments, info, start_time: float) -> tuple[str, str, str]:
    """Build the (transcript, language line, timing line) strings shown in the UI."""
    # Iterating the segments is what drives decoding, so the elapsed time is
    # measured only after the transcript has been assembled.
    transcript = "\n".join(
        f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments
    )
    language_info = (
        f"Detected language: {info.language} "
        f"with probability {info.language_probability:.2f}"
    )
    execution_time = f"Execution time: {time.time() - start_time:.2f} seconds"
    return transcript, language_info, execution_time


def transcribe_moon(stream, new_chunk):
    """Append a microphone chunk to the running stream and transcribe all of it.

    Args:
        stream: Audio accumulated so far (mono float32 at WHISPER_SAMPLE_RATE),
            or ``None`` on the first call.
        new_chunk: ``(sample_rate, samples)`` tuple from the streaming audio
            component, or ``None`` when no audio is available.

    Returns:
        ``(stream, transcript, language_info, execution_time)`` where ``stream``
        is the updated state to pass back on the next call.
    """
    start_time = time.time()
    if new_chunk is None:
        # Nothing recorded (e.g. the component was cleared): keep state as is.
        return stream, "", "", ""

    sr, y = new_chunk
    chunk = _prepare_chunk(sr, y)
    stream = chunk if stream is None else np.concatenate([stream, chunk])
    if stream.size == 0:
        return stream, "", "", ""

    # The whole accumulated stream is re-transcribed with the library's
    # default decoding options (no beam-size or VAD overrides).
    segments, info = model.transcribe(stream)
    transcript, language_info, execution_time = _summarise(segments, info, start_time)
    return stream, transcript, language_info, execution_time


def transcribe(audio_file):
    """Transcribe a complete audio file.

    Args:
        audio_file: Path to the uploaded or recorded audio, or ``None`` when
            the input component is empty.

    Returns:
        ``(transcript, language_info, execution_time)`` strings; all empty when
        there is no input.
    """
    start_time = time.time()
    if not audio_file:
        # live=True also fires when the audio is cleared; there is nothing to do.
        return "", "", ""

    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
    )
    return _summarise(segments, info, start_time)


# File tab: accepts an upload or a finished microphone recording.
input_audio = gr.Audio(
    sources=["upload", "microphone"],
    type="filepath",
    label="Upload or Record Audio",
)
file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    live=True,
    title="Speak to Transcript",
    description="Upload an audio file to transcribe and detect the spoken language.",
)

# Live tab: streams microphone chunks and threads the accumulated audio
# through Gradio's "state" input/output.
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    transcribe_moon,
    ["state", input_audio_mic],
    ["state", "text", "text", "text"],
    live=True,
)

# Combine both interfaces in a single Gradio app using tabs.
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"],
)

if __name__ == "__main__":
    tabbed_interface.launch(debug=True)