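"""Gradio app for transcribing uploaded MP3 audio or MP4 video with Whisper.

MP4 uploads are first converted to MP3 with pydub, then transcribed with the
transformers automatic-speech-recognition pipeline.
"""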
import os
import tempfile

import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from pydub import AudioSegment

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8

# Use the first GPU if one is available, otherwise fall back to the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Build the Whisper ASR pipeline once at start-up; chunk_length_s=30 lets the
# pipeline transcribe audio longer than the model's 30-second window.
whisper_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


def convert_mp4_to_mp3(mp4_path, mp3_path):
    """Extract the audio track from an MP4 file and export it as an MP3."""
    try:
        audio = AudioSegment.from_file(mp4_path, format="mp4")
        audio.export(mp3_path, format="mp3")
    except Exception as e:
        raise RuntimeError(f"Error converting MP4 to MP3: {e}")


def transcribe_audio(audio_path):
    """Transcribe an audio file with the Whisper pipeline and return the text."""
    try:
        with open(audio_path, "rb") as audio_file:
            audio_data = audio_file.read()

        # Decode the raw bytes with ffmpeg and resample them to the rate the
        # model's feature extractor expects.
        inputs = ffmpeg_read(audio_data, whisper_pipeline.feature_extractor.sampling_rate)
        inputs = {"array": inputs, "sampling_rate": whisper_pipeline.feature_extractor.sampling_rate}

        result = whisper_pipeline(inputs, batch_size=BATCH_SIZE, return_timestamps=False)
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"


def transcribe_file(file):
    """Gradio handler: transcribe an uploaded MP3 or MP4 file."""
    # Convert MP4 uploads to a temporary MP3 before transcription.
    if file.name.endswith(".mp4"):
        temp_mp3_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        try:
            convert_mp4_to_mp3(file.name, temp_mp3_path)
            audio_path = temp_mp3_path
        except Exception as e:
            return f"Error during MP4 to MP3 conversion: {e}"
    else:
        audio_path = file.name

    transcription = transcribe_audio(audio_path)

    if file.name.endswith(".mp4"):
        os.remove(temp_mp3_path)

    return transcription


def launch_gradio():
    with gr.Blocks() as demo:
        gr.Markdown("# Audio Transcription with Whisper Model")

        gr.Interface(
            fn=transcribe_file,
            inputs=gr.File(label="Upload Audio/Video File (MP4 or MP3)"),
            outputs=gr.Textbox(label="Transcribed Text"),
        )

    demo.launch(share=True)


if __name__ == "__main__":
    launch_gradio()