# whisper-gradio / app.py
# Author: AbdullahAdeeb
# Version: 0.0.1 (commit 973bb27)
import gradio as gr
import whisper
import torch
import json
import spaces
from datetime import timedelta
import os
import zipfile
from pathlib import Path
def format_timestamp(seconds):
    """Convert a duration in seconds to an SRT timestamp (HH:MM:SS,mmm).

    Args:
        seconds: Non-negative duration in seconds (float or int).

    Returns:
        str: Timestamp formatted as ``HH:MM:SS,mmm`` per the SRT spec.

    Note:
        Computes from total milliseconds rather than ``timedelta.seconds``,
        which excludes whole days and would wrap durations >= 24 hours
        (e.g. 25h would have rendered as 01:xx:xx).
    """
    total_ms = round(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
def save_files(text, srt, json_data, base_name):
    """Save a transcription as .txt, .srt and .json, plus a ZIP of all three.

    Files are written into a local ``transcriptions/`` directory (created on
    first use), named after the stem of *base_name*.

    Args:
        text: Plain-text transcription.
        srt: Transcription in SRT subtitle format.
        json_data: Transcription serialized as a JSON string.
        base_name: Original audio filename; only its stem is used.

    Returns:
        tuple[str, str, str, str]: Paths to the txt, srt, json and zip files.
    """
    output_dir = Path("transcriptions")
    output_dir.mkdir(exist_ok=True)

    # Derive sibling output paths from the audio file's stem.
    base_name = Path(base_name).stem
    txt_path = output_dir / f"{base_name}.txt"
    srt_path = output_dir / f"{base_name}.srt"
    json_path = output_dir / f"{base_name}.json"
    zip_path = output_dir / f"{base_name}_all.zip"

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # can raise UnicodeEncodeError on non-ASCII transcripts.
    txt_path.write_text(text, encoding="utf-8")
    srt_path.write_text(srt, encoding="utf-8")
    json_path.write_text(json_data, encoding="utf-8")

    # Bundle the three outputs for a single download.
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        zipf.write(txt_path, txt_path.name)
        zipf.write(srt_path, srt_path.name)
        zipf.write(json_path, json_path.name)

    return str(txt_path), str(srt_path), str(json_path), str(zip_path)
@spaces.GPU
def transcribe(audio_file):
    """Transcribe an audio file with Whisper into text, SRT and JSON.

    Args:
        audio_file: Filesystem path to the uploaded audio file.

    Returns:
        tuple: (txt_path, srt_path, json_path, zip_path,
                text_output, srt_output, json_output) — four file paths for
        the download components followed by the three inline previews.
    """
    # NOTE(review): the model is loaded on every call; acceptable inside the
    # @spaces.GPU window, but caching it would speed up repeat requests.
    model = whisper.load_model("large-v3-turbo")
    result = model.transcribe(audio_file)

    text_output = result["text"]

    # ensure_ascii=False keeps non-Latin transcripts human-readable in the
    # JSON download instead of \uXXXX escapes.
    json_output = json.dumps(result, indent=2, ensure_ascii=False)

    # Build the SRT document: numbered cues separated by blank lines.
    srt_blocks = []
    for i, segment in enumerate(result["segments"], 1):
        start_time = format_timestamp(segment["start"])
        end_time = format_timestamp(segment["end"])
        text = segment["text"].strip()
        srt_blocks.append(f"{i}\n{start_time} --> {end_time}\n{text}\n\n")
    srt_output = "".join(srt_blocks)

    # Persist all three formats (plus a ZIP) and return paths + previews.
    txt_file, srt_file, json_file, zip_file = save_files(
        text_output, srt_output, json_output,
        os.path.basename(audio_file)
    )
    return (
        txt_file, srt_file, json_file, zip_file, text_output, srt_output, json_output
    )
# Assemble the Gradio UI: a single audio input mapped to four download
# components and three inline previews (mirroring transcribe()'s return order).
_output_components = [
    gr.File(label="Download TXT"),
    gr.File(label="Download SRT"),
    gr.File(label="Download JSON"),
    gr.File(label="Download All (ZIP)"),
    gr.Textbox(label="Transcription", lines=5),
    gr.Textbox(label="SRT Format"),
    gr.JSON(label="JSON Output"),
]

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=_output_components,
    title="Audio Transcription with Whisper",
    description="Upload an audio file to transcribe it into text, SRT, and JSON formats using OpenAI's Whisper model. You can download the results in different formats or get everything in a ZIP file."
)

# Launch only when executed as a script (imports stay side-effect free).
if __name__ == "__main__":
    demo.launch()