# Spaces: Running
import logging
import os
import tempfile

import google.generativeai as genai
import gradio as gr
import torch
from faster_whisper import WhisperModel
from moviepy.video.io.VideoFileClip import VideoFileClip
# Keep moviepy quiet: only ERROR-level records from its logger get through.
logging.getLogger("moviepy").setLevel(logging.ERROR)

# Authenticate the Gemini client. The key is read from the environment and a
# missing GEMINI_API_KEY raises KeyError here, at import time.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Generation settings shared by every request sent through `model`.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-exp",
    generation_config=generation_config,
)

# Whisper checkpoint, and the device / precision pair it is loaded with:
# float32 on CUDA, int8 on CPU.
MODEL_NAME = "Systran/faster-whisper-large-v3"
if torch.cuda.is_available():
    device, compute_type = "cuda", "float32"
else:
    device, compute_type = "cpu", "int8"

# Load the Whisper model once at import time; it is reused for every request.
whisper_model = WhisperModel(MODEL_NAME, device=device, compute_type=compute_type)
# Language choices offered in the UI. The first entry, "Auto Detect", is a
# sentinel meaning "let Whisper detect the language"; the remaining entries
# are language display names.
SUPPORTED_LANGUAGES = [
    "Auto Detect",
    "English", "Chinese", "German", "Spanish", "Russian",
    "Korean", "French", "Japanese", "Portuguese", "Turkish",
    "Polish", "Catalan", "Dutch", "Arabic", "Swedish",
    "Italian", "Indonesian", "Hindi", "Finnish", "Vietnamese",
    "Hebrew", "Ukrainian", "Greek", "Malay", "Czech",
    "Romanian", "Danish", "Hungarian", "Tamil", "Norwegian",
    "Thai", "Urdu", "Croatian", "Bulgarian", "Lithuanian",
    "Latin", "Maori", "Malayalam", "Welsh", "Slovak",
    "Telugu", "Persian", "Latvian", "Bengali", "Serbian",
    "Azerbaijani", "Slovenian", "Kannada", "Estonian", "Macedonian",
    "Breton", "Basque", "Icelandic", "Armenian", "Nepali",
    "Mongolian", "Bosnian", "Kazakh", "Albanian", "Swahili",
    "Galician", "Marathi", "Punjabi", "Sinhala", "Khmer",
    "Shona", "Yoruba", "Somali", "Afrikaans", "Occitan",
    "Georgian", "Belarusian", "Tajik", "Sindhi", "Gujarati",
    "Amharic", "Yiddish", "Lao", "Uzbek", "Faroese",
    "Haitian Creole", "Pashto", "Turkmen", "Nynorsk", "Maltese",
    "Sanskrit", "Luxembourgish", "Burmese", "Tibetan", "Tagalog",
    "Malagasy", "Assamese", "Tatar", "Hawaiian", "Lingala",
    "Hausa", "Bashkir", "Javanese", "Sundanese",
]
def extract_audio_from_video(video_file):
    """Extract the audio track of a video into a 16 kHz WAV file.

    Args:
        video_file: Path of the video file to read.

    Returns:
        Path of the newly written WAV file. The file is created under a
        unique temporary name so simultaneous requests cannot overwrite each
        other's audio; the caller is responsible for deleting it.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_file)
    try:
        if video.audio is None:
            raise ValueError("The video file does not contain an audio track.")

        # Reserve a unique path; only the name is needed, so close the handle.
        fd, audio_file = tempfile.mkstemp(prefix="extracted_audio_", suffix=".wav")
        os.close(fd)
        try:
            video.audio.write_audiofile(audio_file, fps=16000, logger=None)  # Suppress logs
        except Exception:
            # Do not leave a partial/empty WAV behind when the export fails.
            if os.path.exists(audio_file):
                os.remove(audio_file)
            raise
    finally:
        # Always release the clip's underlying reader resources.
        video.close()
    return audio_file
# Lower-cased language display name -> Whisper language code.
_WHISPER_LANGUAGE_CODES = {
    "english": "en", "chinese": "zh", "german": "de", "spanish": "es",
    "russian": "ru", "korean": "ko", "french": "fr", "japanese": "ja",
    "portuguese": "pt", "turkish": "tr", "polish": "pl", "catalan": "ca",
    "dutch": "nl", "arabic": "ar", "swedish": "sv", "italian": "it",
    "indonesian": "id", "hindi": "hi", "finnish": "fi", "vietnamese": "vi",
    "hebrew": "he", "ukrainian": "uk", "greek": "el", "malay": "ms",
    "czech": "cs", "romanian": "ro", "danish": "da", "hungarian": "hu",
    "tamil": "ta", "norwegian": "no", "thai": "th", "urdu": "ur",
    "croatian": "hr", "bulgarian": "bg", "lithuanian": "lt", "latin": "la",
    "maori": "mi", "malayalam": "ml", "welsh": "cy", "slovak": "sk",
    "telugu": "te", "persian": "fa", "latvian": "lv", "bengali": "bn",
    "serbian": "sr", "azerbaijani": "az", "slovenian": "sl", "kannada": "kn",
    "estonian": "et", "macedonian": "mk", "breton": "br", "basque": "eu",
    "icelandic": "is", "armenian": "hy", "nepali": "ne", "mongolian": "mn",
    "bosnian": "bs", "kazakh": "kk", "albanian": "sq", "swahili": "sw",
    "galician": "gl", "marathi": "mr", "punjabi": "pa", "sinhala": "si",
    "khmer": "km", "shona": "sn", "yoruba": "yo", "somali": "so",
    "afrikaans": "af", "occitan": "oc", "georgian": "ka", "belarusian": "be",
    "tajik": "tg", "sindhi": "sd", "gujarati": "gu", "amharic": "am",
    "yiddish": "yi", "lao": "lo", "uzbek": "uz", "faroese": "fo",
    "haitian creole": "ht", "pashto": "ps", "turkmen": "tk", "nynorsk": "nn",
    "maltese": "mt", "sanskrit": "sa", "luxembourgish": "lb", "burmese": "my",
    "tibetan": "bo", "tagalog": "tl", "malagasy": "mg", "assamese": "as",
    "tatar": "tt", "hawaiian": "haw", "lingala": "ln", "hausa": "ha",
    "bashkir": "ba", "javanese": "jw", "sundanese": "su",
}


def generate_subtitles(audio_file, language="Auto Detect"):
    """Generate SRT subtitles from an audio file using Whisper.

    Args:
        audio_file: Path of the audio file to transcribe.
        language: Language display name (e.g. "English"), a language code
            (e.g. "en"), or "Auto Detect" / None to let Whisper detect it.

    Returns:
        A tuple ``(srt_text, language)`` where ``srt_text`` is the subtitles
        in SRT format and ``language`` is the language reported by Whisper.
    """
    if not language or language == "Auto Detect":
        language_code = None
    else:
        # The transcriber is given a language code rather than a display
        # name. Unknown values are passed through lower-cased so callers that
        # already supply a code keep working.
        key = language.strip().lower()
        language_code = _WHISPER_LANGUAGE_CODES.get(key, key)

    # Transcribe the audio
    segments, info = whisper_model.transcribe(
        audio_file,
        task="transcribe",
        language=language_code,
        word_timestamps=True,
    )

    # Build one SRT cue per segment, then join once instead of repeatedly
    # concatenating onto a growing string.
    cues = []
    for index, segment in enumerate(segments, start=1):
        start_srt = format_timestamp(segment.start)
        end_srt = format_timestamp(segment.end)
        cues.append(f"{index}\n{start_srt} --> {end_srt}\n{segment.text.strip()}\n\n")
    return "".join(cues), info.language
def format_timestamp(seconds):
    """Convert seconds to SRT timestamp format (HH:MM:SS,mmm).

    The value is rounded to the nearest millisecond before being split into
    fields. Splitting first and truncating the fractional part afterwards
    loses a millisecond to floating-point error (e.g. 2.3 s -> ",299").

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT cannot represent them.

    Returns:
        The timestamp as a string such as "01:01:01,500".
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"
def translate_srt(srt_text, target_language):
    """Translate SRT subtitles with Gemini while preserving timestamps.

    Args:
        srt_text: Subtitles in SRT format.
        target_language: Name of the language to translate into.

    Returns:
        The text of the model's response. Empty or whitespace-only input is
        returned as-is (None becomes "") without contacting the model,
        because there is nothing to translate.
    """
    if not (srt_text or "").strip():
        return srt_text or ""

    # Magic prompt for Gemini
    prompt = (
        f"Translate the following SRT subtitles into {target_language}. "
        "Preserve the SRT format (timestamps and structure). "
        "Translate only the text after the timestamp. "
        "Do not add explanations or extra text."
        f"\n\n{srt_text}"
    )

    # Send the prompt to Gemini
    response = model.generate_content(prompt)
    return response.text
def process_video(video_file, language="Auto Detect", translate_to=None):
    """Process a video file to generate and optionally translate subtitles.

    Args:
        video_file: Path of the uploaded video.
        language: Spoken language, or "Auto Detect".
        translate_to: Target language for translation; None or "None" skips
            the translation step.

    Returns:
        A tuple ``(original_srt_file, translated_srt_file, detected_language)``.
        ``translated_srt_file`` is None when no translation was requested.

    Raises:
        gr.Error: If no video file was provided.
    """
    if not video_file:
        raise gr.Error("Please upload a video file first.")

    # Extract audio from the video
    audio_file = extract_audio_from_video(video_file)
    try:
        # Generate subtitles
        subtitles, detected_language = generate_subtitles(audio_file, language)
    finally:
        # The audio is only needed for transcription; remove it even when
        # transcription fails so extracted files do not pile up.
        try:
            os.remove(audio_file)
        except FileNotFoundError:
            pass

    # Write results into a per-request directory so simultaneous requests
    # cannot overwrite each other's subtitle files.
    output_dir = tempfile.mkdtemp(prefix="autosubgen_")

    # Save original subtitles to an SRT file
    original_srt_file = os.path.join(output_dir, "original_subtitles.srt")
    with open(original_srt_file, "w", encoding="utf-8") as f:
        f.write(subtitles)

    # Translate subtitles if a target language is provided
    translated_srt_file = None
    if translate_to and translate_to != "None":
        translated_subtitles = translate_srt(subtitles, translate_to)
        translated_srt_file = os.path.join(output_dir, "translated_subtitles.srt")
        with open(translated_srt_file, "w", encoding="utf-8") as f:
            f.write(translated_subtitles)

    return original_srt_file, translated_srt_file, detected_language
# Build the Gradio UI: a header, one tab with the inputs and outputs, and the
# button wiring that runs `process_video`.
with gr.Blocks(title="AutoSubGen - AI Video Subtitle Generator") as demo:
    # Page header.
    with gr.Column():
        gr.Markdown("# 🎥 AutoSubGen")
        gr.Markdown("### AI-Powered Video Subtitle Generator")
        gr.Markdown("Automatically generate and translate subtitles for your videos in **SRT format**. Supports **100+ languages** and **auto-detection**.")

    # Main tab: upload, language selection and result downloads.
    with gr.Tab("Generate Subtitles"):
        gr.Markdown("### Upload a video file to generate subtitles.")

        # Inputs.
        with gr.Row():
            video_input = gr.Video(label="Upload Video File", scale=2)
            language_dropdown = gr.Dropdown(
                choices=SUPPORTED_LANGUAGES,
                label="Select Language",
                value="Auto Detect",
                scale=1,
            )
            translate_to_dropdown = gr.Dropdown(
                # "Auto Detect" is not a valid translation target, so it is
                # replaced by a "None" option that skips translation.
                choices=["None"] + SUPPORTED_LANGUAGES[1:],
                label="Translate To",
                value="None",
                scale=1,
            )

        generate_button = gr.Button("Generate Subtitles", variant="primary")

        # Outputs.
        with gr.Row():
            original_subtitle_output = gr.File(label="Download Original Subtitles (SRT)")
            translated_subtitle_output = gr.File(label="Download Translated Subtitles (SRT)")
            detected_language_output = gr.Textbox(label="Detected Language")

    # Clicking the button runs the pipeline; the three return values of
    # `process_video` fill the three output components in order.
    generate_button.click(
        fn=process_video,
        inputs=[video_input, language_dropdown, translate_to_dropdown],
        outputs=[
            original_subtitle_output,
            translated_subtitle_output,
            detected_language_output,
        ],
    )
# Launch the Gradio interface with a public link. Guarded so that importing
# this module (e.g. from a test or another script) does not start a server.
if __name__ == "__main__":
    demo.launch(share=True)