Lenylvt committed on
Commit
9e156fa
·
verified ·
1 Parent(s): f42b460

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import ffmpeg
4
+ import pysrt
5
+ import pandas as pd
6
+ import requests
7
+ import io
8
+ from transformers import MarianMTModel, MarianTokenizer
9
+
10
def fetch_languages(url):
    """Download a pipe-delimited language table and build dropdown options.

    The document at ``url`` is parsed as "|"-separated text; the first two
    lines are skipped and columns that are entirely empty are dropped. The
    remaining four columns are labelled ISO 639-1, ISO 639-2, Language Name
    and Native Name.

    Args:
        url: Address of the language table to download.

    Returns:
        A list of ``(code, "code - Language Name")`` tuples, where ``code``
        is the whitespace-stripped ISO 639-1 value. An empty list is
        returned when the request fails or the status code is not 200.
    """
    try:
        # A timeout keeps a stalled server from hanging application start-up.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Same fallback as a non-200 answer: no language options.
        return []
    if response.status_code != 200:
        return []

    csv_content = response.content.decode('utf-8')
    df = pd.read_csv(
        io.StringIO(csv_content), delimiter="|", skiprows=2, header=None
    ).dropna(axis=1, how='all')
    df.columns = ['ISO 639-1', 'ISO 639-2', 'Language Name', 'Native Name']
    df['ISO 639-1'] = df['ISO 639-1'].str.strip()
    return [
        (code, f"{code} - {name}")
        for code, name in zip(df['ISO 639-1'], df['Language Name'])
    ]
21
+
22
def text_to_srt(text):
    """Convert a bracketed transcription into an SRT file.

    Each useful input line looks like ``[start -> end] caption``. Timestamps
    with a single ":" (``MM:SS.mmm``) get a ``00:`` hour prefix, and "."
    is replaced by "," as the SRT millisecond separator. Blank lines and
    lines that do not match the expected shape are skipped.

    Args:
        text: Transcription text, one cue per line. ``None`` is treated as
            empty text.

    Returns:
        The path of the written SRT file (``/tmp/output.srt``).
    """
    cues = []
    for raw_line in (text or "").split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        try:
            times, content = line.split(']', 1)
            start, end = times[1:].split(' -> ')
        except ValueError:
            # Not a "[start -> end] caption" line; ignore it.
            continue
        start = start.strip()
        end = end.strip()
        if start.count(":") == 1:
            start = "00:" + start
        if end.count(":") == 1:
            end = "00:" + end
        # Number cues by how many were emitted, not by the input line index,
        # so skipped lines do not leave gaps in the SRT sequence.
        index = len(cues) + 1
        cues.append(
            f"{index}\n{start.replace('.', ',')} --> {end.replace('.', ',')}\n{content.strip()}\n\n"
        )

    temp_file_path = '/tmp/output.srt'
    with open(temp_file_path, 'w', encoding='utf-8') as file:
        file.write("".join(cues))
    return temp_file_path
42
+
43
# Loaded (tokenizer, model) pairs keyed by model name. Bounded so that a
# long-running process does not keep every requested language pair in memory.
_TRANSLATION_MODELS = {}
_MAX_CACHED_TRANSLATION_MODELS = 2


def translate_text(text, source_language_code, target_language_code):
    """Translate ``text`` with a Helsinki-NLP opus-mt Marian model.

    The model name is built as
    ``Helsinki-NLP/opus-mt-{source_language_code}-{target_language_code}``.
    Loaded models are cached so that repeated calls for the same language
    pair (one per subtitle) do not reload the tokenizer and the weights.

    Args:
        text: Text to translate. Empty or whitespace-only text is returned
            unchanged without loading a model.
        source_language_code: Language code used as the source part of the
            model name.
        target_language_code: Language code used as the target part of the
            model name.

    Returns:
        The decoded translation, or a ``"Failed to load model ..."`` message
        when the tokenizer or model cannot be loaded.
    """
    if not text or not text.strip():
        return text

    model_name = f"Helsinki-NLP/opus-mt-{source_language_code}-{target_language_code}"
    cached = _TRANSLATION_MODELS.get(model_name)
    if cached is None:
        try:
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
        except Exception as e:
            return f"Failed to load model for {source_language_code} to {target_language_code}: {str(e)}"
        if len(_TRANSLATION_MODELS) >= _MAX_CACHED_TRANSLATION_MODELS:
            # Dicts keep insertion order, so this evicts the oldest entry.
            _TRANSLATION_MODELS.pop(next(iter(_TRANSLATION_MODELS)))
        cached = (tokenizer, model)
        _TRANSLATION_MODELS[model_name] = cached

    tokenizer, model = cached
    batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    translated = model.generate(**batch)
    return tokenizer.decode(translated[0], skip_special_tokens=True)
53
+
54
def translate_srt(input_file, source_language_code, target_language_code):
    """Translate every cue of an SRT file and save the result next to it.

    Args:
        input_file: Path of the SRT file to read.
        source_language_code: Source language code passed to
            ``translate_text``.
        target_language_code: Target language code passed to
            ``translate_text``; also used as the output file-name suffix.

    Returns:
        The path of the translated file: the input path with
        ``_{target_language_code}`` inserted before its extension
        (``.srt`` is appended when the input has no extension).
    """
    subs = pysrt.open(input_file)
    for sub in subs:
        sub.text = translate_text(sub.text, source_language_code, target_language_code)

    # Split on the real extension: a plain str.replace(".srt", ...) would also
    # rewrite ".srt" inside directory names and would return the input path
    # itself (overwriting it) when the suffix is absent.
    root, ext = os.path.splitext(input_file)
    translated_srt_path = f"{root}_{target_language_code}{ext or '.srt'}"
    subs.save(translated_srt_path)
    return translated_srt_path
61
+
62
def add_subtitle_to_video(input_video, subtitle_file, soft_subtitle=True):
    """Run ffmpeg to attach ``subtitle_file`` to ``input_video``.

    Args:
        input_video: Path of the source video.
        subtitle_file: Path of the subtitle file.
        soft_subtitle: When true, the subtitle file is passed as a second
            input with the options ``c=copy`` and ``c:s=mov_text``.
            Otherwise the video is output with a ``subtitles=`` video filter
            pointing at the subtitle file.

    Returns:
        The output path, ``/tmp/<input base name>_subtitled.mp4``.
    """
    video_in = ffmpeg.input(input_video)
    subtitle_in = ffmpeg.input(subtitle_file)

    base_name, _extension = os.path.splitext(os.path.basename(input_video))
    output_video = f"/tmp/{base_name}_subtitled.mp4"

    if not soft_subtitle:
        stream = ffmpeg.output(video_in, output_video, vf=f"subtitles={subtitle_file}")
    else:
        codec_options = {"c": "copy", "c:s": "mov_text"}
        stream = ffmpeg.output(video_in, subtitle_in, output_video, **codec_options)

    # An existing file at the output path is overwritten.
    ffmpeg.run(stream, overwrite_output=True)
    return output_video
81
+
82
def _language_code(choice):
    """Return the leading language code of a dropdown value.

    Accepts both a bare code (``"en"``) and a ``"en - English"`` style label.
    """
    return str(choice).split(" - ", 1)[0].strip()


def process_video(input_video, text_transcription, video_language, target_language):
    """Build a subtitled video from a transcription.

    The transcription is converted to SRT, translated from
    ``video_language`` to ``target_language``, and attached to the video.

    Args:
        input_video: The uploaded video, either a path string or an object
            exposing the path as ``.name``.
        text_transcription: Transcription text in ``[start -> end] caption``
            lines.
        video_language: Dropdown value for the source language.
        target_language: Dropdown value for the target language.

    Returns:
        The path of the subtitled video, or ``None`` when the video or one
        of the languages has not been provided yet.
    """
    # The change event can fire before every input is filled in (or when the
    # video is cleared); there is nothing meaningful to produce in that case.
    if input_video is None or not video_language or not target_language:
        return None

    # NOTE(review): depending on the Gradio component/version the video may
    # arrive as a plain path string rather than a file wrapper with `.name`.
    video_path = getattr(input_video, "name", input_video)

    # Gradio documents tuple choices as (label, value), so the value may
    # arrive as "en - English"; reduce it to the bare code either way.
    source_code = _language_code(video_language)
    target_code = _language_code(target_language)

    srt_path = text_to_srt(text_transcription)
    translated_srt_path = translate_srt(srt_path, source_code, target_code)
    return add_subtitle_to_video(video_path, translated_srt_path)
87
+
88
# Language table used to populate both dropdowns; fetched once at start-up.
language_url = "https://huggingface.co/Lenylvt/LanguageISO/resolve/main/iso.md"
video_language_options = fetch_languages(language_url)

# NOTE(review): the nesting below is reconstructed; all five components are
# assumed to sit in the single Row, with the event wired inside the Blocks
# context. Confirm against the deployed layout.
with gr.Blocks() as app:
    with gr.Row():
        input_video = gr.Video(label="Video File")
        text_transcription = gr.TextArea(label="Text Transcription")
        video_language = gr.Dropdown(
            choices=video_language_options,
            label="Language of the Video",
        )
        target_language = gr.Dropdown(
            choices=video_language_options,
            label="Language Translated",
        )
        output_video = gr.Video(label="Video with Translated Subtitles")

    # Processing is triggered whenever the input video changes.
    input_video.change(
        fn=process_video,
        inputs=[input_video, text_transcription, video_language, target_language],
        outputs=output_video,
    )

app.launch()