Artificial-superintelligence committed on
Commit
4b6f416
·
verified ·
1 Parent(s): dc36981

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +345 -117
app.py CHANGED
@@ -1,126 +1,354 @@
1
import streamlit as st
from moviepy.editor import VideoFileClip, AudioFileClip
import whisper
from translate import Translator
from gtts import gTTS
import tempfile
import os
import numpy as np
import time

# Load the Whisper speech-to-text model once at startup.
try:
    whisper_model = whisper.load_model("base")
except Exception as e:
    st.error(f"Error loading Whisper model: {e}")
    st.stop()  # FIX: nothing below can work without a model, so abort here

# Display name -> ISO 639-1 code of supported translation targets.
LANGUAGES = {
    'English': 'en',
    'Tamil': 'ta',
    'Sinhala': 'si',
    'French': 'fr',  # Add more languages as needed
}


def transcribe_audio_in_chunks(audio_path, model, chunk_length=30):
    """Transcribe ``audio_path`` with Whisper in ``chunk_length``-second pieces.

    Chunking bounds memory use on long videos.  Returns the concatenated
    transcription text of all chunks.
    """
    audio_clip = whisper.load_audio(audio_path)
    audio_duration = len(audio_clip) / whisper.audio.SAMPLE_RATE  # seconds
    segments = []
    for start in np.arange(0, audio_duration, chunk_length):
        end = min(start + chunk_length, audio_duration)
        segment = audio_clip[int(start * whisper.audio.SAMPLE_RATE):int(end * whisper.audio.SAMPLE_RATE)]
        result = model.transcribe(segment)
        segments.append(result['text'])
    return ' '.join(segments)


def translate_in_chunks(text, translator, max_length=500):
    """Translate ``text`` in pieces of at most ``max_length`` characters.

    The ``translate`` backend rejects long inputs, so the text is split on
    word boundaries, translated chunk by chunk, and re-joined.
    """
    words = text.split()
    chunks = []
    current_chunk = ""
    for word in words:
        if len(current_chunk) + len(word) + 1 <= max_length:
            current_chunk += " " + word if current_chunk else word
        else:
            chunks.append(current_chunk)
            current_chunk = word
    if current_chunk:
        chunks.append(current_chunk)
    translated_chunks = [translator.translate(chunk) for chunk in chunks]
    return ' '.join(translated_chunks)


st.title("AI Video Translator with Whisper and GTTS")

# Step 1: Upload video file
video_file = st.file_uploader("Upload a video file", type=["mp4", "mov", "avi", "mkv"])

if video_file:
    # Step 2: Select translation language
    target_language = st.selectbox("Select the target language for translation", list(LANGUAGES.keys()))

    # Process when user clicks translate
    if st.button("Translate Video"):
        # FIX: all temp paths are pre-declared so the finally-block below can
        # clean up even when a step fails half-way (the original leaked the
        # TTS/output files on partial failure, and used race-prone mktemp()).
        temp_video_path = None
        audio_path = None
        translated_audio_path = None
        final_video_path = None
        try:
            # Persist the upload to disk so moviepy can open it.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as temp_video:
                temp_video.write(video_file.read())
                temp_video_path = temp_video.name

            # Extract audio from video
            try:
                video = VideoFileClip(temp_video_path)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                    audio_path = f.name
                video.audio.write_audiofile(audio_path)
            except Exception as e:
                st.error(f"Error extracting audio from video: {e}")
                st.stop()  # cleanup happens in the finally-block

            # Transcribe audio using Whisper
            original_text = transcribe_audio_in_chunks(audio_path, whisper_model)
            st.write("Original Transcription:", original_text)

            # Translate text to the target language
            translator = Translator(to_lang=LANGUAGES[target_language])
            translated_text = translate_in_chunks(original_text, translator)
            st.write(f"Translated Text ({target_language}):", translated_text)

            # Convert translated text to speech
            tts = gTTS(text=translated_text, lang=LANGUAGES[target_language])
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
                translated_audio_path = f.name
            tts.save(translated_audio_path)

            # Merge translated audio with the original video.
            # FIX: reuse the clip opened above instead of re-opening the file.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as f:
                final_video_path = f.name
            translated_audio = AudioFileClip(translated_audio_path)
            final_video = video.set_audio(translated_audio)
            final_video.write_videofile(final_video_path, codec='libx264', audio_codec='aac')

            # Display success message and provide download link
            st.success("Translation successful! Download your translated video below:")
            st.video(final_video_path)
            with open(final_video_path, "rb") as f:
                st.download_button("Download Translated Video", f, file_name="translated_video.mp4")
        except Exception as e:
            st.error(f"Error during transcription/translation: {e}")
        finally:
            # Remove every temp file that was actually created.
            for path in (temp_video_path, audio_path, translated_audio_path, final_video_path):
                if path and os.path.exists(path):
                    os.remove(path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import io  # FIX: used by AudioSegment.from_wav(io.BytesIO(...)) but was never imported
import json
import os
import tempfile
from datetime import timedelta

# Third-party
import numpy as np
import speech_recognition as sr
import streamlit as st
import whisper
import azure.cognitiveservices.speech as speechsdk
import indic_transliteration
from gtts import gTTS
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
from moviepy.editor import VideoFileClip, AudioFileClip, TextClip, CompositeVideoClip
from pydub import AudioSegment
from translate import Translator
17
+
18
def _voice(gender, style):
    """Build one Tamil voice configuration entry (all voices are adult)."""
    return {'gender': gender, 'age': 'adult', 'style': style}


# UI label -> TTS voice attributes for the Tamil speakers on offer.
TAMIL_VOICES = {
    'Female 1': _voice('female', 'normal'),
    'Female 2': _voice('female', 'formal'),
    'Male 1': _voice('male', 'normal'),
    'Male 2': _voice('male', 'formal'),
}

# Romanized stand-ins for Tamil characters that TTS engines mispronounce.
TAMIL_PRONUNCIATIONS = {
    'zh': 'l',  # special Tamil character ழ
    'L': 'l',   # special Tamil character ள
    'N': 'n',   # special Tamil character ண
    'R': 'r',   # special Tamil character ற
}
33
 
34
class TamilTextProcessor:
    """Stateless text-normalization helpers for Tamil TTS input."""

    # Tamil numerals mapped onto their ASCII digit equivalents.
    _DIGIT_MAP = str.maketrans('௦௧௨௩௪௫௬௭௮௯', '0123456789')

    @staticmethod
    def normalize_tamil_text(text):
        """Normalize Tamil text for better pronunciation.

        Converts Tamil numerals to ASCII digits and rewrites character
        combinations the TTS engines stumble over.
        """
        normalized = text.translate(TamilTextProcessor._DIGIT_MAP)
        # Replace a complex character combination with a pronounceable one.
        return normalized.replace('ஜ்ஞ', 'க்ய')

    @staticmethod
    def split_tamil_sentences(text):
        """Split Tamil text at natural sentence boundaries.

        Sentence-ending punctuation (Indic danda/double danda plus . ! ?)
        closes a sentence; any trailing remainder becomes a final sentence.
        """
        endings = frozenset('।.!?॥')
        sentences = []
        buffer = []
        for ch in text:
            buffer.append(ch)
            if ch in endings:
                sentences.append(''.join(buffer).strip())
                buffer = []
        if buffer:
            sentences.append(''.join(buffer).strip())
        return sentences
66
+
67
class TamilAudioProcessor:
    """Audio post-processing helpers for generated Tamil speech (pydub segments)."""

    @staticmethod
    def adjust_tamil_audio(audio_segment):
        """Adjust audio characteristics for Tamil speech.

        Band-passes the signal (80 Hz – 8 kHz) to sharpen consonants, then
        slows playback ~5% for comprehension.
        """
        # Enhance clarity of Tamil consonants.
        enhanced_audio = audio_segment.high_pass_filter(80)
        enhanced_audio = enhanced_audio.low_pass_filter(8000)

        # FIX: pydub's speedup() only supports playback_speed > 1.0, so the
        # original speedup(playback_speed=0.95) never slowed the audio down.
        # Slow by resampling: lower the frame rate 5%, then restore the rate
        # (note this slightly lowers pitch as a side effect).
        slowed = enhanced_audio._spawn(
            enhanced_audio.raw_data,
            overrides={'frame_rate': int(enhanced_audio.frame_rate * 0.95)},
        )
        return slowed.set_frame_rate(enhanced_audio.frame_rate)

    @staticmethod
    def match_emotion(audio_segment, emotion_type):
        """Adjust gain/filtering based on emotional context.

        Unknown/'neutral' emotions pass the segment through unchanged.
        """
        if emotion_type == 'happy':
            return audio_segment.apply_gain(2).high_pass_filter(100)
        elif emotion_type == 'sad':
            return audio_segment.apply_gain(-1).low_pass_filter(3000)
        elif emotion_type == 'angry':
            return audio_segment.apply_gain(4).high_pass_filter(200)
        return audio_segment
90
+
91
class TamilVideoDubber:
    """Pipeline object: transcribe a video, translate to Tamil, synthesize speech.

    Use as a context manager so temporary files are removed on exit.  Azure
    neural TTS is used when credentials are supplied; otherwise gTTS.
    """

    def __init__(self, azure_key=None, azure_region=None):
        # Whisper "base" model provides transcription plus segment timestamps.
        self.whisper_model = whisper.load_model("base")
        self.temp_files = []  # paths created via create_temp_file(); removed by cleanup()
        self.azure_key = azure_key
        self.azure_region = azure_region

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()

    def cleanup(self):
        """Remove every temporary file this instance created."""
        for temp_file in self.temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)

    def create_temp_file(self, suffix):
        """Create a tracked temporary file and return its path.

        FIX: tempfile.mktemp() is deprecated and race-prone (the returned
        name can be claimed by another process before use); mkstemp()
        creates the file atomically.  The descriptor is closed so callers
        can reopen/overwrite the path freely.
        """
        fd, temp_file = tempfile.mkstemp(suffix=suffix)
        os.close(fd)
        self.temp_files.append(temp_file)
        return temp_file

    def extract_audio_segments(self, video_path):
        """Transcribe ``video_path`` and return ``(segments, video_duration)``.

        Each segment is a dict with text/start/end/duration plus a coarse
        keyword-based "emotion" label used to steer TTS prosody.
        """
        # FIX: the clip is needed only for its duration — close it instead of
        # leaking the reader VideoFileClip holds open.
        video = VideoFileClip(video_path)
        try:
            video_duration = video.duration
        finally:
            video.close()

        result = self.whisper_model.transcribe(video_path)
        segments = [
            {
                "text": seg["text"],
                "start": seg["start"],
                "end": seg["end"],
                "duration": seg["end"] - seg["start"],
                "emotion": self.detect_emotion(seg["text"]),
            }
            for seg in result["segments"]
        ]
        return segments, video_duration

    def detect_emotion(self, text):
        """Return 'happy', 'sad', 'angry' or 'neutral' via keyword matching.

        English and Tamil keywords are checked; the first matching category
        wins in the order happy -> sad -> angry.
        """
        happy_words = ['happy', 'joy', 'laugh', 'smile', 'மகிழ்ச்சி']
        sad_words = ['sad', 'sorry', 'cry', 'வருத்தம்']
        angry_words = ['angry', 'hate', 'கோபம்']

        text_lower = text.lower()
        if any(word in text_lower for word in happy_words):
            return 'happy'
        elif any(word in text_lower for word in sad_words):
            return 'sad'
        elif any(word in text_lower for word in angry_words):
            return 'angry'
        return 'neutral'

    def translate_to_tamil(self, text):
        """Translate ``text`` to Tamil and normalize it for TTS pronunciation."""
        translator = Translator(to_lang='ta')
        translated = translator.translate(text)
        return TamilTextProcessor.normalize_tamil_text(translated)

    def generate_tamil_audio(self, text, voice_config, emotion='neutral'):
        """Return a pydub AudioSegment speaking ``text`` in Tamil.

        Azure neural TTS when credentials were provided, else gTTS fallback.
        ``voice_config`` is one of the TAMIL_VOICES entries (currently only
        consumed by the Azure path's SSML prosody).
        """
        if self.azure_key and self.azure_region:
            return self._generate_azure_tamil_audio(text, voice_config, emotion)
        return self._generate_gtts_tamil_audio(text, emotion)

    def _generate_azure_tamil_audio(self, text, voice_config, emotion):
        """Synthesize Tamil speech via Azure Cognitive Services with SSML prosody."""
        speech_config = speechsdk.SpeechConfig(
            subscription=self.azure_key, region=self.azure_region)

        # Configure the Tamil neural voice.
        speech_config.speech_synthesis_voice_name = "ta-IN-PallaviNeural"
        speech_synthesizer = speechsdk.SpeechSynthesizer(
            speech_config=speech_config)

        # SSML lets us vary rate/pitch per detected emotion.
        ssml_text = f"""
        <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis">
            <voice name="ta-IN-PallaviNeural">
                <prosody rate="{self._get_emotion_rate(emotion)}"
                         pitch="{self._get_emotion_pitch(emotion)}">
                    {text}
                </prosody>
            </voice>
        </speak>
        """

        result = speech_synthesizer.speak_ssml_async(ssml_text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            # FIX: the original referenced io.BytesIO without importing io
            # (NameError); requires the module-level ``io`` import.
            return AudioSegment.from_wav(io.BytesIO(result.audio_data))
        raise Exception("Speech synthesis failed")

    def _generate_gtts_tamil_audio(self, text, emotion):
        """Fallback Tamil TTS via gTTS, with emotion-based post-processing."""
        temp_path = self.create_temp_file(".mp3")
        tts = gTTS(text=text, lang='ta')
        tts.save(temp_path)

        audio = AudioSegment.from_mp3(temp_path)
        # Apply emotion-based gain/filter adjustments.
        return TamilAudioProcessor.match_emotion(audio, emotion)

    @staticmethod
    def _get_emotion_rate(emotion):
        """SSML speech rate for an emotion label (default '1.0')."""
        rates = {'happy': '1.1', 'sad': '0.9', 'angry': '1.2', 'neutral': '1.0'}
        return rates.get(emotion, '1.0')

    @staticmethod
    def _get_emotion_pitch(emotion):
        """SSML pitch shift for an emotion label (default '0st')."""
        pitches = {'happy': '+1st', 'sad': '-1st', 'angry': '+2st', 'neutral': '0st'}
        return pitches.get(emotion, '0st')
224
+
225
def main():
    """Streamlit entry point: upload a video, dub it into Tamil, offer a download."""
    st.title("Tamil Movie Dubbing System")
    st.sidebar.header("Settings")

    # Video upload
    video_file = st.file_uploader("Upload your video", type=['mp4', 'mov', 'avi'])
    if not video_file:
        return

    # Voice selection
    selected_voice = st.selectbox("Select Tamil voice", list(TAMIL_VOICES.keys()))

    # Advanced settings
    with st.expander("Advanced Settings"):
        generate_subtitles = st.checkbox("Generate Tamil subtitles", value=True)
        adjust_audio = st.checkbox("Enhance Tamil audio clarity", value=True)
        emotion_detection = st.checkbox("Enable emotion detection", value=True)

        # Tamil font selection for subtitles
        tamil_fonts = ["Latha", "Vijaya", "Mukta Malar"]
        selected_font = st.selectbox("Select Tamil font", tamil_fonts)

        # Audio enhancement options.
        # NOTE(review): these slider values are not yet wired into the
        # processing pipeline — only the adjust_audio checkbox is used.
        if adjust_audio:
            clarity_level = st.slider("Audio clarity level", 1, 5, 3)
            bass_boost = st.slider("Bass boost", 0, 100, 50)

    if st.button("Start Tamil Dubbing"):
        with st.spinner("Processing your video..."):
            try:
                with TamilVideoDubber() as dubber:
                    # Save the uploaded video to a temp file moviepy can open.
                    temp_video_path = dubber.create_temp_file(".mp4")
                    with open(temp_video_path, "wb") as f:
                        f.write(video_file.read())

                    # Process video with progress tracking
                    progress_bar = st.progress(0)
                    status_text = st.empty()

                    # Extract and analyze segments
                    status_text.text("Analyzing video...")
                    segments, duration = dubber.extract_audio_segments(
                        temp_video_path)
                    progress_bar.progress(0.25)

                    # Translation and audio generation
                    status_text.text("Generating Tamil audio...")
                    final_audio = AudioSegment.empty()

                    for i, segment in enumerate(segments):
                        # Translate once and cache on the segment dict; the
                        # subtitle pass below reuses it (the original called
                        # translate_to_tamil twice per segment — each call is
                        # a network round-trip).
                        tamil_text = dubber.translate_to_tamil(segment["text"])
                        segment["tamil_text"] = tamil_text

                        # Generate Tamil audio for this segment.
                        segment_audio = dubber.generate_tamil_audio(
                            tamil_text,
                            TAMIL_VOICES[selected_voice],
                            segment["emotion"] if emotion_detection else 'neutral'
                        )

                        # Apply audio enhancements.
                        if adjust_audio:
                            segment_audio = TamilAudioProcessor.adjust_tamil_audio(
                                segment_audio)

                        # Pad with silence up to this segment's start time so
                        # the dub stays roughly in sync with the video.
                        if len(final_audio) < segment["start"] * 1000:
                            silence_duration = (segment["start"] * 1000 -
                                                len(final_audio))
                            final_audio += AudioSegment.silent(
                                duration=silence_duration)

                        final_audio += segment_audio

                        # Update progress (25% -> 75% across the segments).
                        progress_bar.progress(0.25 + (0.5 * (i + 1) /
                                                      len(segments)))

                    # Generate the final video (optionally with subtitles).
                    status_text.text("Creating final video...")
                    output_path = dubber.create_temp_file(".mp4")

                    # FIX: AudioFileClip requires a file path; the original
                    # passed the pydub AudioSegment object directly, which
                    # fails at runtime.  Export the mixed audio first.
                    dubbed_audio_path = dubber.create_temp_file(".wav")
                    final_audio.export(dubbed_audio_path, format="wav")

                    video = VideoFileClip(temp_video_path)
                    video = video.set_audio(AudioFileClip(dubbed_audio_path))

                    if generate_subtitles:
                        # Overlay one timed Tamil subtitle clip per segment.
                        subtitle_clips = []
                        for segment in segments:
                            subtitle_clip = TextClip(
                                segment["tamil_text"],  # cached translation
                                fontsize=24,
                                font=selected_font,
                                color='white',
                                stroke_color='black',
                                stroke_width=1
                            )
                            subtitle_clip = subtitle_clip.set_position(
                                ('center', 'bottom')
                            ).set_duration(
                                segment["end"] - segment["start"]
                            ).set_start(segment["start"])
                            subtitle_clips.append(subtitle_clip)

                        video = CompositeVideoClip([video] + subtitle_clips)

                    # Write final video
                    video.write_videofile(output_path, codec='libx264',
                                          audio_codec='aac')
                    progress_bar.progress(1.0)

                    # Display result
                    st.success("Tamil dubbing completed!")
                    st.video(output_path)

                    # Provide download button
                    with open(output_path, "rb") as f:
                        st.download_button(
                            "Download Tamil Dubbed Video",
                            f,
                            file_name="tamil_dubbed_video.mp4"
                        )

            except Exception as e:
                st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()