"""Gradio demo: transcribe a voice recording, translate it, and voice each translation.

Pipeline: AssemblyAI transcribes the uploaded audio, the ``translate`` package
translates the English transcript into every language in ``TARGET_LANGUAGES``,
and ElevenLabs synthesizes one MP3 per translation.
"""

import os
import uuid
from pathlib import Path

import assemblyai as aai
import gradio as gr
import numpy as np
from elevenlabs import VoiceSettings
from elevenlabs.client import ElevenLabs
from translate import Translator

ELEVENLABS_API = os.environ.get("ELEVENLABS_API")
ASSEMBLYAI_API = os.environ.get("ASSEMBLYAI_API")

# Single source of truth for the target languages as (translation code, UI label)
# pairs. Both the translation step and the UI iterate this list, so the n-th
# translation always lands under the n-th label.
TARGET_LANGUAGES = [
    ("ru", "Russian"),
    ("tr", "Turkish"),
    ("sv", "Swedish"),
    ("de", "German"),
    ("es", "Spanish"),
    ("ja", "Japanese"),
    ("id", "indonesian"),
]

# Help text shown at the top of the page (Markdown link syntax is [text](url)).
API_KEY_NOTES = """
The API Key you need:
[AssemblyAI API key](https://www.assemblyai.com/?utm_source=youtube&utm_medium=referral&utm_campaign=yt_mis_66)
[Elevenlabs API key](https://elevenlabs.io/)
Note: you need at least 30 minutes of a voice recording of yourself for the *Professional voice cloning. But there is also a simpler voice cloning option that only requires 30 seconds of voice recording. *Professional voice cloning is a paid feature.
"""


def voice_to_voice(audio_file):
    """Transcribe ``audio_file``, translate the text, and synthesize each translation.

    Args:
        audio_file: Path of the recorded/uploaded audio, as supplied by the
            ``gr.Audio(type="filepath")`` input. ``None`` when nothing was provided.

    Returns:
        A flat tuple holding first every generated audio ``Path`` and then every
        translated string, both in ``TARGET_LANGUAGES`` order.

    Raises:
        gr.Error: If no audio was provided or the transcription reported an error.
    """
    if audio_file is None:
        raise gr.Error("Please record or upload an audio clip before submitting.")

    transcript = transcribe_audio(audio_file)
    if transcript.status == aai.TranscriptStatus.error:
        raise gr.Error(transcript.error)

    list_translations = translate_text(transcript.text)
    generated_audio_paths = [
        Path(text_to_speech(translation)) for translation in list_translations
    ]
    return tuple(generated_audio_paths + list_translations)


def transcribe_audio(audio_file):
    """Transcribe ``audio_file`` with AssemblyAI and return the transcript object.

    The caller is expected to inspect ``status`` / ``error`` / ``text`` on the result.
    """
    # AssemblyAI must be authenticated with its own key, not the ElevenLabs one.
    aai.settings.api_key = ASSEMBLYAI_API
    transcriber = aai.Transcriber()
    return transcriber.transcribe(audio_file)


def translate_text(text):
    """Translate English ``text`` into every language in ``TARGET_LANGUAGES``.

    Returns:
        A list of translated strings, in ``TARGET_LANGUAGES`` order.
    """
    return [
        Translator(from_lang="en", to_lang=code).translate(text)
        for code, _label in TARGET_LANGUAGES
    ]


def text_to_speech(text):
    """Synthesize ``text`` with ElevenLabs and save it as an MP3 file.

    Returns:
        The path (a string, relative to the current working directory) of the
        newly written ``<uuid4>.mp3`` file.
    """
    client = ElevenLabs(api_key=ELEVENLABS_API)
    response = client.text_to_speech.convert(
        # NOTE: placeholder -- fill in the ID of the (cloned) voice to use.
        voice_id="",
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        model_id="eleven_multilingual_v2",
        voice_settings=VoiceSettings(
            stability=0.5,
            similarity_boost=0.8,
            style=0.5,
            use_speaker_boost=True,
        ),
    )

    # A random file name keeps separate requests from overwriting each other.
    save_file_path = f"{uuid.uuid4()}.mp3"
    with open(save_file_path, "wb") as f:
        # The response is consumed chunk by chunk; empty chunks are skipped.
        for chunk in response:
            if chunk:
                f.write(chunk)
    return save_file_path


with gr.Blocks() as demo:
    gr.Markdown("## audio Translator")
    gr.Markdown(API_KEY_NOTES)

    audio_input = gr.Audio(type="filepath", show_download_button=True)
    submit = gr.Button("Submit", variant="primary")
    clear_button = gr.ClearButton(audio_input, "Clear")

    # Each language gets a visual group containing its audio player and its text.
    audio_outputs = []
    text_outputs = []
    for _code, label in TARGET_LANGUAGES:
        with gr.Group():
            audio_outputs.append(gr.Audio(label=label, interactive=False))
            text_outputs.append(gr.Markdown())

    # voice_to_voice returns all audio paths first and all texts second, and
    # return values are matched to outputs by position, so the flat output list
    # must use that same "audios, then texts" order.
    output_components = audio_outputs + text_outputs

    submit.click(
        fn=voice_to_voice,
        inputs=audio_input,
        outputs=output_components,
        show_progress=True,
    )

if __name__ == "__main__":
    demo.launch()