import os

os.environ["COQUI_NO_TERMS"] = "1"  # Accept the Coqui TTS terms of service non-interactively

import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from TTS.api import TTS

# Languages supported by both Whisper and XTTS v2
languages = {
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Polish": "pl",
    "Turkish": "tr",
    "Russian": "ru",
    "Dutch": "nl",
    "Czech": "cs",
    "Arabic": "ar",
    "Chinese": "zh-cn",
    "Japanese": "ja",
    "Hungarian": "hu",
    "Korean": "ko",
    "Hindi": "hi",
}

# Model and device configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
whisper_model_id = "openai/whisper-small"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"

# Load the Whisper model (speech recognition / translation)
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

# Load the XTTS v2 model (voice-cloning text-to-speech)
tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)


def create_translate_pipeline(target_language_code):
    """Build a Whisper ASR pipeline that decodes in the target language.

    Note: Whisper officially supports translation only *into English*
    (task="translate"). Forcing another output language by combining
    task="transcribe" with a non-source language is an unofficial trick,
    and quality varies by language pair.
    """
    return pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=1,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps=True,
        generate_kwargs={"task": "transcribe", "language": target_language_code},
    )


def process_audio(audio_file, translate_language, tts_language):
    """Transcribe/translate the uploaded audio, then speak it back in the caller's voice."""
    try:
        # Both dropdowns pass language *names*; map them to codes here
        translate_code = languages[translate_language]
        tts_code = languages[tts_language]

        # Whisper expects "zh" for Chinese, while XTTS expects "zh-cn"
        whisper_code = "zh" if translate_code == "zh-cn" else translate_code

        # Transcribe and translate
        translate_pipeline = create_translate_pipeline(whisper_code)
        result = translate_pipeline(audio_file)["text"]

        # Synthesize the translated text, cloning the voice from the input audio
        output_audio_file = "output.wav"
        tts.tts_to_file(
            text=result,
            speaker_wav=audio_file,
            language=tts_code,
            file_path=output_audio_file,
        )
        return result, output_audio_file
    except Exception as e:
        return f"An error occurred: {e}", None


# Gradio interface
with gr.Blocks() as interface:
    gr.Markdown("# AI VOX LAB POC")
    gr.Markdown("Upload or record audio, translate it, and get synthesized speech!")

    # App logo (machine-specific path; adjust for your setup)
    gr.Image(
        value="/Users/mac/Desktop/VOX_AI/logo_transparent_background.png",
        label="App Logo",
        show_label=False,
        width=700,
        height=250,
    )

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        translate_lang = gr.Dropdown(choices=list(languages.keys()), label="Translation Language")
        tts_lang = gr.Dropdown(choices=list(languages.keys()), label="TTS Synthesis Language")

    with gr.Row():
        translate_button = gr.Button("Translate and Synthesize")

    with gr.Row():
        text_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Generated Audio")

    translate_button.click(
        fn=process_audio,
        inputs=[audio_input, translate_lang, tts_lang],
        outputs=[text_output, audio_output],
    )

# Launch the app
if __name__ == "__main__":
    interface.launch(share=True)
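
# Quick smoke test without launching the UI (a sketch; "sample.wav" is a
# hypothetical local recording, not part of this repo):
#
#   text, wav_path = process_audio("sample.wav", "Spanish", "Spanish")
#   print(text)      # transcript decoded in Spanish
#   print(wav_path)  # "output.wav", spoken in Spanish in the original speaker's voice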