# Hugging Face Spaces status when this file was captured: Runtime error
import os | |
# Accept the Coqui model terms of service non-interactively. Coqui TTS's model
# manager looks for COQUI_TOS_AGREED == "1"; without it the XTTS download asks
# for confirmation on stdin, which cannot be answered in a headless deployment.
os.environ["COQUI_TOS_AGREED"] = "1"
import gradio as gr | |
import torch | |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
from TTS.api import TTS | |
# Languages supported by both Whisper and XTTS, as display name -> language code.
# Insertion order is the order in which the entries appear in the UI dropdowns.
languages = dict(
    [
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Polish", "pl"),
        ("Turkish", "tr"),
        ("Russian", "ru"),
        ("Dutch", "nl"),
        ("Czech", "cs"),
        ("Arabic", "ar"),
        ("Chinese", "zh-cn"),
        ("Japanese", "ja"),
        ("Hungarian", "hu"),
        ("Korean", "ko"),
        ("Hindi", "hi"),
    ]
)
# Model and device configuration.
# Use the first CUDA GPU with half precision when one is available; otherwise
# run on the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Identifiers of the pretrained checkpoints loaded below.
whisper_model_id = "openai/whisper-small"
tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"  # Replace with your actual TTS model
# Load the Whisper model once at import time and move it to the selected
# device; create_translate_pipeline() reuses this single instance.
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor supplies the tokenizer and feature extractor for the pipeline.
whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

# Load the TTS model (text-to-speech) and place it on the same device as
# Whisper. Without the explicit .to(device) the model is left wherever the
# TTS constructor puts it, which ignores the `device` computed above.
tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)
# Translation Pipeline
def create_translate_pipeline(target_language):
    """Build a Whisper speech-recognition pipeline that decodes in one language.

    The pipeline wraps the module-level ``whisper_model`` and
    ``whisper_processor``, so calling this does not reload any weights.

    NOTE(review): despite the function name, the generation task is
    "transcribe" with the decoder language forced to ``target_language``;
    Whisper's own "translate" task is not requested. Confirm this is intended.

    Args:
        target_language: Language for Whisper's decoder, given as a name
            ("English") or a code ("en"). Surrounding whitespace is removed
            and the value is lower-cased before it is handed to Whisper.
            ``None`` or a blank string is passed on as ``None``.

    Returns:
        A transformers "automatic-speech-recognition" pipeline whose result
        dict contains the recognized text under the "text" key.
    """
    # The UI supplies capitalized display names ("English"); Whisper's
    # documented language forms are lower-case, so normalize here rather than
    # relying on the installed transformers version to do it.
    if isinstance(target_language, str):
        language = target_language.strip().lower() or None
    else:
        language = target_language

    return pipeline(
        "automatic-speech-recognition",
        model=whisper_model,
        tokenizer=whisper_processor.tokenizer,
        feature_extractor=whisper_processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=1,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps=True,
        generate_kwargs={"task": "transcribe", "language": language},
    )
# Audio Processing Function
def process_audio(audio_file, translate_language, tts_language):
    """Recognize speech in an audio file and re-synthesize the text with TTS.

    Args:
        audio_file: Path of the input recording. It is transcribed by the
            Whisper pipeline and also passed to TTS as ``speaker_wav``.
        translate_language: Language forwarded to
            ``create_translate_pipeline``.
        tts_language: Language passed to ``tts.tts_to_file``.

    Returns:
        ``(text, wav_path)`` on success, where ``wav_path`` is a newly created
        temporary ``.wav`` file. On any failure the tuple is
        ``("An error occurred: <reason>", None)`` so the UI can show the
        message in the text box.
    """
    import os
    import tempfile

    # Reject incomplete input up front with a clear message instead of
    # surfacing whatever exception the models would raise.
    if not audio_file:
        return "An error occurred: no audio was provided.", None
    if not tts_language:
        return "An error occurred: no TTS synthesis language was selected.", None

    output_audio_file = None
    try:
        # Create translation pipeline
        translate_pipeline = create_translate_pipeline(translate_language)
        # Transcribe and translate
        result = translate_pipeline(audio_file)["text"]

        # Generate synthesized speech into a file unique to this call; a
        # fixed name such as "output.wav" would be overwritten by every
        # other request.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_audio_file = handle.name
        tts.tts_to_file(
            result,
            speaker_wav=audio_file,
            language=tts_language,
            file_path=output_audio_file,
        )
        return result, output_audio_file
    except Exception as e:
        # UI boundary: report the failure in the text box rather than
        # raising, and do not leave a partial output file behind.
        if output_audio_file and os.path.exists(output_audio_file):
            os.remove(output_audio_file)
        return f"An error occurred: {e}", None
# Gradio Interface
# Locate the logo next to this script first so the app does not depend on one
# machine's directory layout; the original developer path is kept as a
# fallback. If neither file exists the logo is simply left out.
_logo_candidates = (
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "logo_transparent_background.png",
    ),
    "/Users/mac/Desktop/VOX_AI/logo_transparent_background.png",
)
_logo_path = next((path for path in _logo_candidates if os.path.isfile(path)), None)

with gr.Blocks() as interface:
    gr.Markdown("# AI VOX LAB POC")
    gr.Markdown("Upload/record audio, translate, and get synthesized speech!")

    # Add the image here (only when a logo file was found)
    if _logo_path is not None:
        gr.Image(value=_logo_path, label="App Logo", show_label=False, width=700, height=250)

    # Inputs: the recording plus the two language selections. The first
    # dropdown lists language names, the second lists language codes.
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
        translate_lang = gr.Dropdown(choices=list(languages.keys()), label="Translation Language")
        tts_lang = gr.Dropdown(choices=list(languages.values()), label="TTS Synthesis Language")

    with gr.Row():
        translate_button = gr.Button("Translate and Synthesize")

    # Outputs: recognized text and the synthesized audio.
    with gr.Row():
        text_output = gr.Textbox(label="Translated Text")
        audio_output = gr.Audio(label="Generated Audio")

    # Wire the button to process_audio; inputs and outputs are positional.
    translate_button.click(
        fn=process_audio,
        inputs=[audio_input, translate_lang, tts_lang],
        outputs=[text_output, audio_output],
    )
# Launch the App
# Only start the server when this file is executed directly, not on import.
if __name__ == "__main__":
    launch_options = {"share": True}
    interface.launch(**launch_options)