Spaces:

tbboukhari
/

AIVOXLAB

Runtime error

App Files Files Community

AIVOXLAB / app.py

tbboukhari

Update app.py

1ac7603 verified 3 months ago

raw

history blame contribute delete

3.59 kB

	import os

	# Accept the Coqui model license non-interactively. This must run before
	# the TTS model is instantiated further down in this file.
	# COQUI_TOS_AGREED is the variable Coqui TTS's model manager checks before
	# downloading XTTS; without it the download stops at an interactive y/n
	# prompt, which cannot be answered on a hosted, non-interactive runtime.
	# COQUI_NO_TERMS is kept only so existing behavior is not removed.
	os.environ["COQUI_NO_TERMS"] = "1"
	os.environ["COQUI_TOS_AGREED"] = "1"

	import gradio as gr
	import torch
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	from TTS.api import TTS

	# Display name -> language code table shared by the two dropdowns in the UI:
	# the keys feed the "Translation Language" choices and the values feed the
	# "TTS Synthesis Language" choices. Insertion order is the display order.
	languages = {
		"English": "en",
		"Spanish": "es",
		"French": "fr",
		"German": "de",
		"Italian": "it",
		"Portuguese": "pt",
		"Polish": "pl",
		"Turkish": "tr",
		"Russian": "ru",
		"Dutch": "nl",
		"Czech": "cs",
		"Arabic": "ar",
		"Chinese": "zh-cn",
		"Japanese": "ja",
		"Hungarian": "hu",
		"Korean": "ko",
		"Hindi": "hi",
	}

	# Model and Device Configuration
	# Use the first CUDA GPU with half precision when one is available;
	# otherwise run on the CPU with full precision.
	if torch.cuda.is_available():
		device, torch_dtype = "cuda:0", torch.float16
	else:
		device, torch_dtype = "cpu", torch.float32

	# Identifiers of the speech-recognition and text-to-speech models.
	whisper_model_id = "openai/whisper-small"
	tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2" # Replace with your actual TTS model

	# Load the Whisper processor (its tokenizer and feature extractor are used
	# when the recognition pipeline is built) and the Whisper model itself.
	whisper_processor = AutoProcessor.from_pretrained(whisper_model_id)

	whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
		whisper_model_id,
		torch_dtype=torch_dtype,
		low_cpu_mem_usage=True,
		use_safetensors=True,
	)
	# Move the weights to the configured device (GPU when available).
	whisper_model = whisper_model.to(device)

	# Load TTS Model (for text-to-speech)
	# Move the synthesizer to the same device as Whisper; without the explicit
	# .to(device) the TTS model is never placed on the GPU even when `device`
	# selects one. On a CPU-only host `device` is "cpu" and this is a no-op.
	tts = TTS(model_name=tts_model_name, progress_bar=False).to(device)

	# Translation Pipeline
	def create_translate_pipeline(target_language):
		"""Build a Whisper speech-recognition pipeline for one language.

		The pipeline reuses the module-level ``whisper_model`` together with the
		tokenizer and feature extractor of ``whisper_processor``; nothing is
		loaded from disk here.

		Args:
			target_language: Value forwarded as the ``language`` generation
				argument. Note that the generation task is fixed to
				``"transcribe"``.

		Returns:
			The object returned by ``transformers.pipeline`` for the
			``"automatic-speech-recognition"`` task.
		"""
		generation_options = {"task": "transcribe", "language": target_language}
		pipeline_options = dict(
			model=whisper_model,
			tokenizer=whisper_processor.tokenizer,
			feature_extractor=whisper_processor.feature_extractor,
			max_new_tokens=128,
			chunk_length_s=30,
			batch_size=1,
			torch_dtype=torch_dtype,
			device=device,
			return_timestamps=True,
			generate_kwargs=generation_options,
		)
		return pipeline("automatic-speech-recognition", **pipeline_options)

	# Audio Processing Function
	def process_audio(audio_file, translate_language, tts_language):
		"""Run speech recognition on an audio file and re-synthesize the text.

		Args:
			audio_file: Path of the uploaded/recorded audio. It is fed to the
				recognition pipeline and also passed to the TTS model as the
				``speaker_wav`` reference.
			translate_language: Language forwarded to
				``create_translate_pipeline``.
			tts_language: Language forwarded to ``tts.tts_to_file``.

		Returns:
			``(text, wav_path)`` on success, or ``(error_message, None)`` when
			no audio was supplied or any step raised.
		"""
		# Local imports keep this block self-contained.
		import os
		import tempfile

		# Nothing uploaded or recorded: report it directly instead of letting
		# the pipeline fail with an opaque error.
		if not audio_file:
			return "An error occurred: no audio file was provided.", None

		output_audio_file = None
		try:
			# Create translation pipeline
			translate_pipeline = create_translate_pipeline(translate_language)

			# Transcribe and translate
			result = translate_pipeline(audio_file)["text"]

			# Generate synthesized speech into a file unique to this request.
			# A fixed "output.wav" would be overwritten by concurrent requests.
			fd, output_audio_file = tempfile.mkstemp(suffix=".wav")
			os.close(fd)
			tts.tts_to_file(result, speaker_wav=audio_file, language=tts_language, file_path=output_audio_file)

			return result, output_audio_file

		except Exception as e:
			# UI boundary: surface the failure in the text box rather than crash.
			# Remove the temp file if synthesis did not complete.
			if output_audio_file is not None and os.path.exists(output_audio_file):
				os.remove(output_audio_file)
			return f"An error occurred: {e}", None

	# Gradio Interface
	# The logo used to be referenced only through an absolute path on the
	# author's machine, which does not exist on a deployed host. Look for the
	# file next to this script first, fall back to the original location, and
	# skip the image entirely when neither exists.
	logo_filename = "logo_transparent_background.png"
	logo_candidates = (
		os.path.join(os.path.dirname(os.path.abspath(__file__)), logo_filename),
		"/Users/mac/Desktop/VOX_AI/logo_transparent_background.png",
	)
	logo_path = next((path for path in logo_candidates if os.path.isfile(path)), None)

	with gr.Blocks() as interface:
		gr.Markdown("# AI VOX LAB POC")
		gr.Markdown("Upload/record audio, translate, and get synthesized speech!")

		# Only render the logo when the file is actually present.
		if logo_path is not None:
			gr.Image(value=logo_path, label="App Logo", show_label=False, width=700, height=250)

		# Inputs: the audio plus the two language selectors. The first dropdown
		# lists language names, the second lists language codes.
		with gr.Row():
			audio_input = gr.Audio(type="filepath", label="Upload or Record Audio")
			translate_lang = gr.Dropdown(choices=list(languages.keys()), label="Translation Language")
			tts_lang = gr.Dropdown(choices=list(languages.values()), label="TTS Synthesis Language")

		with gr.Row():
			translate_button = gr.Button("Translate and Synthesize")

		# Outputs: recognized text and the synthesized audio.
		with gr.Row():
			text_output = gr.Textbox(label="Translated Text")
			audio_output = gr.Audio(label="Generated Audio")

		# Wire the button to the processing function.
		translate_button.click(
			fn=process_audio,
			inputs=[audio_input, translate_lang, tts_lang],
			outputs=[text_output, audio_output],
		)

	# Launch the App
	if __name__ == "__main__":
		interface.launch(share=True)