"""Gradio demo for cascaded speech-to-speech translation (STST).

Input speech is first passed through a Whisper ASR pipeline to obtain text,
and that text is then synthesised with a fine-tuned SpeechT5 text-to-speech
pipeline using a fixed speaker embedding loaded from disk.
"""

import gradio as gr
import numpy as np
import torch
from transformers import pipeline

checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
# Pin the TTS checkpoint to an exact commit so the demo stays reproducible.
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load speech translation checkpoint
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
    chunk_length_s=30,
    use_fast=True,
)

# load text-to-speech checkpoint and speaker embeddings
pipe = pipeline(
    "text-to-speech",
    model=checkpoint_finetuned,
    use_fast=True,
    device=device,
    revision=revision,
)

speaker_embedding_path = "female_23_vestjylland.npy"
speaker_embedding = np.load(speaker_embedding_path)
# Add a leading axis of size 1 in front of the stored embedding.
speaker_embedding_tensor = torch.tensor(speaker_embedding).unsqueeze(0)

# Output sample format; max_range is the scale factor applied to the
# synthesised waveform before casting to target_dtype.
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

# Sampling rate reported alongside the empty clip returned for blank text.
EMPTY_AUDIO_SAMPLING_RATE = 16000

# (source, replacement) pairs applied in order by replace_danish_letters().
replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]


def replace_danish_letters(text: str) -> str:
    """Return *text* with every pair in ``replacements`` applied in order."""
    for src, dst in replacements:
        text = text.replace(src, dst)
    return text


def translate(audio):
    """Run the ASR pipeline on *audio* and return the resulting text.

    *audio* is whatever the Gradio input component hands over; the interface
    below configures it with ``type="filepath"``.
    """
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        batch_size=8,
        generate_kwargs={"task": "translate", "language": "danish"},
    )
    return outputs["text"]


def synthesise(text):
    """Synthesise *text* to speech and return ``(sampling_rate, samples)``.

    Blank (empty or whitespace-only) text yields an empty clip instead of
    invoking the text-to-speech pipeline. Samples are returned as
    ``target_dtype`` on both paths.
    """
    if not text.strip():
        # Keep the dtype identical to the non-empty path.
        # NOTE(review): an empty float array is likely to break peak
        # normalisation in the audio output component - confirm against the
        # installed Gradio version.
        return EMPTY_AUDIO_SAMPLING_RATE, np.zeros(0, dtype=target_dtype)

    text = replace_danish_letters(text)
    forward_params = {"speaker_embeddings": speaker_embedding_tensor}
    speech = pipe(text, forward_params=forward_params)
    sr, audio = speech["sampling_rate"], speech["audio"]

    # Clip before scaling so samples outside [-1, 1] saturate rather than
    # overflow when cast to the integer output type.
    audio = (np.clip(audio, -1.0, 1.0) * max_range).astype(target_dtype)
    return sr, audio


def speech_to_speech_translation(audio):
    """Translate input speech to text, then synthesise that text as speech."""
    translated_text = translate(audio)
    return synthesise(translated_text)


title = "Cascaded STST"
# Assembled from adjacent literals so the exact text (including whitespace)
# handed to Gradio is explicit.
description = (
    " Demo for cascaded speech-to-speech translation (STST), mapping from "
    "source speech in any language to target speech in \n"
    "Danish. Demo uses OpenAI's "
    "[Whisper Base](https://huggingface.co/openai/whisper-base) model for "
    "speech translation, and JackismyShephard's "
    "[speecht5_tts-finetuned-nst-da]"
    "(https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) "
    "model for text-to-speech: "
    "![Cascaded STST]"
    "(https://huggingface.co/datasets/huggingface-course/audio-course-images/"
    "resolve/main/s2st_cascaded.png "
    '"Diagram of cascaded speech to speech translation") '
)

demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(label="Input Speech", type="filepath"),
    ],
    outputs=gr.Audio(label="Translated Speech", type="numpy"),
    title=title,
    description=description,
    examples=[["./example.wav"]],
    cache_examples=True,
    allow_flagging="never",
)

demo.launch()