import gradio as gr
import numpy as np
import torch
from transformers import pipeline
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# load speech translation checkpoint
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
    chunk_length_s=30,  # chunk long inputs into 30 s windows for Whisper
    use_fast=True,
)
# load text-to-speech checkpoint and speaker embeddings
pipe = pipeline(
    "text-to-speech",
    model=checkpoint_finetuned,
    use_fast=True,
    device=device,
    revision=revision,
)
# speaker embedding (x-vector) that selects the target voice for SpeechT5
speaker_embedding_path = "female_23_vestjylland.npy"
speaker_embedding = np.load(speaker_embedding_path)
speaker_embedding_tensor = torch.tensor(speaker_embedding).unsqueeze(0)
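# A rough sketch of how an x-vector like this could be produced offline with
# SpeechBrain's speaker encoder (an assumption about how the .npy file was
# made; the embedding is precomputed, so this is not executed at runtime):
#
#     from speechbrain.pretrained import EncoderClassifier
#     classifier = EncoderClassifier.from_hparams(
#         source="speechbrain/spkrec-xvect-voxceleb"
#     )
#     # waveform: 1-D float32 tensor of 16 kHz speech from the target speaker
#     xvector = classifier.encode_batch(waveform.unsqueeze(0))  # (1, 1, 512)
#     np.save("female_23_vestjylland.npy", xvector.squeeze().cpu().numpy())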
# SpeechT5 returns float32 audio in [-1, 1]; scale it to int16 for Gradio
target_dtype = np.int16
max_range = np.iinfo(target_dtype).max

def translate(audio):
    # translate the input speech to Danish text with Whisper
    outputs = asr_pipe(
        audio,
        max_new_tokens=256,
        batch_size=8,
        generate_kwargs={"task": "translate", "language": "danish"},
    )
    return outputs["text"]

def synthesise(text):
    if len(text.strip()) == 0:
        # nothing to synthesise: return silence at SpeechT5's 16 kHz rate
        return 16000, np.zeros(0)
    text = replace_danish_letters(text)
    forward_params = {"speaker_embeddings": speaker_embedding_tensor}
    speech = pipe(text, forward_params=forward_params)
    sr, audio = speech["sampling_rate"], speech["audio"]
    audio = (audio * max_range).astype(np.int16)
    return sr, audio
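# Quick smoke test of the TTS path on its own (hypothetical usage; the
# soundfile import is an assumption, not a dependency of the app itself):
#
#     sr, audio = synthesise("Hej verden")
#     import soundfile as sf
#     sf.write("hej.wav", audio, sr)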

def speech_to_speech_translation(audio):
    # full cascade: speech -> translated text -> normalised text -> speech
    translated_text = translate(audio)
    return synthesise(translated_text)

# map characters that the English SpeechT5 tokenizer handles poorly
# (Danish and other non-ASCII letters, stray punctuation) to plain spellings
replacements = [
    ("&", "og"),
    ("\r", " "),
    ("´", ""),
    ("\\", ""),
    ("¨", " "),
    ("Å", "AA"),
    ("Æ", "AE"),
    ("É", "E"),
    ("Ö", "OE"),
    ("Ø", "OE"),
    ("á", "a"),
    ("ä", "ae"),
    ("å", "aa"),
    ("æ", "ae"),
    ("è", "e"),
    ("î", "i"),
    ("ô", "oe"),
    ("ö", "oe"),
    ("ø", "oe"),
    ("ü", "y"),
]


def replace_danish_letters(text):
    for src, dst in replacements:
        text = text.replace(src, dst)
    return text
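# e.g. replace_danish_letters("Åbn døren") -> "AAbn doeren"
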
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
[speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
demo = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(label="Input Speech", type="filepath"),
    ],
    outputs=gr.Audio(label="Translated Speech", type="numpy"),
    title=title,
    description=description,
    examples=[["./example.wav"]],
    cache_examples=True,
    allow_flagging="never",
)
demo.launch()