Spaces:
Build error
Build error
JackismyShephard
committed on
Commit
·
4b94ac2
1
Parent(s):
63fa33e
add text translation module
Browse files
app.py
CHANGED
@@ -4,9 +4,6 @@ import torch
|
|
4 |
|
5 |
from transformers import pipeline
|
6 |
|
7 |
-
checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
|
8 |
-
|
9 |
-
revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
|
10 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
11 |
|
12 |
# load speech translation checkpoint
|
@@ -17,14 +14,21 @@ asr_pipe = pipeline(
|
|
17 |
chunk_length_s=30,
|
18 |
use_fast=True,
|
19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
# load text-to-speech checkpoint and speaker embeddings
|
22 |
-
|
23 |
"text-to-speech",
|
24 |
-
model=
|
25 |
use_fast=True,
|
26 |
device=device,
|
27 |
-
revision=
|
28 |
)
|
29 |
|
30 |
speaker_embedding_path = "female_23_vestjylland.npy"
|
@@ -38,11 +42,17 @@ max_range = np.iinfo(target_dtype).max
|
|
38 |
def translate(audio):
|
39 |
outputs = asr_pipe(
|
40 |
audio,
|
41 |
-
max_new_tokens=256,
|
42 |
batch_size=8,
|
43 |
-
generate_kwargs={
|
|
|
|
|
44 |
)
|
45 |
-
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
def synthesise(text):
|
@@ -52,7 +62,7 @@ def synthesise(text):
|
|
52 |
text = replace_danish_letters(text)
|
53 |
|
54 |
forward_params = {"speaker_embeddings": speaker_embedding_tensor}
|
55 |
-
speech =
|
56 |
|
57 |
sr, audio = speech["sampling_rate"], speech["audio"]
|
58 |
|
@@ -95,7 +105,7 @@ replacements = [
|
|
95 |
]
|
96 |
|
97 |
|
98 |
-
title = "
|
99 |
description = """
|
100 |
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
|
101 |
[speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
|
@@ -105,9 +115,7 @@ Demo for cascaded speech-to-speech translation (STST), mapping from source speec
|
|
105 |
|
106 |
demo = gr.Interface(
|
107 |
fn=speech_to_speech_translation,
|
108 |
-
inputs=
|
109 |
-
gr.Audio(label="Input Speech", type="filepath"),
|
110 |
-
],
|
111 |
outputs=gr.Audio(label="Translated Speech", type="numpy"),
|
112 |
title=title,
|
113 |
description=description,
|
|
|
4 |
|
5 |
from transformers import pipeline
|
6 |
|
|
|
|
|
|
|
7 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
8 |
|
9 |
# load speech translation checkpoint
|
|
|
14 |
chunk_length_s=30,
|
15 |
use_fast=True,
|
16 |
)
|
17 |
+
# load text translation checkpoint
|
18 |
+
translation_pipe = pipeline(
|
19 |
+
"translation",
|
20 |
+
model="facebook/nllb-200-distilled-600M",
|
21 |
+
use_fast=True,
|
22 |
+
device=device,
|
23 |
+
)
|
24 |
|
25 |
# load text-to-speech checkpoint and speaker embeddings
|
26 |
+
tts_pipe = pipeline(
|
27 |
"text-to-speech",
|
28 |
+
model="JackismyShephard/speecht5_tts-finetuned-nst-da",
|
29 |
use_fast=True,
|
30 |
device=device,
|
31 |
+
revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
|
32 |
)
|
33 |
|
34 |
speaker_embedding_path = "female_23_vestjylland.npy"
|
|
|
42 |
def translate(audio):
|
43 |
outputs = asr_pipe(
|
44 |
audio,
|
|
|
45 |
batch_size=8,
|
46 |
+
generate_kwargs={
|
47 |
+
"task": "translate",
|
48 |
+
},
|
49 |
)
|
50 |
+
translated_text = translation_pipe(
|
51 |
+
outputs["text"],
|
52 |
+
src_lang="eng_Latn",
|
53 |
+
tgt_lang="dan_Latn",
|
54 |
+
)[0]["translation_text"]
|
55 |
+
return translated_text
|
56 |
|
57 |
|
58 |
def synthesise(text):
|
|
|
62 |
text = replace_danish_letters(text)
|
63 |
|
64 |
forward_params = {"speaker_embeddings": speaker_embedding_tensor}
|
65 |
+
speech = tts_pipe(text, forward_params=forward_params)
|
66 |
|
67 |
sr, audio = speech["sampling_rate"], speech["audio"]
|
68 |
|
|
|
105 |
]
|
106 |
|
107 |
|
108 |
+
title = "Speech to Danish Speech Translation"
|
109 |
description = """
|
110 |
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
|
111 |
[speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
|
|
|
115 |
|
116 |
demo = gr.Interface(
|
117 |
fn=speech_to_speech_translation,
|
118 |
+
inputs=gr.Audio(label="Input Speech", type="filepath"),
|
|
|
|
|
119 |
outputs=gr.Audio(label="Translated Speech", type="numpy"),
|
120 |
title=title,
|
121 |
description=description,
|