JackismyShephard committed
Commit 4b94ac2 · Parent(s): 63fa33e

add text translation module
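
This change inserts an NLLB-200 text-translation stage between Whisper's speech translation and the Danish TTS model, so the text handed to the Danish synthesiser is itself translated into Danish (Whisper's translate task only produces English). A minimal standalone sketch of that stage, using the model ID and language codes from the diff below; the input sentence is just a placeholder:

    from transformers import pipeline

    # Text-translation stage added by this commit: English text -> Danish text.
    translation_pipe = pipeline(
        "translation",
        model="facebook/nllb-200-distilled-600M",
    )

    danish = translation_pipe(
        "Hello, how are you today?",  # placeholder English input
        src_lang="eng_Latn",
        tgt_lang="dan_Latn",
    )[0]["translation_text"]
    print(danish)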

Files changed (1): app.py (+22 -14)
app.py CHANGED

@@ -4,9 +4,6 @@ import torch
 
 from transformers import pipeline
 
-checkpoint_finetuned = "JackismyShephard/speecht5_tts-finetuned-nst-da"
-
-revision = "5af228df418092b681cf31c31e413bdd2b5f9c8c"
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
 # load speech translation checkpoint
@@ -17,14 +14,21 @@ asr_pipe = pipeline(
     chunk_length_s=30,
     use_fast=True,
 )
+# load text translation checkpoint
+translation_pipe = pipeline(
+    "translation",
+    model="facebook/nllb-200-distilled-600M",
+    use_fast=True,
+    device=device,
+)
 
 # load text-to-speech checkpoint and speaker embeddings
-pipe = pipeline(
+tts_pipe = pipeline(
     "text-to-speech",
-    model=checkpoint_finetuned,
+    model="JackismyShephard/speecht5_tts-finetuned-nst-da",
     use_fast=True,
     device=device,
-    revision=revision,
+    revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
 )
 
 speaker_embedding_path = "female_23_vestjylland.npy"
@@ -38,11 +42,17 @@ max_range = np.iinfo(target_dtype).max
 def translate(audio):
     outputs = asr_pipe(
         audio,
-        max_new_tokens=256,
         batch_size=8,
-        generate_kwargs={"task": "translate", "language": "danish"},
+        generate_kwargs={
+            "task": "translate",
+        },
     )
-    return outputs["text"]
+    translated_text = translation_pipe(
+        outputs["text"],
+        src_lang="eng_Latn",
+        tgt_lang="dan_Latn",
+    )[0]["translation_text"]
+    return translated_text
 
 
 def synthesise(text):
@@ -52,7 +62,7 @@ def synthesise(text):
     text = replace_danish_letters(text)
 
     forward_params = {"speaker_embeddings": speaker_embedding_tensor}
-    speech = pipe(text, forward_params=forward_params)
+    speech = tts_pipe(text, forward_params=forward_params)
 
     sr, audio = speech["sampling_rate"], speech["audio"]
 
@@ -95,7 +105,7 @@ replacements = [
 ]
 
 
-title = "Cascaded STST"
+title = "Speech to Danish Speech Translation"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Danish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and JackismyShephard's
 [speecht5_tts-finetuned-nst-da](https://huggingface.co/JackismyShephard/speecht5_tts-finetuned-nst-da) model for text-to-speech:
@@ -105,9 +115,7 @@ Demo for cascaded speech-to-speech translation (STST), mapping from source speec
 
 demo = gr.Interface(
     fn=speech_to_speech_translation,
-    inputs=[
-        gr.Audio(label="Input Speech", type="filepath"),
-    ],
+    inputs=gr.Audio(label="Input Speech", type="filepath"),
     outputs=gr.Audio(label="Translated Speech", type="numpy"),
     title=title,
     description=description,
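
For context, a hedged end-to-end sketch of the cascade after this commit: Whisper translates source speech to English text, NLLB-200 translates that text to Danish, and the fine-tuned SpeechT5 model synthesises Danish speech. The whisper-base checkpoint comes from the demo description and the speaker-embedding file name from the diff context; the input path and the way the embedding is loaded are not shown in the diff and are assumed here.

    import numpy as np
    import torch
    from transformers import pipeline

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    # Stage 1: speech -> English text (Whisper's translate task outputs English).
    asr_pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-base",
        chunk_length_s=30,
        device=device,
    )
    # Stage 2: English text -> Danish text (added in this commit).
    translation_pipe = pipeline(
        "translation",
        model="facebook/nllb-200-distilled-600M",
        device=device,
    )
    # Stage 3: Danish text -> Danish speech.
    tts_pipe = pipeline(
        "text-to-speech",
        model="JackismyShephard/speecht5_tts-finetuned-nst-da",
        revision="5af228df418092b681cf31c31e413bdd2b5f9c8c",
        device=device,
    )

    # Assumed: the .npy file holds a 512-dim speaker embedding; add a batch dimension.
    speaker_embedding = torch.tensor(
        np.load("female_23_vestjylland.npy")
    ).unsqueeze(0)

    english = asr_pipe(
        "input.wav",  # placeholder audio path
        generate_kwargs={"task": "translate"},
    )["text"]
    danish = translation_pipe(
        english, src_lang="eng_Latn", tgt_lang="dan_Latn"
    )[0]["translation_text"]
    speech = tts_pipe(danish, forward_params={"speaker_embeddings": speaker_embedding})
    sr, audio = speech["sampling_rate"], speech["audio"]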