File size: 3,177 Bytes
226475c
5307e6b
dbc99da
5307e6b
 
226475c
dbc99da
5307e6b
dbc99da
 
 
 
da9d4b3
dbc99da
 
226475c
5307e6b
 
 
dbc99da
 
 
 
 
 
 
 
b9359f0
dbc99da
b9359f0
 
226475c
dbc99da
226475c
dbc99da
226475c
 
0bc8a9a
69c7afe
 
 
 
 
 
 
 
 
 
 
 
 
0bc8a9a
a2d9db4
226475c
b9359f0
 
9e2b006
b9359f0
 
 
 
 
9e2b006
 
b9359f0
9e2b006
b9359f0
9e2b006
b9359f0
 
 
 
 
9e2b006
 
 
226475c
b9359f0
 
 
 
226475c
8e194d1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import torch
import numpy as np
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor

# All inference in this script is pinned to this device string.
device = "cpu"

# Speech-recognition pipeline backed by the openai/whisper-large-v2 checkpoint.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Bark text-to-speech: the processor prepares the prompt, the model generates
# audio. The model is moved onto `device` right after loading.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark").to(device)

# Sample rate reported by the Bark generation config; returned alongside the
# generated waveform by the functions below.
synthesised_rate = model.generation_config.sample_rate

def translate(audio):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    The pipeline is called with the ``transcribe`` task and the language set
    to ``chinese``, generating at most 256 new tokens.
    NOTE(review): presumably this makes Whisper emit Chinese text whatever the
    source language is -- confirm against the Whisper documentation.

    Args:
        audio: Input passed straight to the module-level ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    whisper_options = {"task": "transcribe", "language": "chinese"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=whisper_options)
    return result["text"]
def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
    """Generate speech for ``text_prompt`` with the module-level Bark model.

    Args:
        text_prompt: Text handed to the Bark processor.
        voice_preset: Voice preset name forwarded to the processor.

    Returns:
        Whatever ``model.generate`` returns for the processed prompt.
    """
    encoded = processor(text_prompt, voice_preset=voice_preset)
    # Put the processed inputs on the same device as the model.
    encoded = encoded.to(device)
    # NOTE(review): pad_token_id=10000 is a magic constant here -- confirm
    # its meaning against the Bark generation config.
    return model.generate(**encoded, pad_token_id=10000)
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    """Turn input speech into text, then re-synthesise it as 16-bit audio.

    Args:
        audio: Input forwarded unchanged to ``translate``.
        voice_preset: Voice preset name forwarded to ``synthesise``.

    Returns:
        A ``(sample_rate, samples, text)`` tuple: the module-level
        ``synthesised_rate``, the generated waveform converted to
        ``np.int16``, and the text returned by ``translate``.
    """
    translated_text = translate(audio)
    generated = synthesise(translated_text, voice_preset)

    # Tensor.numpy() only works for tensors in host memory, so copy to the
    # CPU first (a no-op when the tensor is already there).
    waveform = generated.cpu().numpy()

    # Assumes samples are floats nominally in [-1, 1] (the 32767 scale factor
    # implies this). Clamp so stray out-of-range values saturate instead of
    # overflowing when cast to int16.
    waveform = np.clip(waveform, -1.0, 1.0)
    synthesised_speech = (waveform * 32767).astype(np.int16)

    return synthesised_rate, synthesised_speech, translated_text
def speech_to_speech_translation_fix(audio,voice_preset="v2/zh_speaker_1"):
    """Adapt ``speech_to_speech_translation`` to a two-output shape.

    Args:
        audio: Input forwarded to ``speech_to_speech_translation``.
        voice_preset: Voice preset name forwarded likewise.

    Returns:
        ``((sample_rate, samples_transposed), text)`` -- the sample rate and
        the transposed waveform paired as one audio value, followed by the
        text.
    """
    rate, waveform, text = speech_to_speech_translation(audio, voice_preset)
    # Transpose the waveform before handing it on as the audio output.
    audio_out = (rate, waveform.T)
    return audio_out, text

# Heading shown at the top of each interface.
title = "Multilanguage to Chinese(mandarin) Cascaded STST"

# Markdown blurb shown under the title. The link labels name the checkpoints
# that are actually loaded: openai/whisper-large-v2 and suno/bark.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in Multilanguage to target speech in Chinese(mandarin). Demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and a [suno/bark](https://huggingface.co/suno/bark) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Sample clips, one per locale, expected next to this script.
_EXAMPLE_CLIPS = (
    "./cs-CZ.mp3",
    "./de-DE.mp3",
    "./en-AU.mp3",
    "./en-GB.mp3",
    "./en-US.mp3",
    "./es-ES.mp3",
    "./fr-FR.mp3",
    "./it-IT.mp3",
    "./ko-KR.mp3",
    "./nl-NL.mp3",
    "./pl-PL.mp3",
    "./pt-PT.mp3",
    "./ru-RU.mp3",
)

# Each example row pairs a clip path with None as its second entry.
examples = [[clip_path, None] for clip_path in _EXAMPLE_CLIPS]
import gradio as gr

demo = gr.Blocks()


def _build_interface(audio_source, example_rows=None):
    """Create one speech-to-speech ``gr.Interface`` fed by ``audio_source``."""
    return gr.Interface(
        fn=speech_to_speech_translation_fix,
        inputs=gr.Audio(source=audio_source, type="filepath"),
        outputs=[
            gr.Audio(label="Generated Speech", type="numpy"),
            gr.Text(label="Transcription"),
        ],
        title=title,
        description=description,
        examples=example_rows,
    )


# Only the file-upload interface is given the example clips.
file_transcribe = _build_interface("upload", example_rows=examples)
mic_transcribe = _build_interface("microphone")

# Present both interfaces as tabs inside the single Blocks app.
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

demo.launch(share=True)