Spaces:
Paused
Paused
File size: 3,177 Bytes
226475c 5307e6b dbc99da 5307e6b 226475c dbc99da 5307e6b dbc99da da9d4b3 dbc99da 226475c 5307e6b dbc99da b9359f0 dbc99da b9359f0 226475c dbc99da 226475c dbc99da 226475c 0bc8a9a 69c7afe 0bc8a9a a2d9db4 226475c b9359f0 9e2b006 b9359f0 9e2b006 b9359f0 9e2b006 b9359f0 9e2b006 b9359f0 9e2b006 226475c b9359f0 226475c 8e194d1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import torch
import numpy as np
from transformers import pipeline
from transformers import BarkModel
from transformers import AutoProcessor
# All inference runs on this device; both models below are placed on it.
device = "cpu"

# Whisper large-v2 speech-recognition pipeline, used by ``translate``.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v2",
    device=device,
)

# Bark text-to-speech: the processor builds the model inputs from text and a
# voice preset, and the model generates the speech output from them.
processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark").to(device)

# Sample rate taken from Bark's generation config; it is returned together
# with the generated audio so the UI can play it back.
synthesised_rate = model.generation_config.sample_rate
def translate(audio):
    """Run the Whisper pipeline on ``audio`` and return the decoded text.

    The pipeline is invoked with ``task="transcribe"`` and
    ``language="chinese"``, and generation is capped at 256 new tokens.

    Args:
        audio: The audio input forwarded unchanged to the module-level
            ``pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    decoding_options = {"task": "transcribe", "language": "chinese"}
    result = pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
    return result["text"]
def synthesise(text_prompt, voice_preset="v2/zh_speaker_1"):
    """Generate speech for ``text_prompt`` with the Bark model.

    Args:
        text_prompt: Text to be spoken.
        voice_preset: Voice preset name handed to the Bark processor.

    Returns:
        The raw output of ``model.generate`` for the processed prompt.
    """
    # Build the model inputs and place them on the same device as the model.
    model_inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
    return model.generate(**model_inputs, pad_token_id=10000)
def speech_to_speech_translation(audio, voice_preset="v2/zh_speaker_1"):
    """Turn input speech into text and re-synthesise that text as speech.

    Args:
        audio: Audio input forwarded to ``translate``.
        voice_preset: Voice preset name forwarded to ``synthesise``.

    Returns:
        A ``(sample_rate, waveform, text)`` tuple where ``sample_rate`` is the
        module-level ``synthesised_rate``, ``waveform`` is a NumPy ``int16``
        array and ``text`` is the string returned by ``translate``.
    """
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text, voice_preset)

    # Tensor.numpy() only works on CPU tensors that do not require grad, so
    # detach and copy to host first. This is a no-op for a plain CPU tensor.
    waveform = synthesised_speech.detach().cpu().numpy()

    # The scaling below presumably expects floats in [-1, 1] (TODO confirm
    # against Bark's output range). Clip so any overshoot saturates instead of
    # wrapping around in the int16 cast.
    waveform = np.clip(waveform, -1.0, 1.0)
    synthesised_speech = (waveform * 32767).astype(np.int16)

    return synthesised_rate, synthesised_speech, translated_text
def speech_to_speech_translation_fix(audio, voice_preset="v2/zh_speaker_1"):
    """Adapt ``speech_to_speech_translation`` to the two UI outputs.

    Args:
        audio: Audio input forwarded to ``speech_to_speech_translation``.
        voice_preset: Voice preset name forwarded alongside it.

    Returns:
        ``((sample_rate, waveform.T), text)``: the audio pair for the audio
        output component, with the waveform transposed, followed by the text
        for the text output component.
    """
    rate, speech, text = speech_to_speech_translation(audio, voice_preset)
    audio_output = (rate, speech.T)
    return audio_output, text
# Heading and Markdown blurb passed to the gr.Interface definitions below.
title = "Multilanguage to Chinese(mandarin) Cascaded STST"
# The model names and links match the checkpoints this file loads:
# openai/whisper-large-v2 and suno/bark.
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in Multilanguage to target speech in Chinese(mandarin). Demo uses OpenAI's [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2) model for speech translation, and Suno's [Bark](https://huggingface.co/suno/bark) model for text-to-speech.

"""
# Paths of the sample clips offered as examples in the upload tab.
_EXAMPLE_CLIPS = (
    "./cs-CZ.mp3",
    "./de-DE.mp3",
    "./en-AU.mp3",
    "./en-GB.mp3",
    "./en-US.mp3",
    "./es-ES.mp3",
    "./fr-FR.mp3",
    "./it-IT.mp3",
    "./ko-KR.mp3",
    "./nl-NL.mp3",
    "./pl-PL.mp3",
    "./pt-PT.mp3",
    "./ru-RU.mp3",
)

# One row per clip: the clip path followed by a ``None`` placeholder.
examples = [[clip, None] for clip in _EXAMPLE_CLIPS]
import gradio as gr
# Container that hosts the tabbed UI assembled further down.
demo = gr.Blocks()

# Tab 1: the user uploads an audio file; the sample clips are offered here.
file_transcribe = gr.Interface(
    title=title,
    description=description,
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
    ],
    examples=examples,
)

# Tab 2: the user records from the microphone; no examples are attached.
mic_transcribe = gr.Interface(
    title=title,
    description=description,
    fn=speech_to_speech_translation_fix,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
        gr.Text(label="Transcription"),
    ],
)
# Show the two interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Transcribe Audio File", "Transcribe Microphone"],
    )

# Start the app. The stray trailing "|" that followed this call was removed:
# it left a binary operator without a right-hand operand, a syntax error.
demo.launch(share=True)