import spaces
import gradio as gr
import io
import os
import re
import torch
import torchaudio
from pathlib import Path
from whisperspeech.pipeline import Pipeline


title = """# 🙋🏻‍♂️ Welcome to 🌟Collabora's 🌬️💬📝WhisperSpeech

You can use this ZeroGPU Space to test out the current model, [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech), an open-source text-to-speech system built by inverting Whisper. Install it locally with `pip install whisperspeech` and use it from the command line, or call it programmatically in your own pipelines. It's like Stable Diffusion but for speech: both powerful and easily customizable. [Contribute to WhisperSpeech here](https://github.com/collabora/WhisperSpeech).
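
A minimal sketch of programmatic use (assuming you've run `pip install whisperspeech` and have a GPU available):

```python
from whisperspeech.pipeline import Pipeline

pipe = Pipeline()  # downloads the model checkpoints on first use
audio = pipe.generate("Hello from WhisperSpeech!")  # audio tensor (this demo outputs 24 kHz)
```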

You can also use 🌬️💬📝WhisperSpeech by duplicating this Space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>

We're **celebrating the release of WhisperSpeech** with the [LAION community](https://laion.ai/). If you love open-source AI, learn more at [laion.ai](https://laion.ai/). Big thanks to the folks at Hugging Face for the community grant 🤗

### How to Use

Enter text with the language identifiers shown below to generate multilingual speech, for example: `<en> Hello! <pl> Cześć!`. Optionally, add an audio sample to create a voice print. Scroll down and try the API <3 Gradio.

This Space runs on ZeroGPU, so **you need to be patient**: the first request has to acquire a GPU and load the model!
"""


# Demo inputs: [text, optional speaker-audio URL]. "Lion" and "Jewels" appear
# to be phonetic spellings so the model pronounces "LAION" and "Juwels" correctly.
text_examples = [
    ["This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.", None],
    ["World War II or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis.", "https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg"],
    ["<pl>To jest pierwszy test wielojęzycznego <en>Whisper Speech <pl>, modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze <en>Jewels.", None],
    ["<en> WhisperSpeech is an Open Source library that helps you convert text to speech. <pl>Teraz także po Polsku! <en>I think I just tried saying \"now also in Polish\", don't judge me...", None],
    ["<pl>To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.", None],
]


def parse_multilingual_text(input_text):
    # Split the input into (language, text) segments based on <lang> tags such
    # as "<en>" or "<pl>"; text before any tag defaults to English.
    pattern = r"(?:<(\w+)>)|([^<]+)"
    cur_lang = 'en'
    segments = []
    for lang, txt in re.findall(pattern, input_text):
        if lang:
            cur_lang = lang
        else:
            segments.append((cur_lang, f" {txt} "))
    if not segments:
        return [("en", "")]
    return segments
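
# Example (a sketch of parse_multilingual_text's behavior):
#   parse_multilingual_text("<en>Hello!<pl>Cześć!")
#   -> [('en', ' Hello! '), ('pl', ' Cześć! ')]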


@spaces.GPU(enable_queue=True)
def generate_audio(pipe, segments, speaker, speaker_url, cps=14):
    # Pick the speaker embedding: an uploaded/recorded file takes priority,
    # then a pasted URL, otherwise the pipeline's default voice.
    if isinstance(speaker, (str, Path)):
        speaker = pipe.extract_spk_emb(speaker)
    elif speaker_url:
        speaker = pipe.extract_spk_emb(speaker_url)
    else:
        speaker = pipe.default_speaker
    langs, texts = [list(x) for x in zip(*segments)]
    print(texts, langs)
    stoks = pipe.t2s.generate(texts, cps=cps, lang=langs)[0]  # text -> semantic tokens
    atoks = pipe.s2a.generate(stoks, speaker.unsqueeze(0))    # semantic -> acoustic tokens
    audio = pipe.vocoder.decode(atoks)                        # acoustic tokens -> waveform
    return audio.cpu()
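
# A hypothetical local sketch (not used by the Space itself): the `torchaudio`
# import above can save generate_audio()'s output to disk; 24000 matches the
# sample rate whisper_speech_demo() reports below.
#
#   audio = generate_audio(pipe, [("en", " Hello! ")], None, None)
#   torchaudio.save("out.wav", audio, 24000)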


def whisper_speech_demo(multilingual_text, speaker_audio, speaker_url, cps):
    if len(multilingual_text) == 0:
        raise gr.Error("Please enter some text for me to speak!")

    segments = parse_multilingual_text(multilingual_text)
    audio = generate_audio(pipe, segments, speaker_audio, speaker_url, cps)

    # gr.Audio expects a (sample_rate, numpy_array) tuple; the model outputs 24 kHz audio.
    return (24000, audio.T.numpy())
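
# A hypothetical sketch of calling this demo over the Gradio API (the endpoint
# name and argument order are assumptions; check the Space's "Use via API"
# panel for the exact signature):
#
#   from gradio_client import Client
#   client = Client("Tonic/laion-whisper")  # Space id taken from the duplicate link above
#   result = client.predict("<en> Hello!", None, "", 14, api_name="/predict")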


with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            text_input = gr.Textbox(label="Enter multilingual text💬📝",
                                    value=text_examples[0][0],
                                    info="You can use `<en>` for English and `<pl>` for Polish; see the examples below.")
            cps = gr.Slider(value=14, minimum=10, maximum=15, step=.25,
                            label="Tempo (in characters per second)")
            speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)🌬️💬",
                                     sources=["upload", "microphone"],
                                     type='filepath')
            gr.Markdown(" \n ")
            url_input = gr.Textbox(label="Alternatively, you can paste in an audio file URL:")
            generate_button = gr.Button("Try Collabora's WhisperSpeech🌟")
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="WhisperSpeech says…")

    with gr.Row():
        gr.Examples(
            examples=text_examples,
            inputs=[text_input, url_input],
            outputs=[output_audio],
            fn=whisper_speech_demo,
            cache_examples=False,
            label="Try these to get started!🌟🌬️"
        )

    generate_button.click(whisper_speech_demo,
                          inputs=[text_input, speaker_input, url_input, cps],
                          outputs=output_audio)


# Load the pipeline once at startup and run a short warmup generation so the
# first user request doesn't pay the full model-loading cost.
pipe = Pipeline()
pipe.generate("WhisperSpeech warmup")

demo.launch(server_port=3000)