Spaces:
Sleeping
Sleeping
import os | |
import numpy as np | |
import gradio as gr | |
import pyopenjtalk | |
from util import preprocess_input, postprocess_phn, get_tokenizer, load_pitch_dict, get_pinyin | |
from espnet_model_zoo.downloader import ModelDownloader | |
from espnet2.bin.svs_inference import SingingGenerate | |
# Display names offered in the singer drop-down, shared by every model.
_SINGER_LABELS = (
    "singer1 (male)",
    "singer2 (female)",
    "singer3 (male)",
    "singer4 (female)",
    "singer5 (male)",
    "singer6 (female)",
    "singer7 (male)",
    "singer8 (female)",
    "singer9 (male)",
    "singer10 (female)",
)

# Integer singer ids used by the Chinese-only model, aligned with _SINGER_LABELS.
_CHINESE_SINGER_IDS = (1, 12, 23, 29, 18, 8, 25, 5, 10, 15)

# Singer-embedding files used by the multilingual model, aligned with
# _SINGER_LABELS.
_MULTILINGUAL_EMBEDDING_FILES = tuple(
    f"resource/singer/singer_embedding_{stem}.npy"
    for stem in (
        "ace-1",
        "ace-2",
        "ace-3",
        "ace-8",
        "ace-7",
        "itako",
        "ofuton",
        "kising_orange",
        "m4singer_Tenor-1",
        "m4singer_Alto-4",
    )
)

# Per "model-language" choice: singer display name -> singer id (Chinese model)
# or path of the singer embedding file (multilingual model). Each entry is its
# own dict object.
singer_embeddings = {
    "Model①(Chinese)-zh": dict(zip(_SINGER_LABELS, _CHINESE_SINGER_IDS)),
    "Model②(Multilingual)-zh": dict(
        zip(_SINGER_LABELS, _MULTILINGUAL_EMBEDDING_FILES)
    ),
    "Model②(Multilingual)-jp": dict(
        zip(_SINGER_LABELS, _MULTILINGUAL_EMBEDDING_FILES)
    ),
}

# Per "model-language" choice: tag of the pretrained model to download.
_MULTILINGUAL_MODEL_TAG = "espnet/mixdata_svs_visinger2_spkembed_lang_pretrained"
model_dict = {
    "Model①(Chinese)-zh": "espnet/aceopencpop_svs_visinger2_40singer_pretrain",
    "Model②(Multilingual)-zh": _MULTILINGUAL_MODEL_TAG,
    "Model②(Multilingual)-jp": _MULTILINGUAL_MODEL_TAG,
}

# Singer names listed in the UI (taken from the multilingual Chinese entry).
total_singers = list(singer_embeddings["Model②(Multilingual)-zh"])

# Language suffix of the model choice -> language id passed to the model.
langs = dict(zh=2, jp=1)
# Loaded SingingGenerate objects keyed by (model tag, device), so a checkpoint
# is downloaded/unpacked and built once instead of on every request.
_SVS_CACHE = {}


def _error(fs, message):
    """Return the (audio, status) pair used to report a failure to the UI."""
    return (fs, np.array([0.0])), message


def _is_blank(value):
    """Return True when an input is None, empty, or only whitespace."""
    return value is None or not str(value).strip()


def _load_svs(model_tag, device):
    """Return the SingingGenerate for *model_tag*, building it on first use."""
    key = (model_tag, device)
    if key not in _SVS_CACHE:
        downloaded = ModelDownloader().download_and_unpack(model_tag)
        _SVS_CACHE[key] = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=device,
        )
    return _SVS_CACHE[key]


def gen_song(model_name, spk, texts, durs, pitchs):
    """Synthesize singing audio from lyrics, durations and pitches.

    Args:
        model_name: A key of ``model_dict``; the part after the last ``-`` is
            the lyric language (``zh`` or ``jp``).
        spk: A singer name, i.e. a key of ``singer_embeddings[model_name]``.
        texts: Lyrics, one token per note. ``AP``/``SP`` are passed through
            untouched and ``-``/``——`` marks a slur that repeats the last
            phoneme of the previous note.
        durs: Whitespace-separated note durations, one float per lyric token.
        pitchs: Whitespace-separated pitches, one per lyric token; each must be
            a key of the table returned by ``load_pitch_dict()``.

    Returns:
        ``((fs, waveform), status)``. On success ``waveform`` is the generated
        audio and ``status`` is ``"success!"``. On invalid input ``waveform``
        is ``np.array([0.0])`` and ``status`` is an ``"Error: ..."`` message.
    """
    fs = 44100
    tempo = 120

    # Treat None, empty and whitespace-only input alike: all mean "not given".
    for value, field in ((texts, "Text"), (durs, "Dur"), (pitchs, "Pitch")):
        if _is_blank(value):
            return _error(fs, f"Error: No {field} provided!")

    # Nothing selected (None) or an unknown choice would otherwise crash below.
    if model_name not in model_dict:
        return _error(fs, f"Error: model `{model_name}` is invalid!")
    if spk not in singer_embeddings[model_name]:
        return _error(fs, f"Error: singer `{spk}` is invalid!")

    lang = model_name.split("-")[-1]
    pretrain_model = model_dict[model_name]

    # preprocess
    if lang == "zh":
        texts = preprocess_input(texts, "")
        text_list = get_pinyin(texts)
    elif lang == "jp":
        texts = preprocess_input(texts, " ")
        text_list = texts.strip().split()
    else:
        return _error(fs, f"Error: language `{lang}` is not supported!")
    durs = preprocess_input(durs, " ")
    dur_list = durs.strip().split()
    pitchs = preprocess_input(pitchs, " ")
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return _error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!",
        )
    if len(text_list) != len(pitch_list):
        return _error(
            fs,
            f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!",
        )

    # text to phoneme: one "_"-joined phoneme string per lyric token
    tokenizer = get_tokenizer(model_name, lang)
    sybs = []
    for text in text_list:
        if text in ("AP", "SP", "-", "——"):
            # Special tokens bypass the tokenizer.
            rev = [text]
        else:
            rev = tokenizer(text)
        # The tokenizer result is compared against False to detect a token it
        # could not convert (comparison kept exactly as in the original code).
        if rev == False:  # noqa: E712
            return _error(fs, f"Error: text `{text}` is invalid!")
        rev = postprocess_phn(rev, model_name, lang)
        sybs.append("_".join(rev))

    pitch_dict = load_pitch_dict()

    labels = []  # flat phoneme sequence for the whole song
    notes = []   # one [start, end, lyric, pitch, phonemes] entry per note
    st = 0
    pre_phn = ""
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if phns in ("-", "——"):
            if not pre_phn:
                # A leading slur has no previous phoneme to extend.
                return _error(fs, f"Error: text `{phns}` is invalid!")
            phns = pre_phn
        if pitch not in pitch_dict:
            return _error(fs, f"Error: pitch `{pitch}` is invalid!")
        pitch = pitch_dict[pitch]
        phn_list = phns.split("_")
        lyric = "".join(phn_list)
        try:
            dur_value = float(dur)
        except ValueError:
            return _error(fs, f"Error: duration `{dur}` is invalid!")
        # Rejects NaN, infinities and negative lengths in one comparison.
        if not 0 <= dur_value < float("inf"):
            return _error(fs, f"Error: duration `{dur}` is invalid!")
        notes.append([st, st + dur_value, lyric, pitch, phns])
        st += dur_value
        labels.extend(phn_list)
        pre_phn = labels[-1]

    batch = {
        "score": (
            int(tempo),
            notes,
        ),
        "text": " ".join(labels),
    }
    print(batch)

    # Infer (always on CPU)
    device = "cpu"
    svs = _load_svs(pretrain_model, device)

    if model_name == "Model①(Chinese)-zh":
        # This model is conditioned on an integer singer id.
        sid = np.array([singer_embeddings[model_name][spk]])
        output_dict = svs(batch, sids=sid)
    else:
        # The other models take a language id plus a singer embedding loaded
        # from the .npy file registered for the chosen singer.
        lid = np.array([langs[lang]])
        spk_embed = np.load(singer_embeddings[model_name][spk])
        output_dict = svs(batch, lids=lid, spembs=spk_embed)
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"
# Example rows for the UI: [model-language, singer, lyrics, durations, pitches].
# SP: silence, AP: aspirate.
_ZH_MODEL_1 = "Model①(Chinese)-zh"
_ZH_MODEL_2 = "Model②(Multilingual)-zh"
_JP_MODEL_2 = "Model②(Multilingual)-jp"

# Scores that several example rows share.
_RAIN_LYRICS = "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP"
_RAIN_DURS = "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34"
_RAIN_PITCH_NUMBERS = "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0"
_RAIN_PITCH_NOTES = "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest"

_HAND_LYRICS = "你 说 你 不 SP 懂\n 为 何 在 这 时 牵 手 AP"
_HAND_DURS = "0.11 0.33 0.29 0.13 0.15 0.48\n0.24 0.18 0.34 0.15 0.27 0.28 0.63 0.44"
_HAND_PITCH = "63 63 63 63 0 63\n62 62 62 63 65 63 62 0"

_IJIN_LYRICS = "い じ ん さ ん に つ れ ら れ て"
_IJIN_DURS = "0.6 0.3 0.3 0.3 0.3 0.6 0.6 0.3 0.3 0.6 0.23"
_IJIN_PITCH = "60 60 60 56 56 56 55 55 55 53 56"

examples = [
    [_ZH_MODEL_1, "singer1 (male)", _RAIN_LYRICS, _RAIN_DURS, _RAIN_PITCH_NUMBERS],
    # midi note
    [_ZH_MODEL_1, "singer3 (male)", _RAIN_LYRICS, _RAIN_DURS, _RAIN_PITCH_NOTES],
    # up 1 key
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        _RAIN_LYRICS,
        _RAIN_DURS,
        "C#4 D#4 D#4 D#4 rest D#4 B3 rest\nB3 B3 rest B3 B3 E4 rest",
    ],
    # lyrics
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        "雨 淋 湿 了 SP 大 地 AP\n毁 的 SP 很 讲 究 AP",
        _RAIN_DURS,
        _RAIN_PITCH_NOTES,
    ],
    [_ZH_MODEL_2, "singer3 (male)", _HAND_LYRICS, _HAND_DURS, _HAND_PITCH],
    # double duration
    [
        _ZH_MODEL_2,
        "singer3 (male)",
        _HAND_LYRICS,
        "0.23 0.66 0.58 0.27 0.3 0.97\n0.48 0.36 0.69 0.3 0.53 0.56 1.27 0.89",
        _HAND_PITCH,
    ],
    # long
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        _RAIN_LYRICS + "\n" + _HAND_LYRICS,
        _RAIN_DURS + "\n" + _HAND_DURS,
        _RAIN_PITCH_NUMBERS + "\n" + _HAND_PITCH,
    ],
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        "修 炼 爱 情 的 心 酸 SP AP",
        "0.42 0.21 0.19 0.28 0.22 0.33 1.53 0.1 0.29",
        "68 70 68 66 63 68 68 0 0",
    ],
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        "学 会 放 好 以 前 的 渴 望 SP AP",
        "0.3 0.22 0.29 0.27 0.25 0.44 0.54 0.29 1.03 0.08 0.39",
        "68 70 68 66 61 68 68 65 66 0 0",
    ],
    # slur
    [
        _ZH_MODEL_1,
        "singer3 (male)",
        "SP 我 不 - 是 一 定 要 你 回 - 来 SP",
        "0.37 0.45 0.47 0.17 0.52 0.28 0.46 0.31 0.44 0.45 0.2 2.54 0.19",
        "0 51 60 61 59 59 57 57 59 60 61 59 0",
    ],
    [
        _ZH_MODEL_1,
        "singer4 (female)",
        "AP 我 多 想 再 见 你\n哪 怕 匆 - 匆 一 AP 眼 就 别 离 AP",
        "0.13 0.24 0.68 0.78 0.86 0.4 0.94 0.54 0.3 0.56 0.16 0.86 0.26 0.22 0.28 0.78 0.68 1.5 0.32",
        "0 57 66 63 63 63 63 60 61 61 63 66 66 0 61 61 59 58 0",
    ],
    [_JP_MODEL_2, "singer8 (female)", _IJIN_LYRICS, _IJIN_DURS, _IJIN_PITCH],
    # pitch
    [
        _JP_MODEL_2,
        "singer8 (female)",
        _IJIN_LYRICS,
        _IJIN_DURS,
        "62 62 62 58 58 58 57 57 57 55 58",
    ],
    # double dur
    [
        _JP_MODEL_2,
        "singer8 (female)",
        _IJIN_LYRICS,
        "1.2 0.6 0.6 0.6 0.6 1.2 1.2 0.6 0.6 1.2 0.45",
        _IJIN_PITCH,
    ],
    # half dur
    [
        _JP_MODEL_2,
        "singer8 (female)",
        _IJIN_LYRICS,
        "0.3 0.15 0.15 0.15 0.15 0.3 0.3 0.15 0.15 0.3 0.11",
        _IJIN_PITCH,
    ],
    [
        _JP_MODEL_2,
        "singer8 (female)",
        "きっ と と べ ば そ ら ま で と ど く AP",
        "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08",
        "64 71 68 69 71 71 69 68 66 68 69 68 0",
    ],
    [
        _JP_MODEL_2,
        "singer8 (female)",
        "じゃ の め で お む か え う れ し い な",
        "0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.43 0.14 0.65",
        "60 60 60 62 64 67 69 69 64 64 64 62 60",
    ],
    [
        _JP_MODEL_2,
        "singer10 (female)",
        "お と め わ ら い か ふぁ い や ら い か ん な い す ぶ ろ うぃ ん ぶ ろ うぃ ん い ん ざ うぃ ん",
        "0.15 0.15 0.15 0.15 0.3 0.15 0.3 0.15 0.15 0.3 0.07 0.07 0.15 0.15 0.15 0.15 0.15 0.15 0.45 0.07 0.07 0.07 0.38 0.07 0.07 0.15 0.15 0.3 0.15 0.15",
        "67 67 67 67 67 67 69 67 67 69 67 67 64 64 64 64 64 64 62 64 64 62 62 64 64 62 62 59 59 59",
    ],
]
# Gradio page: usage notes, model/singer selection, score inputs, examples and
# the "Generate" button wired to gen_song.
with gr.Blocks() as demo:
    # Page header and usage instructions. The string body is kept flush-left
    # so no extra indentation ends up in the Markdown text.
    gr.Markdown(
        """
<h1 align="center"> Demo of Singing Voice Synthesis in Muskits-ESPnet </h1>
<div style="font-size: 20px;">
This is the demo page of our toolkit <a href="https://arxiv.org/abs/2409.07226"><b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b></a>.
Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
Music score usually includes lyrics, as well as duration and pitch of each word in lyrics.
<h2>How to use:</h2>
1. <b>Choose Model-Language</b>:
<ul>
<li> "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
<li> For example, "Model②(Multilingual)-zh" means model "Model②(Multilingual)" with lyrics input in Chinese. </li>
</ul>
2. <b>[Optional] Choose Singer</b>: Choose one singer you like from the drop-down list.
3. <b>Input lyrics</b>:
<ul>
<li> Lyrics use Chinese characters when the language is 'zh' and hiragana when the language is 'jp'. </li>
<li> Special characters such as 'AP' (breath), 'SP' (silence), and '-' (slur, only for Chinese lyrics) can also be used. </li>
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
4. <b>Input durations</b>:
<ul>
<li> Durations use float number as input. </li>
<li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
<li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
5. <b>Input pitches</b>:
<ul>
<li> Pitches use MIDI note or MIDI note number as input. Specially, "69" in MIDI note number represents "A4" in MIDI note. </li>
<li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
<li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
</ul>
6. <b>Hit "Generate" and listen to the result!</b>
</div>
<h2>Notice:</h2>
<ul>
<li> Plenty of examples are provided. </li>
<li> Extreme values may result in suboptimal generation quality! </li>
</ul>
"""
    )

    # Row-1: model/language choice and singer choice.
    with gr.Row():
        with gr.Column(variant="panel"):
            model_name = gr.Radio(
                label="Model-Language",
                choices=[
                    "Model①(Chinese)-zh",
                    "Model②(Multilingual)-zh",
                    "Model②(Multilingual)-jp",
                ],
            )
        with gr.Column(variant="panel"):
            singer = gr.Dropdown(
                label="Singer",
                choices=total_singers,
            )

    # Disabled: refresh the singer list whenever the model choice changes.
    # def set_model(model_name_str: str):
    #     """
    #     gets value from `model_name`. either
    #     uses cached list of speakers for the given model name
    #     or loads the addon and checks what are the speakers.
    #     """
    #     speakers = list(singer_embeddings[model_name_str].keys())
    #     value = speakers[0]
    #     return gr.update(
    #         choices=speakers, value=value, visible=True, interactive=True
    #     )
    # model_name.change(set_model, inputs=model_name, outputs=singer)

    # Row-2: score inputs on the left, generated audio and status on the right.
    with gr.Row():
        with gr.Column(variant="panel"):
            lyrics = gr.Textbox(label="Lyrics")
            duration = gr.Textbox(label="Duration")
            pitch = gr.Textbox(label="Pitch")
            generate = gr.Button("Generate")
        with gr.Column(variant="panel"):
            gened_song = gr.Audio(label="Generated Song", type="numpy")
            run_status = gr.Textbox(label="Running Status")

    # Clickable presets that fill the five input components.
    gr.Examples(
        examples=examples,
        inputs=[model_name, singer, lyrics, duration, pitch],
        outputs=[singer],
        label="Examples",
        examples_per_page=20,
    )

    gr.Markdown("""
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
<a href="https://github.com/espnet/espnet">espnet</a> |
<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">Model①(Chinese)</a> |
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">Model②(Multilingual)</a></p>
</div>
"""
    )

    # Run synthesis on click; gen_song returns (audio, status) in this order.
    generate.click(
        fn=gen_song,
        inputs=[model_name, singer, lyrics, duration, pitch],
        outputs=[gened_song, run_status],
    )

demo.launch()