TangRain committed on
Commit
9df835b
·
1 Parent(s): 8e4aee9

feat(demo-v1): support Chinese song with pretrained VISinger2

Browse files
Files changed (6) hide show
  1. .gitignore +2 -0
  2. app.py +185 -0
  3. midi-note.scp +152 -0
  4. pinyin_dict.py +425 -0
  5. requirements.txt +8 -0
  6. util.py +5 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__
2
+ .gradio
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import gradio as gr
3
+ from pypinyin import lazy_pinyin
4
+
5
+ from pinyin_dict import PINYIN_DICT
6
+
7
+ from espnet_model_zoo.downloader import ModelDownloader
8
+ from espnet2.fileio.read_text import read_label
9
+ from espnet2.bin.svs_inference import SingingGenerate
10
+
11
+
12
+ spks = {
13
+ "singer1 (man)": 1,
14
+ "singer2 (man)": 2,
15
+ "singer3 (female)": 5,
16
+ "singer4 (female)": 9,
17
+ "singer5 (man)": 18,
18
+ "singer6 (female)": 15,
19
+ "singer7 (man)": 23,
20
+ "singer8 (man)": 25,
21
+ "singer9 (female)": 29,
22
+ "singer10 (man)": 27,
23
+ }
24
+
25
# Cache of loaded synthesizers keyed by (model tag, device), so the
# pretrained model is downloaded/unpacked and built once per process
# instead of once per request.
_SVS_CACHE = {}


def _load_svs(model_tag, device):
    """Return the SingingGenerate instance for ``model_tag``, building it on first use."""
    key = (model_tag, device)
    if key not in _SVS_CACHE:
        downloaded = ModelDownloader().download_and_unpack(model_tag)
        # NOTE(review): assumes a SingingGenerate instance can be reused
        # across calls — confirm against the espnet2 inference API.
        _SVS_CACHE[key] = SingingGenerate(
            train_config=downloaded["train_config"],
            model_file=downloaded["model_file"],
            device=device,
        )
    return _SVS_CACHE[key]


def gen_song(lang, tempo, texts, durs, pitchs, spk):
    """Synthesize a song from lyrics, per-syllable durations and pitches.

    Args:
        lang: Language ID; only ``"zh"`` is handled.
        tempo: Tempo, anything ``int()`` accepts (e.g. ``89`` or ``"89"``).
        texts: Lyrics string; converted to pinyin with ``lazy_pinyin``.
        durs: Whitespace-separated durations, one per syllable.
        pitchs: Whitespace-separated pitches, one per syllable; each must be
            a key of ``./midi-note.scp`` (note name or MIDI number).
        spk: Singer label; must be a key of the module-level ``spks``.

    Returns:
        ``((sample_rate, waveform), status)``. On success ``status`` is
        ``"success!"``; on invalid input the waveform is ``np.array([0.0])``
        and ``status`` is an ``"Error: ..."`` message. Invalid input never
        raises.
    """
    fs = 44100
    # Placeholder audio returned together with every error message.
    failed_audio = (fs, np.array([0.0]))

    if lang != "zh":
        return failed_audio, f"Error: language `{lang}` is not supported!"
    model_tag = "espnet/aceopencpop_svs_visinger2_40singer_pretrain"

    # preprocess: validate presence first so nothing below sees None or
    # an empty string (an empty textbox yields "" rather than None).
    if texts is None or not texts.strip():
        return failed_audio, "Error: No Text provided!"
    if durs is None or not durs.strip():
        return failed_audio, "Error: No Dur provided!"
    if pitchs is None or not pitchs.strip():
        return failed_audio, "Error: No Pitch provided!"

    try:
        tempo_value = int(tempo)
    except (TypeError, ValueError):
        return failed_audio, f"Error: tempo `{tempo}` is not an integer!"
    if tempo_value <= 0:
        return failed_audio, f"Error: tempo `{tempo}` must be positive!"

    text_list = lazy_pinyin(texts.strip())
    dur_list = durs.strip().split()
    pitch_list = pitchs.strip().split()

    if len(text_list) != len(dur_list):
        return failed_audio, f"Error: len in text({len(text_list)}) mismatch with duration({len(dur_list)})!"
    if len(text_list) != len(pitch_list):
        return failed_audio, f"Error: len in text({len(text_list)}) mismatch with pitch({len(pitch_list)})!"

    ## text to phoneme
    sybs = []
    for text in text_list:
        text = text.lower()
        if text not in PINYIN_DICT:
            return failed_audio, f"Error: pinyin `{text}` is invalid!"
        sybs.append("_".join(PINYIN_DICT[text]))

    ## pitch: accept both the note name and the MIDI number as keys
    pitch_dict = {}
    with open("./midi-note.scp", "r", encoding="utf-8") as f:
        for line in f:
            items = line.split()
            if len(items) < 2:
                # Skip blank or malformed rows instead of raising IndexError.
                continue
            midi = int(items[1])
            pitch_dict[items[0]] = midi
            pitch_dict[items[1]] = midi

    labels = []
    notes = []
    st = 0
    for phns, dur, pitch in zip(sybs, dur_list, pitch_list):
        if pitch not in pitch_dict:
            return failed_audio, f"Error: pitch `{pitch}` is invalid!"
        try:
            dur_value = float(dur)
        except ValueError:
            return failed_audio, f"Error: duration `{dur}` is invalid!"
        # Written as a chained comparison so NaN and infinity are rejected too.
        if not 0 < dur_value < float("inf"):
            return failed_audio, f"Error: duration `{dur}` is invalid!"
        phn_list = phns.split("_")
        # Each note is [start, end, lyric, midi pitch, "_"-joined phonemes].
        notes.append([st, st + dur_value, "".join(phn_list), pitch_dict[pitch], phns])
        st += dur_value
        labels.extend(phn_list)

    batch = {
        "score": (tempo_value, notes),
        "text": " ".join(labels),
    }

    if spk not in spks:
        return failed_audio, f"Error: singer `{spk}` is unknown!"

    # Infer (CPU only)
    svs = _load_svs(model_tag, "cpu")
    output_dict = svs(batch, sids=np.array([spks[spk]]))
    wav_info = output_dict["wav"].cpu().numpy()
    return (fs, wav_info), "success!"
105
+
106
+
107
+ title = "Demo of Singing Voice Synthesis in Muskits-ESPnet"
108
+
109
# HTML shown under the page title: a one-line introduction followed by
# step-by-step usage instructions.
description = """
This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.


<p>How to use:</p>
<ol>
<li> Choose the language ID. Currently only <code>zh</code> (Chinese) is supported </li>
<li> Input the tempo as an integer </li>
<li> Input text, duration, pitch of equal length </li>
<li> Choose one singer </li>
<li> Click the submit button </li>
</ol>


"""
124
+
125
+ article = """
126
+ <div style='margin:20px auto;'>
127
+
128
+ <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
129
+ <a href="https://github.com/espnet/espnet">espnet GitHub</a> |
130
+ <a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>
131
+
132
+ <pre>
133
+ @inproceedings{wu2024muskits,
134
+ title = {{Muskits-ESPnet}: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm},
135
+ author = {Yuning Wu and Jiatong Shi and Yifeng Yu and Yuxun Tang and Tao Qian and Yueqian Lin and Jionghao Han and Xinyi Bai and Shinji Watanabe and Qin Jin},
136
+ booktitle={Proc. ACM Multimedia},
137
+ year={2024},
138
+ }
139
+ </pre>
140
+
141
+ </div>
142
+ """
143
+
144
+
145
+ # SP: silence, AP: aspirate.
146
+ examples = [
147
+ ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "60 62 62 62 0 62 58 0", "singer1 (man)"],
148
+ ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (man)"],
149
+ ["zh", 89, "雨淋湿了SP天空AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
150
+ ]
151
+
152
# Build and launch the demo UI. The input components are listed in the
# same order as gen_song's parameters (lang, tempo, texts, durs, pitchs, spk).
gr.Interface(
    fn=gen_song,
    inputs=[
        gr.Radio(label="language", choices=["zh"], value="zh"),
        gr.Textbox(label="Tempo"),
        gr.Textbox(label="Text"),
        gr.Textbox(label="Duration"),
        gr.Textbox(label="Pitch"),
        gr.Radio(
            label="Singer",
            # Derive the choices from `spks` so the UI can never offer a
            # singer that gen_song cannot map to a speaker id.
            choices=list(spks),
            value=next(iter(spks)),
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Song", type="numpy"),
        gr.Textbox(label="Running Status"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()
midi-note.scp ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ G9 127
2
+ F#9 126
3
+ Gb9 126
4
+ F9 125
5
+ E9 124
6
+ D#9 123
7
+ Eb9 123
8
+ D9 122
9
+ C#9 121
10
+ Db9 121
11
+ C9 120
12
+ B8 119
13
+ A#8 118
14
+ Bb8 118
15
+ A8 117
16
+ G#8 116
17
+ Ab8 116
18
+ G8 115
19
+ F#8 114
20
+ Gb8 114
21
+ F8 113
22
+ E8 112
23
+ D#8 111
24
+ Eb8 111
25
+ D8 110
26
+ C#8 109
27
+ Db8 109
28
+ C8 108
29
+ B7 107
30
+ A#7 106
31
+ Bb7 106
32
+ A7 105
33
+ G#7 104
34
+ Ab7 104
35
+ G7 103
36
+ F#7 102
37
+ Gb7 102
38
+ F7 101
39
+ E7 100
40
+ D#7 99
41
+ Eb7 99
42
+ D7 98
43
+ C#7 97
44
+ Db7 97
45
+ C7 96
46
+ B6 95
47
+ A#6 94
48
+ Bb6 94
49
+ A6 93
50
+ G#6 92
51
+ Ab6 92
52
+ G6 91
53
+ F#6 90
54
+ Gb6 90
55
+ F6 89
56
+ E6 88
57
+ D#6 87
58
+ Eb6 87
59
+ D6 86
60
+ C#6 85
61
+ Db6 85
62
+ C6 84
63
+ B5 83
64
+ A#5 82
65
+ Bb5 82
66
+ A5 81
67
+ G#5 80
68
+ Ab5 80
69
+ G5 79
70
+ F#5 78
71
+ Gb5 78
72
+ F5 77
73
+ E5 76
74
+ D#5 75
75
+ Eb5 75
76
+ D5 74
77
+ C#5 73
78
+ Db5 73
79
+ C5 72
80
+ B4 71
81
+ A#4 70
82
+ Bb4 70
83
+ A4 69
84
+ G#4 68
85
+ Ab4 68
86
+ G4 67
87
+ F#4 66
88
+ Gb4 66
89
+ F4 65
90
+ E4 64
91
+ D#4 63
92
+ Eb4 63
93
+ D4 62
94
+ C#4 61
95
+ Db4 61
96
+ C4 60
97
+ B3 59
98
+ A#3 58
99
+ Bb3 58
100
+ A3 57
101
+ G#3 56
102
+ Ab3 56
103
+ G3 55
104
+ F#3 54
105
+ Gb3 54
106
+ F3 53
107
+ E3 52
108
+ D#3 51
109
+ Eb3 51
110
+ D3 50
111
+ C#3 49
112
+ Db3 49
113
+ C3 48
114
+ B2 47
115
+ A#2 46
116
+ Bb2 46
117
+ A2 45
118
+ G#2 44
119
+ Ab2 44
120
+ G2 43
121
+ F#2 42
122
+ Gb2 42
123
+ F2 41
124
+ E2 40
125
+ D#2 39
126
+ Eb2 39
127
+ D2 38
128
+ C#2 37
129
+ Db2 37
130
+ C2 36
131
+ B1 35
132
+ A#1 34
133
+ Bb1 34
134
+ A1 33
135
+ G#1 32
136
+ Ab1 32
137
+ G1 31
138
+ F#1 30
139
+ Gb1 30
140
+ F1 29
141
+ E1 28
142
+ D#1 27
143
+ Eb1 27
144
+ D1 26
145
+ C#1 25
146
+ Db1 25
147
+ C1 24
148
+ B0 23
149
+ A#0 22
150
+ Bb0 22
151
+ A0 21
152
+ rest 0
pinyin_dict.py ADDED
@@ -0,0 +1,425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from Opencpop's pinyin to phoneme mapping table:
2
+ # https://wenet.org.cn/opencpop/resources/annotationformat/
3
+ PINYIN_DICT = {
4
+ "a": ("a",),
5
+ "ai": ("ai",),
6
+ "an": ("an",),
7
+ "ang": ("ang",),
8
+ "ao": ("ao",),
9
+ "ba": ("b", "a"),
10
+ "bai": ("b", "ai"),
11
+ "ban": ("b", "an"),
12
+ "bang": ("b", "ang"),
13
+ "bao": ("b", "ao"),
14
+ "bei": ("b", "ei"),
15
+ "ben": ("b", "en"),
16
+ "beng": ("b", "eng"),
17
+ "bi": ("b", "i"),
18
+ "bian": ("b", "ian"),
19
+ "biao": ("b", "iao"),
20
+ "bie": ("b", "ie"),
21
+ "bin": ("b", "in"),
22
+ "bing": ("b", "ing"),
23
+ "bo": ("b", "o"),
24
+ "bu": ("b", "u"),
25
+ "ca": ("c", "a"),
26
+ "cai": ("c", "ai"),
27
+ "can": ("c", "an"),
28
+ "cang": ("c", "ang"),
29
+ "cao": ("c", "ao"),
30
+ "ce": ("c", "e"),
31
+ "cei": ("c", "ei"),
32
+ "cen": ("c", "en"),
33
+ "ceng": ("c", "eng"),
34
+ "cha": ("ch", "a"),
35
+ "chai": ("ch", "ai"),
36
+ "chan": ("ch", "an"),
37
+ "chang": ("ch", "ang"),
38
+ "chao": ("ch", "ao"),
39
+ "che": ("ch", "e"),
40
+ "chen": ("ch", "en"),
41
+ "cheng": ("ch", "eng"),
42
+ "chi": ("ch", "i"),
43
+ "chong": ("ch", "ong"),
44
+ "chou": ("ch", "ou"),
45
+ "chu": ("ch", "u"),
46
+ "chua": ("ch", "ua"),
47
+ "chuai": ("ch", "uai"),
48
+ "chuan": ("ch", "uan"),
49
+ "chuang": ("ch", "uang"),
50
+ "chui": ("ch", "ui"),
51
+ "chun": ("ch", "un"),
52
+ "chuo": ("ch", "uo"),
53
+ "ci": ("c", "i"),
54
+ "cong": ("c", "ong"),
55
+ "cou": ("c", "ou"),
56
+ "cu": ("c", "u"),
57
+ "cuan": ("c", "uan"),
58
+ "cui": ("c", "ui"),
59
+ "cun": ("c", "un"),
60
+ "cuo": ("c", "uo"),
61
+ "da": ("d", "a"),
62
+ "dai": ("d", "ai"),
63
+ "dan": ("d", "an"),
64
+ "dang": ("d", "ang"),
65
+ "dao": ("d", "ao"),
66
+ "de": ("d", "e"),
67
+ "dei": ("d", "ei"),
68
+ "den": ("d", "en"),
69
+ "deng": ("d", "eng"),
70
+ "di": ("d", "i"),
71
+ "dia": ("d", "ia"),
72
+ "dian": ("d", "ian"),
73
+ "diao": ("d", "iao"),
74
+ "die": ("d", "ie"),
75
+ "ding": ("d", "ing"),
76
+ "diu": ("d", "iu"),
77
+ "dong": ("d", "ong"),
78
+ "dou": ("d", "ou"),
79
+ "du": ("d", "u"),
80
+ "duan": ("d", "uan"),
81
+ "dui": ("d", "ui"),
82
+ "dun": ("d", "un"),
83
+ "duo": ("d", "uo"),
84
+ "e": ("e",),
85
+ "ei": ("ei",),
86
+ "en": ("en",),
87
+ "eng": ("eng",),
88
+ "er": ("er",),
89
+ "fa": ("f", "a"),
90
+ "fan": ("f", "an"),
91
+ "fang": ("f", "ang"),
92
+ "fei": ("f", "ei"),
93
+ "fen": ("f", "en"),
94
+ "feng": ("f", "eng"),
95
+ "fo": ("f", "o"),
96
+ "fou": ("f", "ou"),
97
+ "fu": ("f", "u"),
98
+ "ga": ("g", "a"),
99
+ "gai": ("g", "ai"),
100
+ "gan": ("g", "an"),
101
+ "gang": ("g", "ang"),
102
+ "gao": ("g", "ao"),
103
+ "ge": ("g", "e"),
104
+ "gei": ("g", "ei"),
105
+ "gen": ("g", "en"),
106
+ "geng": ("g", "eng"),
107
+ "gong": ("g", "ong"),
108
+ "gou": ("g", "ou"),
109
+ "gu": ("g", "u"),
110
+ "gua": ("g", "ua"),
111
+ "guai": ("g", "uai"),
112
+ "guan": ("g", "uan"),
113
+ "guang": ("g", "uang"),
114
+ "gui": ("g", "ui"),
115
+ "gun": ("g", "un"),
116
+ "guo": ("g", "uo"),
117
+ "ha": ("h", "a"),
118
+ "hai": ("h", "ai"),
119
+ "han": ("h", "an"),
120
+ "hang": ("h", "ang"),
121
+ "hao": ("h", "ao"),
122
+ "he": ("h", "e"),
123
+ "hei": ("h", "ei"),
124
+ "hen": ("h", "en"),
125
+ "heng": ("h", "eng"),
126
+ "hm": ("h", "m"),
127
+ "hng": ("h", "ng"),
128
+ "hong": ("h", "ong"),
129
+ "hou": ("h", "ou"),
130
+ "hu": ("h", "u"),
131
+ "hua": ("h", "ua"),
132
+ "huai": ("h", "uai"),
133
+ "huan": ("h", "uan"),
134
+ "huang": ("h", "uang"),
135
+ "hui": ("h", "ui"),
136
+ "hun": ("h", "un"),
137
+ "huo": ("h", "uo"),
138
+ "ji": ("j", "i"),
139
+ "jia": ("j", "ia"),
140
+ "jian": ("j", "ian"),
141
+ "jiang": ("j", "iang"),
142
+ "jiao": ("j", "iao"),
143
+ "jie": ("j", "ie"),
144
+ "jin": ("j", "in"),
145
+ "jing": ("j", "ing"),
146
+ "jiong": ("j", "iong"),
147
+ "jiu": ("j", "iu"),
148
+ "ju": ("j", "v"),
149
+ "juan": ("j", "van"),
150
+ "jue": ("j", "ve"),
151
+ "jun": ("j", "vn"),
152
+ "ka": ("k", "a"),
153
+ "kai": ("k", "ai"),
154
+ "kan": ("k", "an"),
155
+ "kang": ("k", "ang"),
156
+ "kao": ("k", "ao"),
157
+ "ke": ("k", "e"),
158
+ "kei": ("k", "ei"),
159
+ "ken": ("k", "en"),
160
+ "keng": ("k", "eng"),
161
+ "kong": ("k", "ong"),
162
+ "kou": ("k", "ou"),
163
+ "ku": ("k", "u"),
164
+ "kua": ("k", "ua"),
165
+ "kuai": ("k", "uai"),
166
+ "kuan": ("k", "uan"),
167
+ "kuang": ("k", "uang"),
168
+ "kui": ("k", "ui"),
169
+ "kun": ("k", "un"),
170
+ "kuo": ("k", "uo"),
171
+ "la": ("l", "a"),
172
+ "lai": ("l", "ai"),
173
+ "lan": ("l", "an"),
174
+ "lang": ("l", "ang"),
175
+ "lao": ("l", "ao"),
176
+ "le": ("l", "e"),
177
+ "lei": ("l", "ei"),
178
+ "leng": ("l", "eng"),
179
+ "li": ("l", "i"),
180
+ "lia": ("l", "ia"),
181
+ "lian": ("l", "ian"),
182
+ "liang": ("l", "iang"),
183
+ "liao": ("l", "iao"),
184
+ "lie": ("l", "ie"),
185
+ "lin": ("l", "in"),
186
+ "ling": ("l", "ing"),
187
+ "liu": ("l", "iu"),
188
+ "lo": ("l", "o"),
189
+ "long": ("l", "ong"),
190
+ "lou": ("l", "ou"),
191
+ "lu": ("l", "u"),
192
+ "luan": ("l", "uan"),
193
+ "lun": ("l", "un"),
194
+ "luo": ("l", "uo"),
195
+ "lv": ("l", "v"),
196
+ "lve": ("l", "ve"),
197
+ "m": ("m",),
198
+ "ma": ("m", "a"),
199
+ "mai": ("m", "ai"),
200
+ "man": ("m", "an"),
201
+ "mang": ("m", "ang"),
202
+ "mao": ("m", "ao"),
203
+ "me": ("m", "e"),
204
+ "mei": ("m", "ei"),
205
+ "men": ("m", "en"),
206
+ "meng": ("m", "eng"),
207
+ "mi": ("m", "i"),
208
+ "mian": ("m", "ian"),
209
+ "miao": ("m", "iao"),
210
+ "mie": ("m", "ie"),
211
+ "min": ("m", "in"),
212
+ "ming": ("m", "ing"),
213
+ "miu": ("m", "iu"),
214
+ "mo": ("m", "o"),
215
+ "mou": ("m", "ou"),
216
+ "mu": ("m", "u"),
217
+ "n": ("n",),
218
+ "na": ("n", "a"),
219
+ "nai": ("n", "ai"),
220
+ "nan": ("n", "an"),
221
+ "nang": ("n", "ang"),
222
+ "nao": ("n", "ao"),
223
+ "ne": ("n", "e"),
224
+ "nei": ("n", "ei"),
225
+ "nen": ("n", "en"),
226
+ "neng": ("n", "eng"),
227
+ "ng": ("n", "g"),
228
+ "ni": ("n", "i"),
229
+ "nian": ("n", "ian"),
230
+ "niang": ("n", "iang"),
231
+ "niao": ("n", "iao"),
232
+ "nie": ("n", "ie"),
233
+ "nin": ("n", "in"),
234
+ "ning": ("n", "ing"),
235
+ "niu": ("n", "iu"),
236
+ "nong": ("n", "ong"),
237
+ "nou": ("n", "ou"),
238
+ "nu": ("n", "u"),
239
+ "nuan": ("n", "uan"),
240
+ "nun": ("n", "un"),
241
+ "nuo": ("n", "uo"),
242
+ "nv": ("n", "v"),
243
+ "nve": ("n", "ve"),
244
+ "o": ("o",),
245
+ "ou": ("ou",),
246
+ "pa": ("p", "a"),
247
+ "pai": ("p", "ai"),
248
+ "pan": ("p", "an"),
249
+ "pang": ("p", "ang"),
250
+ "pao": ("p", "ao"),
251
+ "pei": ("p", "ei"),
252
+ "pen": ("p", "en"),
253
+ "peng": ("p", "eng"),
254
+ "pi": ("p", "i"),
255
+ "pian": ("p", "ian"),
256
+ "piao": ("p", "iao"),
257
+ "pie": ("p", "ie"),
258
+ "pin": ("p", "in"),
259
+ "ping": ("p", "ing"),
260
+ "po": ("p", "o"),
261
+ "pou": ("p", "ou"),
262
+ "pu": ("p", "u"),
263
+ "qi": ("q", "i"),
264
+ "qia": ("q", "ia"),
265
+ "qian": ("q", "ian"),
266
+ "qiang": ("q", "iang"),
267
+ "qiao": ("q", "iao"),
268
+ "qie": ("q", "ie"),
269
+ "qin": ("q", "in"),
270
+ "qing": ("q", "ing"),
271
+ "qiong": ("q", "iong"),
272
+ "qiu": ("q", "iu"),
273
+ "qu": ("q", "v"),
274
+ "quan": ("q", "van"),
275
+ "que": ("q", "ve"),
276
+ "qun": ("q", "vn"),
277
+ "ran": ("r", "an"),
278
+ "rang": ("r", "ang"),
279
+ "rao": ("r", "ao"),
280
+ "re": ("r", "e"),
281
+ "ren": ("r", "en"),
282
+ "reng": ("r", "eng"),
283
+ "ri": ("r", "i"),
284
+ "rong": ("r", "ong"),
285
+ "rou": ("r", "ou"),
286
+ "ru": ("r", "u"),
287
+ "rua": ("r", "ua"),
288
+ "ruan": ("r", "uan"),
289
+ "rui": ("r", "ui"),
290
+ "run": ("r", "un"),
291
+ "ruo": ("r", "uo"),
292
+ "sa": ("s", "a"),
293
+ "sai": ("s", "ai"),
294
+ "san": ("s", "an"),
295
+ "sang": ("s", "ang"),
296
+ "sao": ("s", "ao"),
297
+ "se": ("s", "e"),
298
+ "sen": ("s", "en"),
299
+ "seng": ("s", "eng"),
300
+ "sha": ("sh", "a"),
301
+ "shai": ("sh", "ai"),
302
+ "shan": ("sh", "an"),
303
+ "shang": ("sh", "ang"),
304
+ "shao": ("sh", "ao"),
305
+ "she": ("sh", "e"),
306
+ "shei": ("sh", "ei"),
307
+ "shen": ("sh", "en"),
308
+ "sheng": ("sh", "eng"),
309
+ "shi": ("sh", "i"),
310
+ "shou": ("sh", "ou"),
311
+ "shu": ("sh", "u"),
312
+ "shua": ("sh", "ua"),
313
+ "shuai": ("sh", "uai"),
314
+ "shuan": ("sh", "uan"),
315
+ "shuang": ("sh", "uang"),
316
+ "shui": ("sh", "ui"),
317
+ "shun": ("sh", "un"),
318
+ "shuo": ("sh", "uo"),
319
+ "si": ("s", "i"),
320
+ "song": ("s", "ong"),
321
+ "sou": ("s", "ou"),
322
+ "su": ("s", "u"),
323
+ "suan": ("s", "uan"),
324
+ "sui": ("s", "ui"),
325
+ "sun": ("s", "un"),
326
+ "suo": ("s", "uo"),
327
+ "ta": ("t", "a"),
328
+ "tai": ("t", "ai"),
329
+ "tan": ("t", "an"),
330
+ "tang": ("t", "ang"),
331
+ "tao": ("t", "ao"),
332
+ "te": ("t", "e"),
333
+ "tei": ("t", "ei"),
334
+ "teng": ("t", "eng"),
335
+ "ti": ("t", "i"),
336
+ "tian": ("t", "ian"),
337
+ "tiao": ("t", "iao"),
338
+ "tie": ("t", "ie"),
339
+ "ting": ("t", "ing"),
340
+ "tong": ("t", "ong"),
341
+ "tou": ("t", "ou"),
342
+ "tu": ("t", "u"),
343
+ "tuan": ("t", "uan"),
344
+ "tui": ("t", "ui"),
345
+ "tun": ("t", "un"),
346
+ "tuo": ("t", "uo"),
347
+ "wa": ("w", "a"),
348
+ "wai": ("w", "ai"),
349
+ "wan": ("w", "an"),
350
+ "wang": ("w", "ang"),
351
+ "wei": ("w", "ei"),
352
+ "wen": ("w", "en"),
353
+ "weng": ("w", "eng"),
354
+ "wo": ("w", "o"),
355
+ "wu": ("w", "u"),
356
+ "xi": ("x", "i"),
357
+ "xia": ("x", "ia"),
358
+ "xian": ("x", "ian"),
359
+ "xiang": ("x", "iang"),
360
+ "xiao": ("x", "iao"),
361
+ "xie": ("x", "ie"),
362
+ "xin": ("x", "in"),
363
+ "xing": ("x", "ing"),
364
+ "xiong": ("x", "iong"),
365
+ "xiu": ("x", "iu"),
366
+ "xu": ("x", "v"),
367
+ "xuan": ("x", "van"),
368
+ "xue": ("x", "ve"),
369
+ "xun": ("x", "vn"),
370
+ "ya": ("y", "a"),
371
+ "yan": ("y", "an"),
372
+ "yang": ("y", "ang"),
373
+ "yao": ("y", "ao"),
374
+ "ye": ("y", "e"),
375
+ "yi": ("y", "i"),
376
+ "yin": ("y", "in"),
377
+ "ying": ("y", "ing"),
378
+ "yo": ("y", "o"),
379
+ "yong": ("y", "ong"),
380
+ "you": ("y", "ou"),
381
+ "yu": ("y", "v"),
382
+ "yuan": ("y", "van"),
383
+ "yue": ("y", "ve"),
384
+ "yun": ("y", "vn"),
385
+ "za": ("z", "a"),
386
+ "zai": ("z", "ai"),
387
+ "zan": ("z", "an"),
388
+ "zang": ("z", "ang"),
389
+ "zao": ("z", "ao"),
390
+ "ze": ("z", "e"),
391
+ "zei": ("z", "ei"),
392
+ "zen": ("z", "en"),
393
+ "zeng": ("z", "eng"),
394
+ "zha": ("zh", "a"),
395
+ "zhai": ("zh", "ai"),
396
+ "zhan": ("zh", "an"),
397
+ "zhang": ("zh", "ang"),
398
+ "zhao": ("zh", "ao"),
399
+ "zhe": ("zh", "e"),
400
+ "zhei": ("zh", "ei"),
401
+ "zhen": ("zh", "en"),
402
+ "zheng": ("zh", "eng"),
403
+ "zhi": ("zh", "i"),
404
+ "zhong": ("zh", "ong"),
405
+ "zhou": ("zh", "ou"),
406
+ "zhu": ("zh", "u"),
407
+ "zhua": ("zh", "ua"),
408
+ "zhuai": ("zh", "uai"),
409
+ "zhuan": ("zh", "uan"),
410
+ "zhuang": ("zh", "uang"),
411
+ "zhui": ("zh", "ui"),
412
+ "zhun": ("zh", "un"),
413
+ "zhuo": ("zh", "uo"),
414
+ "zi": ("z", "i"),
415
+ "zong": ("z", "ong"),
416
+ "zou": ("z", "ou"),
417
+ "zu": ("z", "u"),
418
+ "zuan": ("z", "uan"),
419
+ "zui": ("z", "ui"),
420
+ "zun": ("z", "un"),
421
+ "zuo": ("z", "uo"),
422
+ "sp": ("SP",),
423
+ "ap": ("AP",),
424
+ }
425
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/South-Twilight/espnet
2
+ torch
3
+ numpy
4
+ librosa
5
+ espnet_model_zoo
6
+ importlib
7
+ pathlib
8
+ pypinyin
util.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+
2
def split_pinyin(pinyin: str, pinyin_dict: dict) -> tuple[str, ...]:
    """Look up the phoneme sequence for one pinyin syllable.

    The syllable is lower-cased before the lookup, so matching is
    case-insensitive as long as the keys of ``pinyin_dict`` are lower-case.

    Args:
        pinyin: Pinyin syllable to split.
        pinyin_dict: Mapping from pinyin syllable to its phonemes. It is
            supplied by the caller; nothing is loaded from disk here.

    Returns:
        The value stored in ``pinyin_dict`` for the lower-cased syllable.

    Raises:
        KeyError: If the lower-cased syllable is not a key of ``pinyin_dict``.
    """
    return pinyin_dict[pinyin.lower()]