tobiccino committed
Commit 8c70653 · 1 Parent(s): 7e48c52

update ui tacotron

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. TTS/.DS_Store +0 -0
  3. TTS/.models.json +515 -0
  4. TTS/VERSION +1 -0
  5. TTS/__init__.py +6 -0
  6. TTS/__pycache__/__init__.cpython-310.pyc +0 -0
  7. TTS/__pycache__/__init__.cpython-37.pyc +0 -0
  8. TTS/__pycache__/__init__.cpython-38.pyc +0 -0
  9. TTS/__pycache__/__init__.cpython-39.pyc +0 -0
  10. TTS/__pycache__/model.cpython-310.pyc +0 -0
  11. TTS/__pycache__/model.cpython-37.pyc +0 -0
  12. TTS/__pycache__/model.cpython-38.pyc +0 -0
  13. TTS/__pycache__/model.cpython-39.pyc +0 -0
  14. TTS/bin/.ipynb_checkpoints/find_unique_chars-checkpoint.py +45 -0
  15. TTS/bin/__init__.py +0 -0
  16. TTS/bin/collect_env_info.py +48 -0
  17. TTS/bin/compute_attention_masks.py +165 -0
  18. TTS/bin/compute_embeddings.py +131 -0
  19. TTS/bin/compute_statistics.py +98 -0
  20. TTS/bin/eval_encoder.py +89 -0
  21. TTS/bin/extract_tts_spectrograms.py +287 -0
  22. TTS/bin/find_unique_chars.py +48 -0
  23. TTS/bin/find_unique_phonemes.py +78 -0
  24. TTS/bin/remove_silence_using_vad.py +93 -0
  25. TTS/bin/resample.py +87 -0
  26. TTS/bin/synthesize.py +374 -0
  27. TTS/bin/train_encoder.py +319 -0
  28. TTS/bin/train_tts.py +75 -0
  29. TTS/bin/train_vocoder.py +81 -0
  30. TTS/bin/tune_wavegrad.py +103 -0
  31. TTS/config/.ipynb_checkpoints/__init__-checkpoint.py +132 -0
  32. TTS/config/.ipynb_checkpoints/config-checkpoint.json +20 -0
  33. TTS/config/.ipynb_checkpoints/shared_configs-checkpoint.py +264 -0
  34. TTS/config/__init__.py +132 -0
  35. TTS/config/__pycache__/__init__.cpython-310.pyc +0 -0
  36. TTS/config/__pycache__/__init__.cpython-37.pyc +0 -0
  37. TTS/config/__pycache__/__init__.cpython-38.pyc +0 -0
  38. TTS/config/__pycache__/__init__.cpython-39.pyc +0 -0
  39. TTS/config/__pycache__/shared_configs.cpython-310.pyc +0 -0
  40. TTS/config/__pycache__/shared_configs.cpython-37.pyc +0 -0
  41. TTS/config/__pycache__/shared_configs.cpython-38.pyc +0 -0
  42. TTS/config/__pycache__/shared_configs.cpython-39.pyc +0 -0
  43. TTS/config/config.json +25 -0
  44. TTS/config/shared_configs.py +264 -0
  45. TTS/encoder/README.md +18 -0
  46. TTS/encoder/__init__.py +0 -0
  47. TTS/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
  48. TTS/encoder/__pycache__/__init__.cpython-37.pyc +0 -0
  49. TTS/encoder/__pycache__/__init__.cpython-38.pyc +0 -0
  50. TTS/encoder/__pycache__/__init__.cpython-39.pyc +0 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
TTS/.DS_Store ADDED
Binary file (6.15 kB).
 
TTS/.models.json ADDED
@@ -0,0 +1,515 @@
1
+ {
2
+ "tts_models": {
3
+ "multilingual":{
4
+ "multi-dataset":{
5
+ "your_tts":{
6
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
7
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
8
+ "default_vocoder": null,
9
+ "commit": "e9a1953e",
10
+ "license": "CC BY-NC-ND 4.0",
11
+ "contact": "[email protected]"
12
+ }
13
+ }
14
+ },
15
+ "en": {
16
+ "ek1": {
17
+ "tacotron2": {
18
+ "description": "EK1 en-rp tacotron2 by NMStoker",
19
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
20
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
21
+ "commit": "c802255",
22
+ "license": "apache 2.0"
23
+ }
24
+ },
25
+ "ljspeech": {
26
+ "tacotron2-DDC": {
27
+ "description": "Tacotron2 with Double Decoder Consistency.",
28
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
29
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
30
+ "commit": "bae2ad0f",
31
+ "author": "Eren Gölge @erogol",
32
+ "license": "apache 2.0",
33
+ "contact": "[email protected]"
34
+ },
35
+ "tacotron2-DDC_ph": {
36
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
37
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
38
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
39
+ "commit": "3900448",
40
+ "author": "Eren Gölge @erogol",
41
+ "license": "apache 2.0",
42
+ "contact": "[email protected]"
43
+ },
44
+ "glow-tts": {
45
+ "description": "",
46
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
47
+ "stats_file": null,
48
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
49
+ "commit": "",
50
+ "author": "Eren Gölge @erogol",
51
+ "license": "MPL",
52
+ "contact": "[email protected]"
53
+ },
54
+ "speedy-speech": {
55
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
56
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
57
+ "stats_file": null,
58
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
59
+ "commit": "4581e3d",
60
+ "author": "Eren Gölge @erogol",
61
+ "license": "apache 2.0",
62
+ "contact": "[email protected]"
63
+ },
64
+ "tacotron2-DCA": {
65
+ "description": "",
66
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
67
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
68
+ "commit": "",
69
+ "author": "Eren Gölge @erogol",
70
+ "license": "MPL",
71
+ "contact": "[email protected]"
72
+ },
73
+ "vits": {
74
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
75
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
76
+ "default_vocoder": null,
77
+ "commit": "3900448",
78
+ "author": "Eren Gölge @erogol",
79
+ "license": "apache 2.0",
80
+ "contact": "[email protected]"
81
+ },
82
+ "fast_pitch": {
83
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
84
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
85
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
86
+ "commit": "b27b3ba",
87
+ "author": "Eren Gölge @erogol",
88
+ "license": "apache 2.0",
89
+ "contact": "[email protected]"
90
+ }
91
+ },
92
+ "vctk": {
93
+ "vits": {
94
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
95
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
96
+ "default_vocoder": null,
97
+ "commit": "3900448",
98
+ "author": "Eren @erogol",
99
+ "license": "apache 2.0",
100
+ "contact": "[email protected]"
101
+ },
102
+ "fast_pitch":{
103
+ "description": "FastPitch model trained on VCTK dataseset.",
104
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
105
+ "default_vocoder": null,
106
+ "commit": "bdab788d",
107
+ "author": "Eren @erogol",
108
+ "license": "CC BY-NC-ND 4.0",
109
+ "contact": "[email protected]"
110
+ }
111
+ },
112
+ "sam": {
113
+ "tacotron-DDC": {
114
+ "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
115
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
116
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
117
+ "commit": "bae2ad0f",
118
+ "author": "Eren Gölge @erogol",
119
+ "license": "apache 2.0",
120
+ "contact": "[email protected]"
121
+ }
122
+ },
123
+ "blizzard2013": {
124
+ "capacitron-t2-c50": {
125
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
126
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
127
+ "commit": "d6284e7",
128
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
129
+ "author": "Adam Froghyar @a-froghyar",
130
+ "license": "apache 2.0",
131
+ "contact": "[email protected]"
132
+ },
133
+ "capacitron-t2-c150_v2": {
134
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
135
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
136
+ "commit": "a67039d",
137
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
138
+ "author": "Adam Froghyar @a-froghyar",
139
+ "license": "apache 2.0",
140
+ "contact": "[email protected]"
141
+ }
142
+ }
143
+ },
144
+ "es": {
145
+ "mai": {
146
+ "tacotron2-DDC": {
147
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
148
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
149
+ "commit": "",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "MPL",
152
+ "contact": "[email protected]"
153
+ }
154
+ }
155
+ },
156
+ "fr": {
157
+ "mai": {
158
+ "tacotron2-DDC": {
159
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
160
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
161
+ "commit": "",
162
+ "author": "Eren Gölge @erogol",
163
+ "license": "MPL",
164
+ "contact": "[email protected]"
165
+ }
166
+ }
167
+ },
168
+ "uk":{
169
+ "mai": {
170
+ "glow-tts": {
171
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
172
+ "author":"@robinhad",
173
+ "commit": "bdab788d",
174
+ "license": "MIT",
175
+ "contact": "",
176
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
177
+ }
178
+ }
179
+ },
180
+ "zh-CN": {
181
+ "baker": {
182
+ "tacotron2-DDC-GST": {
183
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
184
+ "commit": "unknown",
185
+ "author": "@kirianguiller",
186
+ "license": "apache 2.0",
187
+ "default_vocoder": null
188
+ }
189
+ }
190
+ },
191
+ "nl": {
192
+ "mai": {
193
+ "tacotron2-DDC": {
194
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
195
+ "author": "@r-dh",
196
+ "license": "apache 2.0",
197
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
198
+ "stats_file": null,
199
+ "commit": "540d811"
200
+ }
201
+ }
202
+ },
203
+ "de": {
204
+ "thorsten": {
205
+ "tacotron2-DCA": {
206
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
207
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
208
+ "author": "@thorstenMueller",
209
+ "license": "apache 2.0",
210
+ "commit": "unknown"
211
+ },
212
+ "vits": {
213
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
214
+ "default_vocoder": null,
215
+ "author": "@thorstenMueller",
216
+ "license": "apache 2.0",
217
+ "commit": "unknown"
218
+ },
219
+ "tacotron2-DDC": {
220
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
221
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
222
+ "description": "Thorsten-Dec2021-22k-DDC",
223
+ "author": "@thorstenMueller",
224
+ "license": "apache 2.0",
225
+ "commit": "unknown"
226
+ }
227
+ }
228
+ },
229
+ "ja": {
230
+ "kokoro": {
231
+ "tacotron2-DDC": {
232
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
233
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
234
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
235
+ "author": "@kaiidams",
236
+ "license": "apache 2.0",
237
+ "commit": "401fbd89"
238
+ }
239
+ }
240
+ },
241
+ "tr":{
242
+ "common-voice": {
243
+ "glow-tts":{
244
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
245
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
246
+ "license": "MIT",
247
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
248
+ "author": "Fatih Akademi",
249
+ "commit": null
250
+ }
251
+ }
252
+ },
253
+ "it": {
254
+ "mai_female": {
255
+ "glow-tts":{
256
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
257
+ "default_vocoder": null,
258
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
259
+ "author": "@nicolalandro",
260
+ "license": "apache 2.0",
261
+ "commit": null
262
+ },
263
+ "vits":{
264
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
265
+ "default_vocoder": null,
266
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
267
+ "author": "@nicolalandro",
268
+ "license": "apache 2.0",
269
+ "commit": null
270
+ }
271
+ },
272
+ "mai_male": {
273
+ "glow-tts":{
274
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
275
+ "default_vocoder": null,
276
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
277
+ "author": "@nicolalandro",
278
+ "license": "apache 2.0",
279
+ "commit": null
280
+ },
281
+ "vits":{
282
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
283
+ "default_vocoder": null,
284
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
285
+ "author": "@nicolalandro",
286
+ "license": "apache 2.0",
287
+ "commit": null
288
+ }
289
+ }
290
+ },
291
+ "ewe": {
292
+ "openbible": {
293
+ "vits":{
294
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
295
+ "default_vocoder": null,
296
+ "license": "CC-BY-SA 4.0",
297
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
298
+ "author": "@coqui_ai",
299
+ "commit": "1b22f03"
300
+ }
301
+ }
302
+ },
303
+ "hau": {
304
+ "openbible": {
305
+ "vits":{
306
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
307
+ "default_vocoder": null,
308
+ "license": "CC-BY-SA 4.0",
309
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
310
+ "author": "@coqui_ai",
311
+ "commit": "1b22f03"
312
+ }
313
+ }
314
+ },
315
+ "lin": {
316
+ "openbible": {
317
+ "vits":{
318
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
319
+ "default_vocoder": null,
320
+ "license": "CC-BY-SA 4.0",
321
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
322
+ "author": "@coqui_ai",
323
+ "commit": "1b22f03"
324
+ }
325
+ }
326
+ },
327
+ "tw_akuapem": {
328
+ "openbible": {
329
+ "vits":{
330
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
331
+ "default_vocoder": null,
332
+ "license": "CC-BY-SA 4.0",
333
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
334
+ "author": "@coqui_ai",
335
+ "commit": "1b22f03"
336
+ }
337
+ }
338
+ },
339
+ "tw_asante": {
340
+ "openbible": {
341
+ "vits":{
342
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
343
+ "default_vocoder": null,
344
+ "license": "CC-BY-SA 4.0",
345
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
346
+ "author": "@coqui_ai",
347
+ "commit": "1b22f03"
348
+ }
349
+ }
350
+ },
351
+ "yor": {
352
+ "openbible": {
353
+ "vits":{
354
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
355
+ "default_vocoder": null,
356
+ "license": "CC-BY-SA 4.0",
357
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
358
+ "author": "@coqui_ai",
359
+ "commit": "1b22f03"
360
+ }
361
+ }
362
+ }
363
+ },
364
+ "vocoder_models": {
365
+ "universal": {
366
+ "libri-tts": {
367
+ "wavegrad": {
368
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
369
+ "commit": "ea976b0",
370
+ "author": "Eren Gölge @erogol",
371
+ "license": "MPL",
372
+ "contact": "[email protected]"
373
+ },
374
+ "fullband-melgan": {
375
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
376
+ "commit": "4132240",
377
+ "author": "Eren Gölge @erogol",
378
+ "license": "MPL",
379
+ "contact": "[email protected]"
380
+ }
381
+ }
382
+ },
383
+ "en": {
384
+ "ek1": {
385
+ "wavegrad": {
386
+ "description": "EK1 en-rp wavegrad by NMStoker",
387
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
388
+ "commit": "c802255",
389
+ "license": "apache 2.0"
390
+ }
391
+ },
392
+ "ljspeech": {
393
+ "multiband-melgan": {
394
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
395
+ "commit": "ea976b0",
396
+ "author": "Eren Gölge @erogol",
397
+ "license": "MPL",
398
+ "contact": "[email protected]"
399
+ },
400
+ "hifigan_v2": {
401
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
402
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
403
+ "commit": "bae2ad0f",
404
+ "author": "@erogol",
405
+ "license": "apache 2.0",
406
+ "contact": "[email protected]"
407
+ },
408
+ "univnet": {
409
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
410
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
411
+ "commit": "4581e3d",
412
+ "author": "Eren @erogol",
413
+ "license": "apache 2.0",
414
+ "contact": "[email protected]"
415
+ }
416
+ },
417
+ "blizzard2013": {
418
+ "hifigan_v2": {
419
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
420
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
421
+ "commit": "d6284e7",
422
+ "author": "Adam Froghyar @a-froghyar",
423
+ "license": "apache 2.0",
424
+ "contact": "[email protected]"
425
+ }
426
+ },
427
+ "vctk": {
428
+ "hifigan_v2": {
429
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
430
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
431
+ "commit": "2f07160",
432
+ "author": "Edresson Casanova",
433
+ "license": "apache 2.0",
434
+ "contact": ""
435
+ }
436
+ },
437
+ "sam": {
438
+ "hifigan_v2": {
439
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
440
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
441
+ "commit": "2f07160",
442
+ "author": "Eren Gölge @erogol",
443
+ "license": "apache 2.0",
444
+ "contact": "[email protected]"
445
+ }
446
+ }
447
+ },
448
+ "nl": {
449
+ "mai": {
450
+ "parallel-wavegan": {
451
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
452
+ "author": "@r-dh",
453
+ "license": "apache 2.0",
454
+ "commit": "unknown"
455
+ }
456
+ }
457
+ },
458
+ "de": {
459
+ "thorsten": {
460
+ "wavegrad": {
461
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
462
+ "author": "@thorstenMueller",
463
+ "license": "apache 2.0",
464
+ "commit": "unknown"
465
+ },
466
+ "fullband-melgan": {
467
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
468
+ "author": "@thorstenMueller",
469
+ "license": "apache 2.0",
470
+ "commit": "unknown"
471
+ },
472
+ "hifigan_v1": {
473
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
474
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
475
+ "author": "@thorstenMueller",
476
+ "license": "apache 2.0",
477
+ "commit": "unknown"
478
+ }
479
+ }
480
+ },
481
+ "ja": {
482
+ "kokoro": {
483
+ "hifigan_v1": {
484
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
485
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
486
+ "author": "@kaiidams",
487
+ "license": "apache 2.0",
488
+ "commit": "3900448"
489
+ }
490
+ }
491
+ },
492
+ "uk": {
493
+ "mai": {
494
+ "multiband-melgan": {
495
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
496
+ "author":"@robinhad",
497
+ "commit": "bdab788d",
498
+ "license": "MIT",
499
+ "contact": ""
500
+ }
501
+ }
502
+ },
503
+ "tr":{
504
+ "common-voice": {
505
+ "hifigan":{
506
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
507
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
508
+ "author": "Fatih Akademi",
509
+ "license": "MIT",
510
+ "commit": null
511
+ }
512
+ }
513
+ }
514
+ }
515
+ }
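The registry above nests entries as model type → language → dataset → model name, and the same four segments appear `--`-joined in each `github_rls_url`. Below is a minimal illustrative sketch (not part of this commit; the file path and traversal are assumptions) that flattens the nesting into the `type/language/dataset/name` identifiers that Coqui-style tooling such as the bundled TTS/bin/synthesize.py typically expects as a model name:

import json

# Illustrative only: walk the registry added above and print fully qualified model names.
with open("TTS/.models.json", "r", encoding="utf-8") as f:
    registry = json.load(f)

for model_type, languages in registry.items():          # "tts_models", "vocoder_models"
    for language, datasets in languages.items():        # "en", "de", "multilingual", ...
        for dataset, models in datasets.items():        # "ljspeech", "thorsten", ...
            for model_name, meta in models.items():     # "tacotron2-DDC", "vits", ...
                print(f"{model_type}/{language}/{dataset}/{model_name}", "->", meta.get("default_vocoder"))

Each leaf also records its `default_vocoder`, which is how a spectrogram model such as tts_models/en/ljspeech/tacotron2-DDC gets paired with vocoder_models/en/ljspeech/hifigan_v2 in the registry above.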
TTS/VERSION ADDED
@@ -0,0 +1 @@
+ 0.8.0
TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import os
+
+ with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
+     version = f.read().strip()
+
+ __version__ = version
TTS/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (355 Bytes).
 
TTS/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (290 Bytes).
 
TTS/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (321 Bytes).
 
TTS/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (364 Bytes).
 
TTS/__pycache__/model.cpython-310.pyc ADDED
Binary file (2.58 kB).
 
TTS/__pycache__/model.cpython-37.pyc ADDED
Binary file (2.51 kB).
 
TTS/__pycache__/model.cpython-38.pyc ADDED
Binary file (2.57 kB).
 
TTS/__pycache__/model.cpython-39.pyc ADDED
Binary file (2.59 kB).
 
TTS/bin/.ipynb_checkpoints/find_unique_chars-checkpoint.py ADDED
@@ -0,0 +1,45 @@
+ """Find all the unique characters in a dataset"""
+ import argparse
+ from argparse import RawTextHelpFormatter
+
+ from TTS.config import load_config
+ from TTS.tts.datasets import load_tts_samples
+
+
+ def main():
+     # pylint: disable=bad-option-value
+     parser = argparse.ArgumentParser(
+         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+         """
+     Example runs:
+
+     python TTS/bin/find_unique_chars.py --config_path config.json
+     """,
+         formatter_class=RawTextHelpFormatter,
+     )
+     parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+     args = parser.parse_args()
+
+     c = load_config(args.config_path)
+
+     # load all datasets
+     train_items, eval_items = load_tts_samples(
+         c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+     )
+
+     items = train_items + eval_items
+
+     texts = "".join(item["text"] for item in items)
+     chars = set(texts)
+     lower_chars = filter(lambda c: c.islower(), chars)
+     chars_force_lower = [c.lower() for c in chars]
+     chars_force_lower = set(chars_force_lower)
+
+     print(f" > Number of unique characters: {len(chars)}")
+     print(f" > Unique characters: {''.join(sorted(chars))}")
+     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+     print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+
+
+ if __name__ == "__main__":
+     main()
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,48 @@
+ """Get detailed info about the working environment."""
+ import os
+ import platform
+ import sys
+
+ import numpy
+ import torch
+
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
+ import json
+
+ import TTS
+
+
+ def system_info():
+     return {
+         "OS": platform.system(),
+         "architecture": platform.architecture(),
+         "version": platform.version(),
+         "processor": platform.processor(),
+         "python": platform.python_version(),
+     }
+
+
+ def cuda_info():
+     return {
+         "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
+         "available": torch.cuda.is_available(),
+         "version": torch.version.cuda,
+     }
+
+
+ def package_info():
+     return {
+         "numpy": numpy.__version__,
+         "PyTorch_version": torch.__version__,
+         "PyTorch_debug": torch.version.debug,
+         "TTS": TTS.__version__,
+     }
+
+
+ def main():
+     details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
+     print(json.dumps(details, indent=4, sort_keys=True))
+
+
+ if __name__ == "__main__":
+     main()
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
+ import argparse
+ import importlib
+ import os
+ from argparse import RawTextHelpFormatter
+
+ import numpy as np
+ import torch
+ from torch.utils.data import DataLoader
+ from tqdm import tqdm
+
+ from TTS.config import load_config
+ from TTS.tts.datasets.TTSDataset import TTSDataset
+ from TTS.tts.models import setup_model
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
+ from TTS.utils.audio import AudioProcessor
+ from TTS.utils.io import load_checkpoint
+
+ if __name__ == "__main__":
+     # pylint: disable=bad-option-value
+     parser = argparse.ArgumentParser(
+         description="""Extract attention masks from trained Tacotron/Tacotron2 models.
+ These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
+         """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
+ (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
+         """
+ Example run:
+     CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
+         --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
+         --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
+         --dataset_metafile metadata.csv
+         --data_path /root/LJSpeech-1.1/
+         --batch_size 32
+         --dataset ljspeech
+         --use_cuda True
+ """,
+         formatter_class=RawTextHelpFormatter,
+     )
+     parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
+     parser.add_argument(
+         "--config_path",
+         type=str,
+         required=True,
+         help="Path to Tacotron/Tacotron2 config file.",
+     )
+     parser.add_argument(
+         "--dataset",
+         type=str,
+         default="",
+         required=True,
+         help="Target dataset processor name from TTS.tts.dataset.preprocess.",
+     )
+
+     parser.add_argument(
+         "--dataset_metafile",
+         type=str,
+         default="",
+         required=True,
+         help="Dataset metafile inclusing file paths with transcripts.",
+     )
+     parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
+     parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+
+     parser.add_argument(
+         "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
+     )
+     args = parser.parse_args()
+
+     C = load_config(args.config_path)
+     ap = AudioProcessor(**C.audio)
+
+     # if the vocabulary was passed, replace the default
+     if "characters" in C.keys():
+         symbols, phonemes = make_symbols(**C.characters)
+
+     # load the model
+     num_chars = len(phonemes) if C.use_phonemes else len(symbols)
+     # TODO: handle multi-speaker
+     model = setup_model(C)
+     model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+
+     # data loader
+     preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
+     preprocessor = getattr(preprocessor, args.dataset)
+     meta_data = preprocessor(args.data_path, args.dataset_metafile)
+     dataset = TTSDataset(
+         model.decoder.r,
+         C.text_cleaner,
+         compute_linear_spec=False,
+         ap=ap,
+         meta_data=meta_data,
+         characters=C.characters if "characters" in C.keys() else None,
+         add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
+         use_phonemes=C.use_phonemes,
+         phoneme_cache_path=C.phoneme_cache_path,
+         phoneme_language=C.phoneme_language,
+         enable_eos_bos=C.enable_eos_bos_chars,
+     )
+
+     dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
+     loader = DataLoader(
+         dataset,
+         batch_size=args.batch_size,
+         num_workers=4,
+         collate_fn=dataset.collate_fn,
+         shuffle=False,
+         drop_last=False,
+     )
+
+     # compute attentions
+     file_paths = []
+     with torch.no_grad():
+         for data in tqdm(loader):
+             # setup input data
+             text_input = data[0]
+             text_lengths = data[1]
+             linear_input = data[3]
+             mel_input = data[4]
+             mel_lengths = data[5]
+             stop_targets = data[6]
+             item_idxs = data[7]
+
+             # dispatch data to GPU
+             if args.use_cuda:
+                 text_input = text_input.cuda()
+                 text_lengths = text_lengths.cuda()
+                 mel_input = mel_input.cuda()
+                 mel_lengths = mel_lengths.cuda()
+
+             model_outputs = model.forward(text_input, text_lengths, mel_input)
+
+             alignments = model_outputs["alignments"].detach()
+             for idx, alignment in enumerate(alignments):
+                 item_idx = item_idxs[idx]
+                 # interpolate if r > 1
+                 alignment = (
+                     torch.nn.functional.interpolate(
+                         alignment.transpose(0, 1).unsqueeze(0),
+                         size=None,
+                         scale_factor=model.decoder.r,
+                         mode="nearest",
+                         align_corners=None,
+                         recompute_scale_factor=None,
+                     )
+                     .squeeze(0)
+                     .transpose(0, 1)
+                 )
+                 # remove paddings
+                 alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
+                 # set file paths
+                 wav_file_name = os.path.basename(item_idx)
+                 align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
+                 file_path = item_idx.replace(wav_file_name, align_file_name)
+                 # save output
+                 wav_file_abs_path = os.path.abspath(item_idx)
+                 file_abs_path = os.path.abspath(file_path)
+                 file_paths.append([wav_file_abs_path, file_abs_path])
+                 np.save(file_path, alignment)
+
+     # ourput metafile
+     metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
+
+     with open(metafile, "w", encoding="utf-8") as f:
+         for p in file_paths:
+             f.write(f"{p[0]}|{p[1]}\n")
+     print(f" >> Metafile created: {metafile}")
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,131 @@
+ import argparse
+ import os
+ from argparse import RawTextHelpFormatter
+
+ import torch
+ from tqdm import tqdm
+
+ from TTS.config import load_config
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.utils.managers import save_file
+ from TTS.tts.utils.speakers import SpeakerManager
+
+ parser = argparse.ArgumentParser(
+     description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
+     """
+     Example runs:
+     python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+
+     python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
+     """,
+     formatter_class=RawTextHelpFormatter,
+ )
+ parser.add_argument(
+     "--model_path",
+     type=str,
+     help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+     default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+ )
+ parser.add_argument(
+     "--config_path",
+     type=str,
+     help="Path to model config file. It defaults to the released speaker encoder config.",
+     default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+ )
+ parser.add_argument(
+     "--config_dataset_path",
+     type=str,
+     help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+     default=None,
+ )
+ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
+ parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
+ parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+ parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
+ parser.add_argument(
+     "--formatter_name",
+     type=str,
+     help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+     default=None,
+ )
+ parser.add_argument(
+     "--dataset_name",
+     type=str,
+     help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+     default=None,
+ )
+ parser.add_argument(
+     "--dataset_path",
+     type=str,
+     help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+     default=None,
+ )
+ parser.add_argument(
+     "--metafile",
+     type=str,
+     help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+     default=None,
+ )
+ args = parser.parse_args()
+
+ use_cuda = torch.cuda.is_available() and not args.disable_cuda
+
+ if args.config_dataset_path is not None:
+     c_dataset = load_config(args.config_dataset_path)
+     meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+ else:
+     c_dataset = BaseDatasetConfig()
+     c_dataset.formatter = args.formatter_name
+     c_dataset.dataset_name = args.dataset_name
+     c_dataset.path = args.dataset_path
+     c_dataset.meta_file_train = args.metafile if args.metafile else None
+     meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)
+
+
+ if meta_data_eval is None:
+     samples = meta_data_train
+ else:
+     samples = meta_data_train + meta_data_eval
+
+ encoder_manager = SpeakerManager(
+     encoder_model_path=args.model_path,
+     encoder_config_path=args.config_path,
+     d_vectors_file_path=args.old_file,
+     use_cuda=use_cuda,
+ )
+
+ class_name_key = encoder_manager.encoder_config.class_name_key
+
+ # compute speaker embeddings
+ speaker_mapping = {}
+ for idx, fields in enumerate(tqdm(samples)):
+     class_name = fields[class_name_key]
+     audio_file = fields["audio_file"]
+     embedding_key = fields["audio_unique_name"]
+     root_path = fields["root_path"]
+
+     if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
+         # get the embedding from the old file
+         embedd = encoder_manager.get_embedding_by_clip(embedding_key)
+     else:
+         # extract the embedding
+         embedd = encoder_manager.compute_embedding_from_clip(audio_file)
+
+     # create speaker_mapping if target dataset is defined
+     speaker_mapping[embedding_key] = {}
+     speaker_mapping[embedding_key]["name"] = class_name
+     speaker_mapping[embedding_key]["embedding"] = embedd
+
+ if speaker_mapping:
+     # save speaker_mapping if target dataset is defined
+     if os.path.isdir(args.output_path):
+         mapping_file_path = os.path.join(args.output_path, "speakers.pth")
+     else:
+         mapping_file_path = args.output_path
+
+     if os.path.dirname(mapping_file_path) != "":
+         os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+
+     save_file(speaker_mapping, mapping_file_path)
+     print("Speaker embeddings saved at:", mapping_file_path)
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,98 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+
+ import argparse
+ import glob
+ import os
+ import sys
+
+ sys.path.append('.')
+ import numpy as np
+ from tqdm import tqdm
+
+ # from TTS.utils.io import load_config
+ from TTS.config import load_config
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.utils.audio import AudioProcessor
+
+
+ def main():
+     """Run preprocessing process."""
+     parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
+     parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
+     parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+     parser.add_argument(
+         "--data_path",
+         type=str,
+         required=False,
+         help="folder including the target set of wavs overriding dataset config.",
+     )
+     args, overrides = parser.parse_known_args()
+
+     CONFIG = load_config(args.config_path)
+     CONFIG.parse_known_args(overrides, relaxed_parser=True)
+
+     # load config
+     CONFIG.audio.signal_norm = False  # do not apply earlier normalization
+     CONFIG.audio.stats_path = None  # discard pre-defined stats
+
+     # load audio processor
+     ap = AudioProcessor(**CONFIG.audio.to_dict())
+
+     # load the meta data of target dataset
+     if args.data_path:
+         dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+     else:
+         dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
+     print(f" > There are {len(dataset_items)} files.")
+
+     mel_sum = 0
+     mel_square_sum = 0
+     linear_sum = 0
+     linear_square_sum = 0
+     N = 0
+     for item in tqdm(dataset_items):
+         # compute features
+         wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
+         linear = ap.spectrogram(wav)
+         mel = ap.melspectrogram(wav)
+
+         # compute stats
+         N += mel.shape[1]
+         mel_sum += mel.sum(1)
+         linear_sum += linear.sum(1)
+         mel_square_sum += (mel**2).sum(axis=1)
+         linear_square_sum += (linear**2).sum(axis=1)
+
+     mel_mean = mel_sum / N
+     mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
+     linear_mean = linear_sum / N
+     linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
+
+     output_file_path = args.out_path
+     stats = {}
+     stats["mel_mean"] = mel_mean
+     stats["mel_std"] = mel_scale
+     stats["linear_mean"] = linear_mean
+     stats["linear_std"] = linear_scale
+
+     print(f" > Avg mel spec mean: {mel_mean.mean()}")
+     print(f" > Avg mel spec scale: {mel_scale.mean()}")
+     print(f" > Avg linear spec mean: {linear_mean.mean()}")
+     print(f" > Avg linear spec scale: {linear_scale.mean()}")
+
+     # set default config values for mean-var scaling
+     CONFIG.audio.stats_path = output_file_path
+     CONFIG.audio.signal_norm = True
+     # remove redundant values
+     del CONFIG.audio.max_norm
+     del CONFIG.audio.min_level_db
+     del CONFIG.audio.symmetric_norm
+     del CONFIG.audio.clip_norm
+     stats["audio_config"] = CONFIG.audio.to_dict()
+     np.save(output_file_path, stats, allow_pickle=True)
+     print(f" > stats saved to {output_file_path}")
+
+
+ if __name__ == "__main__":
+     main()
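compute_statistics.py saves a pickled NumPy dictionary holding the mel/linear mean and std arrays plus the audio config used to produce them. Below is a minimal sketch of reading the result back; the filename scale_stats.npy is only an assumed example for the positional out_path argument.

import numpy as np

# Illustrative only: load the dictionary saved by np.save(..., allow_pickle=True) above.
stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats["mel_mean"].shape, stats["mel_std"].shape)
print(stats["linear_mean"].shape, stats["linear_std"].shape)
print(stats["audio_config"]["stats_path"])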
TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,89 @@
+ import argparse
+ from argparse import RawTextHelpFormatter
+
+ import torch
+ from tqdm import tqdm
+
+ from TTS.config import load_config
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.utils.speakers import SpeakerManager
+
+
+ def compute_encoder_accuracy(dataset_items, encoder_manager):
+
+     class_name_key = encoder_manager.encoder_config.class_name_key
+     map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
+
+     class_acc_dict = {}
+
+     # compute embeddings for all wav_files
+     for item in tqdm(dataset_items):
+         class_name = item[class_name_key]
+         wav_file = item["audio_file"]
+
+         # extract the embedding
+         embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+         if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
+             embedding = torch.FloatTensor(embedd).unsqueeze(0)
+             if encoder_manager.use_cuda:
+                 embedding = embedding.cuda()
+
+             class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
+             predicted_label = map_classid_to_classname[str(class_id)]
+         else:
+             predicted_label = None
+
+         if class_name is not None and predicted_label is not None:
+             is_equal = int(class_name == predicted_label)
+             if class_name not in class_acc_dict:
+                 class_acc_dict[class_name] = [is_equal]
+             else:
+                 class_acc_dict[class_name].append(is_equal)
+         else:
+             raise RuntimeError("Error: class_name or/and predicted_label are None")
+
+     acc_avg = 0
+     for key, values in class_acc_dict.items():
+         acc = sum(values) / len(values)
+         print("Class", key, "Accuracy:", acc)
+         acc_avg += acc
+
+     print("Average Accuracy:", acc_avg / len(class_acc_dict))
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="""Compute the accuracy of the encoder.\n\n"""
+         """
+         Example runs:
+         python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
+         """,
+         formatter_class=RawTextHelpFormatter,
+     )
+     parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+     parser.add_argument(
+         "config_path",
+         type=str,
+         help="Path to model config file.",
+     )
+
+     parser.add_argument(
+         "config_dataset_path",
+         type=str,
+         help="Path to dataset config file.",
+     )
+     parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
+     parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+
+     args = parser.parse_args()
+
+     c_dataset = load_config(args.config_dataset_path)
+
+     meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
+     items = meta_data_train + meta_data_eval
+
+     enc_manager = SpeakerManager(
+         encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+     )
+
+     compute_encoder_accuracy(items, enc_manager)
TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,287 @@
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.generic_utils import count_parameters
19
+
20
+ use_cuda = torch.cuda.is_available()
21
+
22
+
23
+ def setup_loader(ap, r, verbose=False):
24
+ tokenizer, _ = TTSTokenizer.init_from_config(c)
25
+ dataset = TTSDataset(
26
+ outputs_per_step=r,
27
+ compute_linear_spec=False,
28
+ samples=meta_data,
29
+ tokenizer=tokenizer,
30
+ ap=ap,
31
+ batch_group_size=0,
32
+ min_text_len=c.min_text_len,
33
+ max_text_len=c.max_text_len,
34
+ min_audio_len=c.min_audio_len,
35
+ max_audio_len=c.max_audio_len,
36
+ phoneme_cache_path=c.phoneme_cache_path,
37
+ precompute_num_workers=0,
38
+ use_noise_augment=False,
39
+ verbose=verbose,
40
+ speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
41
+ d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
42
+ )
43
+
44
+ if c.use_phonemes and c.compute_input_seq_cache:
45
+ # precompute phonemes to have a better estimate of sequence lengths.
46
+ dataset.compute_input_seq(c.num_loader_workers)
47
+ dataset.preprocess_samples()
48
+
49
+ loader = DataLoader(
50
+ dataset,
51
+ batch_size=c.batch_size,
52
+ shuffle=False,
53
+ collate_fn=dataset.collate_fn,
54
+ drop_last=False,
55
+ sampler=None,
56
+ num_workers=c.num_loader_workers,
57
+ pin_memory=False,
58
+ )
59
+ return loader
60
+
61
+
62
+ def set_filename(wav_path, out_path):
63
+ wav_file = os.path.basename(wav_path)
64
+ file_name = wav_file.split(".")[0]
65
+ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
66
+ os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
67
+ os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
68
+ os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
69
+ wavq_path = os.path.join(out_path, "quant", file_name)
70
+ mel_path = os.path.join(out_path, "mel", file_name)
71
+ wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
72
+ wav_path = os.path.join(out_path, "wav", file_name + ".wav")
73
+ return file_name, wavq_path, mel_path, wav_gl_path, wav_path
74
+
75
+
76
+ def format_data(data):
77
+ # setup input data
78
+ text_input = data["token_id"]
79
+ text_lengths = data["token_id_lengths"]
80
+ mel_input = data["mel"]
81
+ mel_lengths = data["mel_lengths"]
82
+ item_idx = data["item_idxs"]
83
+ d_vectors = data["d_vectors"]
84
+ speaker_ids = data["speaker_ids"]
85
+ attn_mask = data["attns"]
86
+ avg_text_length = torch.mean(text_lengths.float())
87
+ avg_spec_length = torch.mean(mel_lengths.float())
88
+
89
+ # dispatch data to GPU
90
+ if use_cuda:
91
+ text_input = text_input.cuda(non_blocking=True)
92
+ text_lengths = text_lengths.cuda(non_blocking=True)
93
+ mel_input = mel_input.cuda(non_blocking=True)
94
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
95
+ if speaker_ids is not None:
96
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
97
+ if d_vectors is not None:
98
+ d_vectors = d_vectors.cuda(non_blocking=True)
99
+ if attn_mask is not None:
100
+ attn_mask = attn_mask.cuda(non_blocking=True)
101
+ return (
102
+ text_input,
103
+ text_lengths,
104
+ mel_input,
105
+ mel_lengths,
106
+ speaker_ids,
107
+ d_vectors,
108
+ avg_text_length,
109
+ avg_spec_length,
110
+ attn_mask,
111
+ item_idx,
112
+ )
113
+
114
+
115
+ @torch.no_grad()
116
+ def inference(
117
+ model_name,
118
+ model,
119
+ ap,
120
+ text_input,
121
+ text_lengths,
122
+ mel_input,
123
+ mel_lengths,
124
+ speaker_ids=None,
125
+ d_vectors=None,
126
+ ):
127
+ if model_name == "glow_tts":
128
+ speaker_c = None
129
+ if speaker_ids is not None:
130
+ speaker_c = speaker_ids
131
+ elif d_vectors is not None:
132
+ speaker_c = d_vectors
133
+ outputs = model.inference_with_MAS(
134
+ text_input,
135
+ text_lengths,
136
+ mel_input,
137
+ mel_lengths,
138
+ aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
139
+ )
140
+ model_output = outputs["model_outputs"]
141
+ model_output = model_output.detach().cpu().numpy()
142
+
143
+ elif "tacotron" in model_name:
144
+ aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
145
+ outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
146
+ postnet_outputs = outputs["model_outputs"]
147
+ # normalize tacotron output
148
+ if model_name == "tacotron":
149
+ mel_specs = []
150
+ postnet_outputs = postnet_outputs.data.cpu().numpy()
151
+ for b in range(postnet_outputs.shape[0]):
152
+ postnet_output = postnet_outputs[b]
153
+ mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
154
+ model_output = torch.stack(mel_specs).cpu().numpy()
155
+
156
+ elif model_name == "tacotron2":
157
+ model_output = postnet_outputs.detach().cpu().numpy()
158
+ return model_output
159
+
160
+
161
+ def extract_spectrograms(
162
+ data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
163
+ ):
164
+ model.eval()
165
+ export_metadata = []
166
+ for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
167
+
168
+ # format data
169
+ (
170
+ text_input,
171
+ text_lengths,
172
+ mel_input,
173
+ mel_lengths,
174
+ speaker_ids,
175
+ d_vectors,
176
+ _,
177
+ _,
178
+ _,
179
+ item_idx,
180
+ ) = format_data(data)
181
+
182
+ model_output = inference(
183
+ c.model.lower(),
184
+ model,
185
+ ap,
186
+ text_input,
187
+ text_lengths,
188
+ mel_input,
189
+ mel_lengths,
190
+ speaker_ids,
191
+ d_vectors,
192
+ )
193
+
194
+ for idx in range(text_input.shape[0]):
195
+ wav_file_path = item_idx[idx]
196
+ wav = ap.load_wav(wav_file_path)
197
+ _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
198
+
199
+ # quantize and save wav
200
+ if quantized_wav:
201
+ wavq = ap.quantize(wav)
202
+ np.save(wavq_path, wavq)
203
+
204
+ # save TTS mel
205
+ mel = model_output[idx]
206
+ mel_length = mel_lengths[idx]
207
+ mel = mel[:mel_length, :].T
208
+ np.save(mel_path, mel)
209
+
210
+ export_metadata.append([wav_file_path, mel_path])
211
+ if save_audio:
212
+ ap.save_wav(wav, wav_path)
213
+
214
+ if debug:
215
+ print("Audio for debug saved at:", wav_gl_path)
216
+ wav = ap.inv_melspectrogram(mel)
217
+ ap.save_wav(wav, wav_gl_path)
218
+
219
+ with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
220
+ for data in export_metadata:
221
+ f.write(f"{data[0]}|{data[1]+'.npy'}\n")
222
+
223
+
224
+ def main(args): # pylint: disable=redefined-outer-name
225
+ # pylint: disable=global-variable-undefined
226
+ global meta_data, speaker_manager
227
+
228
+ # Audio processor
229
+ ap = AudioProcessor(**c.audio)
230
+
231
+ # load data instances
232
+ meta_data_train, meta_data_eval = load_tts_samples(
233
+ c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
234
+ )
235
+
236
+ # use eval and training partitions
237
+ meta_data = meta_data_train + meta_data_eval
238
+
239
+ # init speaker manager
240
+ if c.use_speaker_embedding:
241
+ speaker_manager = SpeakerManager(data_items=meta_data)
242
+ elif c.use_d_vector_file:
243
+ speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
244
+ else:
245
+ speaker_manager = None
246
+
247
+ # setup model
248
+ model = setup_model(c)
249
+
250
+ # restore model
251
+ model.load_checkpoint(c, args.checkpoint_path, eval=True)
252
+
253
+ if use_cuda:
254
+ model.cuda()
255
+
256
+ num_params = count_parameters(model)
257
+ print("\n > Model has {} parameters".format(num_params), flush=True)
258
+ # set r
259
+ r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
260
+ own_loader = setup_loader(ap, r, verbose=True)
261
+
262
+ extract_spectrograms(
263
+ own_loader,
264
+ model,
265
+ ap,
266
+ args.output_path,
267
+ quantized_wav=args.quantized,
268
+ save_audio=args.save_audio,
269
+ debug=args.debug,
270
+ metada_name="metada.txt",
271
+ )
272
+
273
+
274
+ if __name__ == "__main__":
275
+ parser = argparse.ArgumentParser()
276
+ parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
277
+ parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
278
+ parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
279
+ parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
280
+ parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
281
+ parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
282
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
283
+ args = parser.parse_args()
284
+
285
+ c = load_config(args.config_path)
286
+ c.audio.trim_silence = False
287
+ main(args)
TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,48 @@
+ """Find all the unique characters in a dataset"""
+ import argparse
+ from argparse import RawTextHelpFormatter
+
+ import sys
+
+ sys.path.append('.')
+ from TTS.config import load_config
+ from TTS.tts.datasets import load_tts_samples
+
+
+ def main():
+     # pylint: disable=bad-option-value
+     parser = argparse.ArgumentParser(
+         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+         """
+     Example runs:
+
+     python TTS/bin/find_unique_chars.py --config_path config.json
+     """,
+         formatter_class=RawTextHelpFormatter,
+     )
+     parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+     args = parser.parse_args()
+
+     c = load_config(args.config_path)
+
+     # load all datasets
+     train_items, eval_items = load_tts_samples(
+         c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+     )
+
+     items = train_items + eval_items
+
+     texts = "".join(item["text"] for item in items)
+     chars = set(texts)
+     lower_chars = filter(lambda c: c.islower(), chars)
+     chars_force_lower = [c.lower() for c in chars]
+     chars_force_lower = set(chars_force_lower)
+
+     print(f" > Number of unique characters: {len(chars)}")
+     print(f" > Unique characters: {''.join(sorted(chars))}")
+     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+     print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+
+
+ if __name__ == "__main__":
+     main()
TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,78 @@
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ import sys
5
+
6
+ sys.path.append('.')
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ from tqdm.contrib.concurrent import process_map
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets import load_tts_samples
13
+ from TTS.tts.utils.text.phonemizers import Gruut, ESpeak
14
+
15
+
16
+ def compute_phonemes(item):
17
+ text = item["text"]
18
+ ph = phonemizer.phonemize(text).replace("|", "")
19
+ return set(list(ph))
20
+
21
+
22
+ def main():
23
+ # pylint: disable=W0601
24
+ global c, phonemizer
25
+ # pylint: disable=bad-option-value
26
+ parser = argparse.ArgumentParser(
27
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
28
+ """
29
+ Example runs:
30
+
31
+ python TTS/bin/find_unique_phonemes.py --config_path config.json
32
+ """,
33
+ formatter_class=RawTextHelpFormatter,
34
+ )
35
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
36
+ args = parser.parse_args()
37
+
38
+ c = load_config(args.config_path)
39
+
40
+ # load all datasets
41
+ train_items, eval_items = load_tts_samples(
42
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
43
+ )
44
+ items = train_items + eval_items
45
+ print("Num items:", len(items))
46
+
47
+ language_list = [item["language"] for item in items]
48
+ is_lang_def = all(language_list)
49
+
50
+ if not c.phoneme_language or not is_lang_def:
51
+ raise ValueError("Phoneme language must be defined in config.")
52
+
53
+ if not language_list.count(language_list[0]) == len(language_list):
54
+ raise ValueError(
55
+ "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
56
+ )
57
+
58
+ # phonemizer = Gruut(language=language_list[0], keep_puncs=True)
59
+ phonemizer = ESpeak(language="vi", backend="espeak")
60
+ phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
61
+ print(phonemes)
62
+ phones = []
63
+ for ph in phonemes:
64
+ phones.extend(ph)
65
+
66
+ phones = set(phones)
67
+ lower_phones = filter(lambda c: c.islower(), phones)
68
+ phones_force_lower = [c.lower() for c in phones]
69
+ phones_force_lower = set(phones_force_lower)
70
+
71
+ print(f" > Number of unique phonemes: {len(phones)}")
72
+ print(f" > Unique phonemes: {''.join(sorted(phones))}")
73
+ print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
74
+ print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,93 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import pathlib
5
+
6
+ from tqdm import tqdm
7
+
8
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
9
+
10
+
11
+ def adjust_path_and_remove_silence(audio_path):
12
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
13
+ # ignore if the file exists
14
+ if os.path.exists(output_path) and not args.force:
15
+ return output_path, True  # keep the (path, is_speech) return shape; an existing output is assumed to contain speech
16
+
17
+ # create all directory structure
18
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
19
+ # remove the silence and save the audio
20
+ output_path, is_speech = remove_silence(
21
+ model_and_utils,
22
+ audio_path,
23
+ output_path,
24
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
25
+ use_cuda=args.use_cuda,
26
+ )
27
+
28
+ return output_path, is_speech
29
+
30
+
31
+ def preprocess_audios():
32
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
33
+ print("> Number of files: ", len(files))
34
+ if not args.force:
35
+ print("> Ignoring files that already exist in the output idrectory.")
36
+
37
+ if args.trim_just_beginning_and_end:
38
+ print("> Trimming just the beginning and the end with nonspeech parts.")
39
+ else:
40
+ print("> Trimming all nonspeech parts.")
41
+
42
+ filtered_files = []
43
+ if files:
44
+ # create threads
45
+ # num_threads = multiprocessing.cpu_count()
46
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
47
+ for f in tqdm(files):
48
+ output_path, is_speech = adjust_path_and_remove_silence(f)
49
+ if not is_speech:
50
+ filtered_files.append(output_path)
51
+
52
+ # write files that do not have speech
53
+ with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
54
+ for file in filtered_files:
55
+ f.write(file + "\n")
56
+ else:
57
+ print("> No files Found !")
58
+
59
+
60
+ if __name__ == "__main__":
61
+ parser = argparse.ArgumentParser(
62
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
63
+ )
64
+ parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
65
+ parser.add_argument(
66
+ "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
67
+ )
68
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
69
+ parser.add_argument(
70
+ "-g",
71
+ "--glob",
72
+ type=str,
73
+ default="**/*.wav",
74
+ help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
75
+ )
76
+ parser.add_argument(
77
+ "-t",
78
+ "--trim_just_beginning_and_end",
79
+ type=bool,
80
+ default=True,
81
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
82
+ )
83
+ parser.add_argument(
84
+ "-c",
85
+ "--use_cuda",
86
+ type=bool,
87
+ default=False,
88
+ help="If True use cuda",
89
+ )
90
+ args = parser.parse_args()
91
+ # load the model and utils
92
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
93
+ preprocess_audios()
TTS/bin/resample.py ADDED
@@ -0,0 +1,87 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from distutils.dir_util import copy_tree
6
+ from multiprocessing import Pool
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
+ def resample_file(func_args):
14
+ filename, output_sr = func_args
15
+ y, sr = librosa.load(filename, sr=output_sr)
16
+ sf.write(filename, y, sr)
17
+
18
+
19
+ if __name__ == "__main__":
20
+
21
+ parser = argparse.ArgumentParser(
22
+ description="""Resample a folder recusively with librosa
23
+ Can be used in place or create a copy of the folder as an output.\n\n
24
+ Example run:
25
+ python TTS/bin/resample.py
26
+ --input_dir /root/LJSpeech-1.1/
27
+ --output_sr 22050
28
+ --output_dir /root/resampled_LJSpeech-1.1/
29
+ --file_ext wav
30
+ --n_jobs 24
31
+ """,
32
+ formatter_class=RawTextHelpFormatter,
33
+ )
34
+
35
+ parser.add_argument(
36
+ "--input_dir",
37
+ type=str,
38
+ default=None,
39
+ required=True,
40
+ help="Path of the folder containing the audio files to resample",
41
+ )
42
+
43
+ parser.add_argument(
44
+ "--output_sr",
45
+ type=int,
46
+ default=22050,
47
+ required=False,
48
+ help="Samlple rate to which the audio files should be resampled",
49
+ )
50
+
51
+ parser.add_argument(
52
+ "--output_dir",
53
+ type=str,
54
+ default=None,
55
+ required=False,
56
+ help="Path of the destination folder. If not defined, the operation is done in place",
57
+ )
58
+
59
+ parser.add_argument(
60
+ "--file_ext",
61
+ type=str,
62
+ default="wav",
63
+ required=False,
64
+ help="Extension of the audio files to resample",
65
+ )
66
+
67
+ parser.add_argument(
68
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
69
+ )
70
+
71
+ args = parser.parse_args()
72
+
73
+ if args.output_dir:
74
+ print("Recursively copying the input folder...")
75
+ copy_tree(args.input_dir, args.output_dir)
76
+ args.input_dir = args.output_dir
77
+
78
+ print("Resampling the audio files...")
79
+ audio_files = glob.glob(os.path.join(args.input_dir, f"**/*.{args.file_ext}"), recursive=True)
80
+ print(f"Found {len(audio_files)} files...")
81
+ audio_files = list(zip(audio_files, len(audio_files) * [args.output_sr]))
82
+ with Pool(processes=args.n_jobs) as p:
83
+ with tqdm(total=len(audio_files)) as pbar:
84
+ for i, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
85
+ pbar.update()
86
+
87
+ print("Done !")
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,374 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from argparse import RawTextHelpFormatter
7
+
8
+ # pylint: disable=redefined-outer-name, unused-argument
9
+ from pathlib import Path
10
+
11
+ from TTS.utils.manage import ModelManager
12
+ from TTS.utils.synthesizer import Synthesizer
13
+
14
+
15
+ def str2bool(v):
16
+ if isinstance(v, bool):
17
+ return v
18
+ if v.lower() in ("yes", "true", "t", "y", "1"):
19
+ return True
20
+ if v.lower() in ("no", "false", "f", "n", "0"):
21
+ return False
22
+ raise argparse.ArgumentTypeError("Boolean value expected.")
23
+
24
+
25
+ def main():
26
+ description = """Synthesize speech on command line.
27
+
28
+ You can either use your trained model or choose a model from the provided list.
29
+
30
+ If you don't specify any models, then it uses LJSpeech based English model.
31
+
32
+ ## Example Runs
33
+
34
+ ### Single Speaker Models
35
+
36
+ - List provided models:
37
+
38
+ ```
39
+ $ tts --list_models
40
+ ```
41
+
42
+ - Query info for model info by idx:
43
+
44
+ ```
45
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
46
+ ```
47
+
48
+ - Query info for model info by full name:
49
+
50
+ ```
51
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
52
+ ```
53
+
54
+ - Run TTS with default models:
55
+
56
+ ```
57
+ $ tts --text "Text for TTS"
58
+ ```
59
+
60
+ - Run a TTS model with its default vocoder model:
61
+
62
+ ```
63
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>
64
+ ```
65
+
66
+ - Run with specific TTS and vocoder models from the list:
67
+
68
+ ```
69
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --output_path
70
+ ```
71
+
72
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
73
+
74
+ ```
75
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
76
+ ```
77
+
78
+ - Run your own TTS and Vocoder models:
79
+ ```
80
+ $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
81
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
82
+ ```
83
+
84
+ ### Multi-speaker Models
85
+
86
+ - List the available speakers and choose as <speaker_id> among them:
87
+
88
+ ```
89
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
90
+ ```
91
+
92
+ - Run the multi-speaker TTS model with the target speaker ID:
93
+
94
+ ```
95
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
96
+ ```
97
+
98
+ - Run your own multi-speaker TTS model:
99
+
100
+ ```
101
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
102
+ ```
103
+ """
104
+ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
105
+ # documentation in sync more easily.
106
+ parser = argparse.ArgumentParser(
107
+ description=description.replace(" ```\n", ""),
108
+ formatter_class=RawTextHelpFormatter,
109
+ )
110
+
111
+ parser.add_argument(
112
+ "--list_models",
113
+ type=str2bool,
114
+ nargs="?",
115
+ const=True,
116
+ default=False,
117
+ help="list available pre-trained TTS and vocoder models.",
118
+ )
119
+
120
+ parser.add_argument(
121
+ "--model_info_by_idx",
122
+ type=str,
123
+ default=None,
124
+ help="model info using query format: <model_type>/<model_query_idx>",
125
+ )
126
+
127
+ parser.add_argument(
128
+ "--model_info_by_name",
129
+ type=str,
130
+ default=None,
131
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
132
+ )
133
+
134
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
135
+
136
+ # Args for running pre-trained TTS models.
137
+ parser.add_argument(
138
+ "--model_name",
139
+ type=str,
140
+ default="tts_models/en/ljspeech/tacotron2-DDC",
141
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
142
+ )
143
+ parser.add_argument(
144
+ "--vocoder_name",
145
+ type=str,
146
+ default=None,
147
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
148
+ )
149
+
150
+ # Args for running custom models
151
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
152
+ parser.add_argument(
153
+ "--model_path",
154
+ type=str,
155
+ default=None,
156
+ help="Path to model file.",
157
+ )
158
+ parser.add_argument(
159
+ "--out_path",
160
+ type=str,
161
+ default="tts_output.wav",
162
+ help="Output wav file path.",
163
+ )
164
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
165
+ parser.add_argument(
166
+ "--vocoder_path",
167
+ type=str,
168
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
169
+ default=None,
170
+ )
171
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
172
+ parser.add_argument(
173
+ "--encoder_path",
174
+ type=str,
175
+ help="Path to speaker encoder model file.",
176
+ default=None,
177
+ )
178
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
179
+
180
+ # args for multi-speaker synthesis
181
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
182
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
183
+ parser.add_argument(
184
+ "--speaker_idx",
185
+ type=str,
186
+ help="Target speaker ID for a multi-speaker TTS model.",
187
+ default=None,
188
+ )
189
+ parser.add_argument(
190
+ "--language_idx",
191
+ type=str,
192
+ help="Target language ID for a multi-lingual TTS model.",
193
+ default=None,
194
+ )
195
+ parser.add_argument(
196
+ "--speaker_wav",
197
+ nargs="+",
198
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
199
+ default=None,
200
+ )
201
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
202
+ parser.add_argument(
203
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
204
+ )
205
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
206
+ parser.add_argument(
207
+ "--list_speaker_idxs",
208
+ help="List available speaker ids for the defined multi-speaker model.",
209
+ type=str2bool,
210
+ nargs="?",
211
+ const=True,
212
+ default=False,
213
+ )
214
+ parser.add_argument(
215
+ "--list_language_idxs",
216
+ help="List available language ids for the defined multi-lingual model.",
217
+ type=str2bool,
218
+ nargs="?",
219
+ const=True,
220
+ default=False,
221
+ )
222
+ # aux args
223
+ parser.add_argument(
224
+ "--save_spectogram",
225
+ type=bool,
226
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
227
+ default=False,
228
+ )
229
+ parser.add_argument(
230
+ "--reference_wav",
231
+ type=str,
232
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
233
+ default=None,
234
+ )
235
+ parser.add_argument(
236
+ "--reference_speaker_idx",
237
+ type=str,
238
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
239
+ default=None,
240
+ )
241
+ parser.add_argument(
242
+ "--progress_bar",
243
+ type=str2bool,
244
+ help="If true shows a progress bar for the model download. Defaults to True",
245
+ default=True,
246
+ )
247
+
248
+ args = parser.parse_args()
249
+
250
+ # print the description if either text or list_models is not set
251
+ check_args = [
252
+ args.text,
253
+ args.list_models,
254
+ args.list_speaker_idxs,
255
+ args.list_language_idxs,
256
+ args.reference_wav,
257
+ args.model_info_by_idx,
258
+ args.model_info_by_name,
259
+ ]
260
+ if not any(check_args):
261
+ parser.parse_args(["-h"])
262
+
263
+ # load model manager
264
+ path = Path(__file__).parent / "../.models.json"
265
+ manager = ModelManager(path, progress_bar=args.progress_bar)
266
+
267
+ model_path = None
268
+ config_path = None
269
+ speakers_file_path = None
270
+ language_ids_file_path = None
271
+ vocoder_path = None
272
+ vocoder_config_path = None
273
+ encoder_path = None
274
+ encoder_config_path = None
275
+
276
+ # CASE1 #list : list pre-trained TTS models
277
+ if args.list_models:
278
+ manager.list_models()
279
+ sys.exit()
280
+
281
+ # CASE2 #info : model info of pre-trained TTS models
282
+ if args.model_info_by_idx:
283
+ model_query = args.model_info_by_idx
284
+ manager.model_info_by_idx(model_query)
285
+ sys.exit()
286
+
287
+ if args.model_info_by_name:
288
+ model_query_full_name = args.model_info_by_name
289
+ manager.model_info_by_full_name(model_query_full_name)
290
+ sys.exit()
291
+
292
+ # CASE3: load pre-trained model paths
293
+ if args.model_name is not None and not args.model_path:
294
+ model_path, config_path, model_item = manager.download_model(args.model_name)
295
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
296
+
297
+ if args.vocoder_name is not None and not args.vocoder_path:
298
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
299
+
300
+ # CASE4: set custom model paths
301
+ if args.model_path is not None:
302
+ model_path = args.model_path
303
+ config_path = args.config_path
304
+ speakers_file_path = args.speakers_file_path
305
+ language_ids_file_path = args.language_ids_file_path
306
+
307
+ if args.vocoder_path is not None:
308
+ vocoder_path = args.vocoder_path
309
+ vocoder_config_path = args.vocoder_config_path
310
+
311
+ if args.encoder_path is not None:
312
+ encoder_path = args.encoder_path
313
+ encoder_config_path = args.encoder_config_path
314
+
315
+ # load models
316
+ synthesizer = Synthesizer(
317
+ model_path,
318
+ config_path,
319
+ speakers_file_path,
320
+ language_ids_file_path,
321
+ vocoder_path,
322
+ vocoder_config_path,
323
+ encoder_path,
324
+ encoder_config_path,
325
+ args.use_cuda,
326
+ )
327
+
328
+ # query speaker ids of a multi-speaker model.
329
+ if args.list_speaker_idxs:
330
+ print(
331
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
332
+ )
333
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
334
+ return
335
+
336
+ # query language ids of a multi-lingual model.
337
+ if args.list_language_idxs:
338
+ print(
339
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
340
+ )
341
+ print(synthesizer.tts_model.language_manager.name_to_id)
342
+ return
343
+
344
+ # check the arguments against a multi-speaker model.
345
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
346
+ print(
347
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
348
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
349
+ )
350
+ return
351
+
352
+ # RUN THE SYNTHESIS
353
+ if args.text:
354
+ print(" > Text: {}".format(args.text))
355
+
356
+ # kick it
357
+ wav = synthesizer.tts(
358
+ args.text,
359
+ args.speaker_idx,
360
+ args.language_idx,
361
+ args.speaker_wav,
362
+ reference_wav=args.reference_wav,
363
+ style_wav=args.capacitron_style_wav,
364
+ style_text=args.capacitron_style_text,
365
+ reference_speaker_name=args.reference_speaker_idx,
366
+ )
367
+
368
+ # save the results
369
+ print(" > Saving output to {}".format(args.out_path))
370
+ synthesizer.save_wav(wav, args.out_path)
371
+
372
+
373
+ if __name__ == "__main__":
374
+ main()
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.training import init_training
17
+ from TTS.encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets import load_tts_samples
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
21
+ from TTS.utils.io import copy_model_files
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
+ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
35
+ num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
36
+ num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
37
+
38
+ dataset = EncoderDataset(
39
+ c,
40
+ ap,
41
+ meta_data_eval if is_val else meta_data_train,
42
+ voice_len=c.voice_len,
43
+ num_utter_per_class=num_utter_per_class,
44
+ num_classes_in_batch=num_classes_in_batch,
45
+ verbose=verbose,
46
+ augmentation_config=c.audio_augmentation if not is_val else None,
47
+ use_torch_spec=c.model_params.get("use_torch_spec", False),
48
+ )
49
+ # get classes list
50
+ classes = dataset.get_class_list()
51
+
52
+ sampler = PerfectBatchSampler(
53
+ dataset.items,
54
+ classes,
55
+ batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
56
+ num_classes_in_batch=num_classes_in_batch,
57
+ num_gpus=1,
58
+ shuffle=not is_val,
59
+ drop_last=True,
60
+ )
61
+
62
+ if len(classes) < num_classes_in_batch:
63
+ if is_val:
64
+ raise RuntimeError(
65
+ f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
66
+ )
67
+ raise RuntimeError(
68
+ f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
69
+ )
70
+
71
+ # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
72
+ if is_val:
73
+ dataset.set_classes(train_classes)
74
+
75
+ loader = DataLoader(
76
+ dataset,
77
+ num_workers=c.num_loader_workers,
78
+ batch_sampler=sampler,
79
+ collate_fn=dataset.collate_fn,
80
+ )
81
+
82
+ return loader, classes, dataset.get_map_classid_to_classname()
83
+
84
+
85
+ def evaluation(model, criterion, data_loader, global_step):
86
+ eval_loss = 0
87
+ for _, data in enumerate(data_loader):
88
+ with torch.no_grad():
89
+ # setup input data
90
+ inputs, labels = data
91
+
92
+ # group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
93
+ labels = torch.transpose(
94
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
95
+ ).reshape(labels.shape)
96
+ inputs = torch.transpose(
97
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
98
+ ).reshape(inputs.shape)
99
+
100
+ # dispatch data to GPU
101
+ if use_cuda:
102
+ inputs = inputs.cuda(non_blocking=True)
103
+ labels = labels.cuda(non_blocking=True)
104
+
105
+ # forward pass model
106
+ outputs = model(inputs)
107
+
108
+ # loss computation
109
+ loss = criterion(
110
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
111
+ )
112
+
113
+ eval_loss += loss.item()
114
+
115
+ eval_avg_loss = eval_loss / len(data_loader)
116
+ # save stats
117
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
118
+ # plot the last batch in the evaluation
119
+ figures = {
120
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
121
+ }
122
+ dashboard_logger.eval_figures(global_step, figures)
123
+ return eval_avg_loss
124
+
125
+
126
+ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
127
+ model.train()
128
+ best_loss = float("inf")
129
+ avg_loader_time = 0
130
+ end_time = time.time()
131
+ for epoch in range(c.epochs):
132
+ tot_loss = 0
133
+ epoch_time = 0
134
+ for _, data in enumerate(data_loader):
135
+ start_time = time.time()
136
+
137
+ # setup input data
138
+ inputs, labels = data
139
+ # group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
140
+ labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
141
+ labels.shape
142
+ )
143
+ inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
144
+ inputs.shape
145
+ )
146
+ # ToDo: move it to a unit test
147
+ # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
148
+ # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
149
+ # idx = 0
150
+ # for j in range(0, c.num_classes_in_batch, 1):
151
+ # for i in range(j, len(labels), c.num_classes_in_batch):
152
+ # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
153
+ # print("Invalid")
154
+ # print(labels)
155
+ # exit()
156
+ # idx += 1
157
+ # labels = labels_converted
158
+ # inputs = inputs_converted
159
+
160
+ loader_time = time.time() - end_time
161
+ global_step += 1
162
+
163
+ # setup lr
164
+ if c.lr_decay:
165
+ scheduler.step()
166
+ optimizer.zero_grad()
167
+
168
+ # dispatch data to GPU
169
+ if use_cuda:
170
+ inputs = inputs.cuda(non_blocking=True)
171
+ labels = labels.cuda(non_blocking=True)
172
+
173
+ # forward pass model
174
+ outputs = model(inputs)
175
+
176
+ # loss computation
177
+ loss = criterion(
178
+ outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
179
+ )
180
+ loss.backward()
181
+ grad_norm, _ = check_update(model, c.grad_clip)
182
+ optimizer.step()
183
+
184
+ step_time = time.time() - start_time
185
+ epoch_time += step_time
186
+
187
+ # accumulate the total epoch loss
188
+ tot_loss += loss.item()
189
+
190
+ # Averaged Loader Time
191
+ num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
192
+ avg_loader_time = (
193
+ 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
194
+ if avg_loader_time != 0
195
+ else loader_time
196
+ )
197
+ current_lr = optimizer.param_groups[0]["lr"]
198
+
199
+ if global_step % c.steps_plot_stats == 0:
200
+ # Plot Training Epoch Stats
201
+ train_stats = {
202
+ "loss": loss.item(),
203
+ "lr": current_lr,
204
+ "grad_norm": grad_norm,
205
+ "step_time": step_time,
206
+ "avg_loader_time": avg_loader_time,
207
+ }
208
+ dashboard_logger.train_epoch_stats(global_step, train_stats)
209
+ figures = {
210
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
211
+ }
212
+ dashboard_logger.train_figures(global_step, figures)
213
+
214
+ if global_step % c.print_step == 0:
215
+ print(
216
+ " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
217
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
218
+ global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
219
+ ),
220
+ flush=True,
221
+ )
222
+
223
+ if global_step % c.save_step == 0:
224
+ # save model
225
+ save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
226
+
227
+ end_time = time.time()
228
+
229
+ print("")
230
+ print(
231
+ ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
232
+ "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
233
+ epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
234
+ ),
235
+ flush=True,
236
+ )
237
+ # evaluation
238
+ if c.run_eval:
239
+ model.eval()
240
+ eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
241
+ print("\n\n")
242
+ print("--> EVAL PERFORMANCE")
243
+ print(
244
+ " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
245
+ flush=True,
246
+ )
247
+ # save the best checkpoint
248
+ best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
249
+ model.train()
250
+
251
+ return best_loss, global_step
252
+
253
+
254
+ def main(args): # pylint: disable=redefined-outer-name
255
+ # pylint: disable=global-variable-undefined
256
+ global meta_data_train
257
+ global meta_data_eval
258
+ global train_classes
259
+
260
+ ap = AudioProcessor(**c.audio)
261
+ model = setup_encoder_model(c)
262
+
263
+ optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
264
+
265
+ # pylint: disable=redefined-outer-name
266
+ meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
267
+
268
+ train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
269
+ if c.run_eval:
270
+ eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
271
+ else:
272
+ eval_data_loader = None
273
+
274
+ num_classes = len(train_classes)
275
+ criterion = model.get_criterion(c, num_classes)
276
+
277
+ if c.loss == "softmaxproto" and c.model != "speaker_encoder":
278
+ c.map_classid_to_classname = map_classid_to_classname
279
+ copy_model_files(c, OUT_PATH)
280
+
281
+ if args.restore_path:
282
+ criterion, args.restore_step = model.load_checkpoint(
283
+ c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
284
+ )
285
+ print(" > Model restored from step %d" % args.restore_step, flush=True)
286
+ else:
287
+ args.restore_step = 0
288
+
289
+ if c.lr_decay:
290
+ scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
291
+ else:
292
+ scheduler = None
293
+
294
+ num_params = count_parameters(model)
295
+ print("\n > Model has {} parameters".format(num_params), flush=True)
296
+
297
+ if use_cuda:
298
+ model = model.cuda()
299
+ criterion.cuda()
300
+
301
+ global_step = args.restore_step
302
+ _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
307
+
308
+ try:
309
+ main(args)
310
+ except KeyboardInterrupt:
311
+ remove_experiment_folder(OUT_PATH)
312
+ try:
313
+ sys.exit(0)
314
+ except SystemExit:
315
+ os._exit(0) # pylint: disable=protected-access
316
+ except Exception: # pylint: disable=broad-except
317
+ remove_experiment_folder(OUT_PATH)
318
+ traceback.print_exc()
319
+ sys.exit(1)
TTS/bin/train_tts.py ADDED
@@ -0,0 +1,75 @@
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append('.')
5
+ from dataclasses import dataclass, field
6
+
7
+ from trainer import Trainer, TrainerArgs
8
+
9
+ from TTS.config import load_config, register_config
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models import setup_model
12
+
13
+
14
+ @dataclass
15
+ class TrainTTSArgs(TrainerArgs):
16
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
17
+
18
+
19
+ def main():
20
+ os.environ["CUDA_VISIBLE_DEVICES"]="0"
21
+ """Run `tts` model training directly by a `config.json` file."""
22
+ # init trainer args
23
+ train_args = TrainTTSArgs()
24
+ parser = train_args.init_argparse(arg_prefix="")
25
+
26
+ # override trainer args from command-line args
27
+ args, config_overrides = parser.parse_known_args()
28
+ train_args.parse_args(args)
29
+
30
+ # load config.json and register
31
+ if args.config_path or args.continue_path:
32
+ if args.config_path:
33
+ # init from a file
34
+ config = load_config(args.config_path)
35
+ if len(config_overrides) > 0:
36
+ config.parse_known_args(config_overrides, relaxed_parser=True)
37
+ elif args.continue_path:
38
+ # continue from a prev experiment
39
+ config = load_config(os.path.join(args.continue_path, "config.json"))
40
+ if len(config_overrides) > 0:
41
+ config.parse_known_args(config_overrides, relaxed_parser=True)
42
+ else:
43
+ # init from console args
44
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
45
+
46
+ config_base = BaseTrainingConfig()
47
+ config_base.parse_known_args(config_overrides)
48
+ config = register_config(config_base.model)()
49
+
50
+ # load training samples
51
+ train_samples, eval_samples = load_tts_samples(
52
+ config.datasets,
53
+ eval_split=True,
54
+ eval_split_max_size=config.eval_split_max_size,
55
+ eval_split_size=config.eval_split_size,
56
+ )
57
+
58
+ # init the model from config
59
+ model = setup_model(config, train_samples + eval_samples)
60
+
61
+ # init the trainer and 🚀
62
+ trainer = Trainer(
63
+ train_args,
64
+ model.config,
65
+ config.output_path,
66
+ model=model,
67
+ train_samples=train_samples,
68
+ eval_samples=eval_samples,
69
+ parse_command_line_args=False,
70
+ )
71
+ trainer.fit()
72
+
73
+
74
+ if __name__ == "__main__":
75
+ main()
TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,81 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ import sys
4
+
5
+ sys.path.append('.')
6
+ from trainer import Trainer, TrainerArgs
7
+ import torch
8
+ from TTS.config import load_config, register_config
9
+ from TTS.utils.audio import AudioProcessor
10
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
11
+ from TTS.vocoder.models import setup_model
12
+
13
+
14
+ @dataclass
15
+ class TrainVocoderArgs(TrainerArgs):
16
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
17
+
18
+
19
+ def main():
20
+
21
+ os.environ["CUDA_VISIBLE_DEVICES"]="0"
22
+ """Run `tts` model training directly by a `config.json` file."""
23
+ # init trainer args
24
+ train_args = TrainVocoderArgs()
25
+ parser = train_args.init_argparse(arg_prefix="")
26
+
27
+ # override trainer args from command-line args
28
+ args, config_overrides = parser.parse_known_args()
29
+ train_args.parse_args(args)
30
+
31
+ # load config.json and register
32
+ if args.config_path or args.continue_path:
33
+ if args.config_path:
34
+ # init from a file
35
+ config = load_config(args.config_path)
36
+ if len(config_overrides) > 0:
37
+ config.parse_known_args(config_overrides, relaxed_parser=True)
38
+ elif args.continue_path:
39
+ # continue from a prev experiment
40
+ config = load_config(os.path.join(args.continue_path, "config.json"))
41
+ if len(config_overrides) > 0:
42
+ config.parse_known_args(config_overrides, relaxed_parser=True)
43
+ else:
44
+ # init from console args
45
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
46
+
47
+ config_base = BaseTrainingConfig()
48
+ config_base.parse_known_args(config_overrides)
49
+ config = register_config(config_base.model)()
50
+
51
+ # load training samples
52
+ if "feature_path" in config and config.feature_path:
53
+ # load pre-computed features
54
+ print(f" > Loading features from: {config.feature_path}")
55
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
56
+ else:
57
+ # load data raw wav files
58
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
59
+
60
+ # setup audio processor
61
+ ap = AudioProcessor(**config.audio)
62
+
63
+ # init the model from config
64
+ model = setup_model(config)
65
+
66
+ # init the trainer and 🚀
67
+ trainer = Trainer(
68
+ train_args,
69
+ config,
70
+ config.output_path,
71
+ model=model,
72
+ train_samples=train_samples,
73
+ eval_samples=eval_samples,
74
+ training_assets={"audio_processor": ap},
75
+ parse_command_line_args=False,
76
+ )
77
+ trainer.fit()
78
+
79
+
80
+ if __name__ == "__main__":
81
+ main()
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,103 @@
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.models import setup_model
15
+
16
+ if __name__ == "__main__":
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
19
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
20
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
21
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
22
+ parser.add_argument(
23
+ "--num_iter",
24
+ type=int,
25
+ help="Number of model inference iterations that you like to optimize noise schedule for.",
26
+ )
27
+ parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
28
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
29
+ parser.add_argument(
30
+ "--search_depth",
31
+ type=int,
32
+ default=3,
33
+ help="Search granularity. Increasing this increases the run-time exponentially.",
34
+ )
35
+
36
+ # load config
37
+ args = parser.parse_args()
38
+ config = load_config(args.config_path)
39
+
40
+ # setup audio processor
41
+ ap = AudioProcessor(**config.audio)
42
+
43
+ # load dataset
44
+ _, train_data = load_wav_data(args.data_path, 0)
45
+ train_data = train_data[: args.num_samples]
46
+ dataset = WaveGradDataset(
47
+ ap=ap,
48
+ items=train_data,
49
+ seq_len=-1,
50
+ hop_len=ap.hop_length,
51
+ pad_short=config.pad_short,
52
+ conv_pad=config.conv_pad,
53
+ is_training=True,
54
+ return_segments=False,
55
+ use_noise_augment=False,
56
+ use_cache=False,
57
+ verbose=True,
58
+ )
59
+ loader = DataLoader(
60
+ dataset,
61
+ batch_size=1,
62
+ shuffle=False,
63
+ collate_fn=dataset.collate_full_clips,
64
+ drop_last=False,
65
+ num_workers=config.num_loader_workers,
66
+ pin_memory=False,
67
+ )
68
+
69
+ # setup the model
70
+ model = setup_model(config)
71
+ if args.use_cuda:
72
+ model.cuda()
73
+
74
+ # setup optimization parameters
75
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
76
+ print(f" > base values: {base_values}")
77
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
78
+ best_error = float("inf")
79
+ best_schedule = None # pylint: disable=C0103
80
+ total_search_iter = len(base_values) ** args.num_iter
81
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
82
+ beta = exponents * base
83
+ model.compute_noise_level(beta)
84
+ for data in loader:
85
+ mel, audio = data
86
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
87
+
88
+ if args.use_cuda:
89
+ y_hat = y_hat.cpu()
90
+ y_hat = y_hat.numpy()
91
+
92
+ mel_hat = []
93
+ for i in range(y_hat.shape[0]):
94
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
95
+ mel_hat.append(torch.from_numpy(m))
96
+
97
+ mel_hat = torch.stack(mel_hat)
98
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
99
+ if mse.item() < best_error:
100
+ best_error = mse.item()
101
+ best_schedule = {"beta": beta}
102
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
103
+ np.save(args.output_path, best_schedule)
TTS/config/.ipynb_checkpoints/__init__-checkpoint.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
+ def read_json_with_comments(json_path):
15
+ """for backward compat."""
16
+ # fallback to json
17
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
+ input_str = f.read()
19
+ # handle comments
20
+ input_str = re.sub(r"\\\n", "", input_str)
21
+ input_str = re.sub(r"//.*\n", "\n", input_str)
22
+ data = json.loads(input_str)
23
+ return data
24
+
25
+
26
+ def register_config(model_name: str) -> Coqpit:
27
+ """Find the right config for the given model name.
28
+
29
+ Args:
30
+ model_name (str): Model name.
31
+
32
+ Raises:
33
+ ModuleNotFoundError: No matching config for the model name.
34
+
35
+ Returns:
36
+ Coqpit: config class.
37
+ """
38
+ config_class = None
39
+ config_name = model_name + "_config"
40
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
41
+ for path in paths:
42
+ try:
43
+ config_class = find_module(path, config_name)
44
+ except ModuleNotFoundError:
45
+ pass
46
+ if config_class is None:
47
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
48
+ return config_class
49
+
50
+
51
+ def _process_model_name(config_dict: Dict) -> str:
52
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
53
+
54
+ Args:
55
+ config_dict (Dict): A dictionary including the config fields.
56
+
57
+ Returns:
58
+ str: Formatted model name.
59
+ """
60
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
61
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
62
+ return model_name
63
+
64
+
65
+ def load_config(config_path: str) -> Coqpit:
66
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
67
+ to find the corresponding Config class. Then initialize the Config.
68
+
69
+ Args:
70
+ config_path (str): path to the config file.
71
+
72
+ Raises:
73
+ TypeError: given config file has an unknown type.
74
+
75
+ Returns:
76
+ Coqpit: TTS config object.
77
+ """
78
+ config_dict = {}
79
+ ext = os.path.splitext(config_path)[1]
80
+ if ext in (".yml", ".yaml"):
81
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
82
+ data = yaml.safe_load(f)
83
+ elif ext == ".json":
84
+ try:
85
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
86
+ data = json.load(f)
87
+ except json.decoder.JSONDecodeError:
88
+ # backwards compat.
89
+ data = read_json_with_comments(config_path)
90
+ else:
91
+ raise TypeError(f" [!] Unknown config file type {ext}")
92
+ config_dict.update(data)
93
+ model_name = _process_model_name(config_dict)
94
+ config_class = register_config(model_name.lower())
95
+ config = config_class()
96
+ config.from_dict(config_dict)
97
+ return config
98
+
99
+
100
+ def check_config_and_model_args(config, arg_name, value):
101
+ """Check the give argument in `config.model_args` if exist or in `config` for
102
+ the given value.
103
+
104
+ Return False if the argument does not exist in `config.model_args` or `config`.
105
+ This is to patch up the compatibility between models with and without `model_args`.
106
+
107
+ TODO: Remove this in the future with a unified approach.
108
+ """
109
+ if hasattr(config, "model_args"):
110
+ if arg_name in config.model_args:
111
+ return config.model_args[arg_name] == value
112
+ if hasattr(config, arg_name):
113
+ return config[arg_name] == value
114
+ return False
115
+
116
+
117
+ def get_from_config_or_model_args(config, arg_name):
118
+ """Get the given argument from `config.model_args` if exist or in `config`."""
119
+ if hasattr(config, "model_args"):
120
+ if arg_name in config.model_args:
121
+ return config.model_args[arg_name]
122
+ return config[arg_name]
123
+
124
+
125
+ def get_from_config_or_model_args_with_default(config, arg_name, def_val):
126
+ """Get the given argument from `config.model_args` if exist or in `config`."""
127
+ if hasattr(config, "model_args"):
128
+ if arg_name in config.model_args:
129
+ return config.model_args[arg_name]
130
+ if hasattr(config, arg_name):
131
+ return config[arg_name]
132
+ return def_val
TTS/config/.ipynb_checkpoints/config-checkpoint.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "model": "glow_tts",
3
+ "batch_size": 32,
4
+ "eval_batch_size": 16,
5
+ "num_loader_workers": 4,
6
+ "num_eval_loader_workers": 4,
7
+ "run_eval": true,
8
+ "test_delay_epochs": -1,
9
+ "epochs": 1000,
10
+ "text_cleaner": "english_cleaners",
11
+ "use_phonemes": false,
12
+ "phoneme_language": "en-us",
13
+ "phoneme_cache_path": "phoneme_cache",
14
+ "print_step": 25,
15
+ "print_eval": true,
16
+ "mixed_precision": false,
17
+ "output_path": "recipes/ljspeech/glow_tts/",
18
+ "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
19
+ "datasets":[{"formatter": "infore", "meta_file_train":"scripts.csv", "path":"/Users/saltlux/Code/SpeechSynthesis/Dataset/25hours/"}]
20
+ }
TTS/config/.ipynb_checkpoints/shared_configs-checkpoint.py ADDED
@@ -0,0 +1,264 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
+ @dataclass
9
+ class BaseAudioConfig(Coqpit):
10
+ """Base config to definge audio processing parameters. It is used to initialize
11
+ ```TTS.utils.audio.AudioProcessor.```
12
+
13
+ Args:
14
+ fft_size (int):
15
+ Number of STFT frequency levels, i.e. the size of the linear spectrogram frame. Defaults to 1024.
16
+
17
+ win_length (int):
18
+ Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
+ ```fft_size```. Defaults to 1024.
20
+
21
+ hop_length (int):
22
+ Number of audio samples between adjacent STFT columns. Defaults to 256.
23
+
24
+ frame_shift_ms (int):
25
+ Set ```hop_length``` based on milliseconds and sampling rate.
26
+
27
+ frame_length_ms (int):
28
+ Set ```win_length``` based on milliseconds and sampling rate.
29
+
30
+ stft_pad_mode (str):
31
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
+
33
+ sample_rate (int):
34
+ Audio sampling rate. Defaults to 22050.
35
+
36
+ resample (bool):
37
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
+
39
+ preemphasis (float):
40
+ Preemphasis coefficient. Defaults to 0.0.
41
+
42
+ ref_level_db (int): 20
43
+ Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the sound of air.
44
+ Defaults to 20.
45
+
46
+ do_sound_norm (bool):
47
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
+
49
+ log_func (str):
50
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
+
52
+ do_trim_silence (bool):
53
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
+
55
+ do_amp_to_db_linear (bool, optional):
56
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
+
58
+ do_amp_to_db_mel (bool, optional):
59
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
+
61
+ pitch_fmax (float, optional):
62
+ Maximum frequency of the F0 frames. Defaults to ```640```.
63
+
64
+ pitch_fmin (float, optional):
65
+ Minimum frequency of the F0 frames. Defaults to ```1```.
66
+
67
+ trim_db (int):
68
+ Silence threshold used for silence trimming. Defaults to 45.
69
+
70
+ do_rms_norm (bool, optional):
71
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
+
73
+ db_level (int, optional):
74
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
+
76
+ power (float):
77
+ Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
+ artifacts in the synthesized voice. Defaults to 1.5.
79
+
80
+ griffin_lim_iters (int):
81
+ Number of Griffin-Lim iterations. Defaults to 60.
82
+
83
+ num_mels (int):
84
+ Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
85
+
86
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
+ It needs to be adjusted for a dataset. Defaults to 0.
88
+
89
+ mel_fmax (float):
90
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
+
92
+ spec_gain (int):
93
+ Gain applied when converting amplitude to DB. Defaults to 20.
94
+
95
+ signal_norm (bool):
96
+ enable/disable signal normalization. Defaults to True.
97
+
98
+ min_level_db (int):
99
+ minimum db threshold for the computed melspectrograms. Defaults to -100.
100
+
101
+ symmetric_norm (bool):
102
+ enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
103
+ [0, k], Defaults to True.
104
+
105
+ max_norm (float):
106
+ ```k``` defining the normalization range. Defaults to 4.0.
107
+
108
+ clip_norm (bool):
109
+ enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
110
+
111
+ stats_path (str):
112
+ Path to the computed stats file. Defaults to None.
113
+ """
114
+
115
+ # stft parameters
116
+ fft_size: int = 1024
117
+ win_length: int = 1024
118
+ hop_length: int = 256
119
+ frame_shift_ms: int = None
120
+ frame_length_ms: int = None
121
+ stft_pad_mode: str = "reflect"
122
+ # audio processing parameters
123
+ sample_rate: int = 22050
124
+ resample: bool = False
125
+ preemphasis: float = 0.0
126
+ ref_level_db: int = 20
127
+ do_sound_norm: bool = False
128
+ log_func: str = "np.log10"
129
+ # silence trimming
130
+ do_trim_silence: bool = True
131
+ trim_db: int = 45
132
+ # rms volume normalization
133
+ do_rms_norm: bool = False
134
+ db_level: float = None
135
+ # griffin-lim params
136
+ power: float = 1.5
137
+ griffin_lim_iters: int = 60
138
+ # mel-spec params
139
+ num_mels: int = 80
140
+ mel_fmin: float = 0.0
141
+ mel_fmax: float = None
142
+ spec_gain: int = 20
143
+ do_amp_to_db_linear: bool = True
144
+ do_amp_to_db_mel: bool = True
145
+ # f0 params
146
+ pitch_fmax: float = 640.0
147
+ pitch_fmin: float = 1.0
148
+ # normalization params
149
+ signal_norm: bool = True
150
+ min_level_db: int = -100
151
+ symmetric_norm: bool = True
152
+ max_norm: float = 4.0
153
+ clip_norm: bool = True
154
+ stats_path: str = None
155
+
156
+ def check_values(
157
+ self,
158
+ ):
159
+ """Check config fields"""
160
+ c = asdict(self)
161
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
+ check_argument(
165
+ "frame_length_ms",
166
+ c,
167
+ restricted=True,
168
+ min_val=10,
169
+ max_val=1000,
170
+ alternative="win_length",
171
+ )
172
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
+
179
+ # normalization parameters
180
+ check_argument("signal_norm", c, restricted=True)
181
+ check_argument("symmetric_norm", c, restricted=True)
182
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
+ check_argument("clip_norm", c, restricted=True)
184
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
+ check_argument("do_trim_silence", c, restricted=True)
188
+ check_argument("trim_db", c, restricted=True)
189
+
190
+
191
+ @dataclass
192
+ class BaseDatasetConfig(Coqpit):
193
+ """Base config for TTS datasets.
194
+
195
+ Args:
196
+ formatter (str):
197
+ Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
+
199
+ dataset_name (str):
200
+ Unique name for the dataset. Defaults to `""`.
201
+
202
+ path (str):
203
+ Root path to the dataset files. Defaults to `""`.
204
+
205
+ meta_file_train (str):
206
+ Name of the dataset meta file, or a list of speakers to be ignored during training for multi-speaker datasets.
207
+ Defaults to `""`.
208
+
209
+ ignored_speakers (List):
210
+ List of speaker IDs that are not used during training. Defaults to None.
211
+
212
+ language (str):
213
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
+
215
+ meta_file_val (str):
216
+ Name of the dataset meta file that defines the instances used for validation.
217
+
218
+ meta_file_attn_mask (str):
219
+ Path to the file that lists the attention mask files used with models that require attention masks to
220
+ train the duration predictor.
221
+ """
222
+
223
+ formatter: str = ""
224
+ dataset_name: str = ""
225
+ path: str = ""
226
+ meta_file_train: str = ""
227
+ ignored_speakers: List[str] = None
228
+ language: str = ""
229
+ meta_file_val: str = ""
230
+ meta_file_attn_mask: str = ""
231
+
232
+ def check_values(
233
+ self,
234
+ ):
235
+ """Check config fields"""
236
+ c = asdict(self)
237
+ check_argument("formatter", c, restricted=True)
238
+ check_argument("path", c, restricted=True)
239
+ check_argument("meta_file_train", c, restricted=True)
240
+ check_argument("meta_file_val", c, restricted=False)
241
+ check_argument("meta_file_attn_mask", c, restricted=False)
242
+
243
+
244
+ @dataclass
245
+ class BaseTrainingConfig(TrainerConfig):
246
+ """Base config to define the basic 🐸TTS training parameters that are shared
247
+ among all the models. It is based on ```Trainer.TrainingConfig```.
248
+
249
+ Args:
250
+ model (str):
251
+ Name of the model that is used in the training.
252
+
253
+ num_loader_workers (int):
254
+ Number of workers for training time dataloader.
255
+
256
+ num_eval_loader_workers (int):
257
+ Number of workers for evaluation time dataloader.
258
+ """
259
+
260
+ model: str = None
261
+ # dataloading
262
+ num_loader_workers: int = 0
263
+ num_eval_loader_workers: int = 0
264
+ use_noise_augment: bool = False
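As a quick sanity check, a minimal usage sketch of the dataclasses above (assuming they are importable from `TTS.config.shared_configs` and that `coqpit` is installed; the corpus path and file names are placeholders):

```python
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

# Override a few audio defaults and run the range checks from check_values(),
# e.g. 10 <= num_mels <= 2056 and 512 <= sample_rate <= 100000.
audio_config = BaseAudioConfig(sample_rate=16000, num_mels=80, trim_db=40)
audio_config.check_values()

# Point a dataset config at an LJSpeech-style corpus (paths are illustrative only).
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata.csv",
    path="/data/LJSpeech-1.1/",
)
dataset_config.check_values()  # formatter, path and meta_file_train must be non-empty
```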
TTS/config/__init__.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
+ def read_json_with_comments(json_path):
15
+ """for backward compat."""
16
+ # fallback to json
17
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
+ input_str = f.read()
19
+ # handle comments
20
+ input_str = re.sub(r"\\\n", "", input_str)
21
+ input_str = re.sub(r"//.*\n", "\n", input_str)
22
+ data = json.loads(input_str)
23
+ return data
24
+
25
+
26
+ def register_config(model_name: str) -> Coqpit:
27
+ """Find the right config for the given model name.
28
+
29
+ Args:
30
+ model_name (str): Model name.
31
+
32
+ Raises:
33
+ ModuleNotFoundError: No matching config for the model name.
34
+
35
+ Returns:
36
+ Coqpit: config class.
37
+ """
38
+ config_class = None
39
+ config_name = model_name + "_config"
40
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
41
+ for path in paths:
42
+ try:
43
+ config_class = find_module(path, config_name)
44
+ except ModuleNotFoundError:
45
+ pass
46
+ if config_class is None:
47
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
48
+ return config_class
49
+
50
+
51
+ def _process_model_name(config_dict: Dict) -> str:
52
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
53
+
54
+ Args:
55
+ config_dict (Dict): A dictionary including the config fields.
56
+
57
+ Returns:
58
+ str: Formatted model name.
59
+ """
60
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
61
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
62
+ return model_name
63
+
64
+
65
+ def load_config(config_path: str) -> Coqpit:
66
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
67
+ to find the corresponding Config class. Then initialize the Config.
68
+
69
+ Args:
70
+ config_path (str): path to the config file.
71
+
72
+ Raises:
73
+ TypeError: given config file has an unknown type.
74
+
75
+ Returns:
76
+ Coqpit: TTS config object.
77
+ """
78
+ config_dict = {}
79
+ ext = os.path.splitext(config_path)[1]
80
+ if ext in (".yml", ".yaml"):
81
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
82
+ data = yaml.safe_load(f)
83
+ elif ext == ".json":
84
+ try:
85
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
86
+ data = json.load(f)
87
+ except json.decoder.JSONDecodeError:
88
+ # backwards compat.
89
+ data = read_json_with_comments(config_path)
90
+ else:
91
+ raise TypeError(f" [!] Unknown config file type {ext}")
92
+ config_dict.update(data)
93
+ model_name = _process_model_name(config_dict)
94
+ config_class = register_config(model_name.lower())
95
+ config = config_class()
96
+ config.from_dict(config_dict)
97
+ return config
98
+
99
+
100
+ def check_config_and_model_args(config, arg_name, value):
101
+ """Check the give argument in `config.model_args` if exist or in `config` for
102
+ the given value.
103
+
104
+ Return False if the argument does not exist in `config.model_args` or `config`.
105
+ This is to patch up the compatibility between models with and without `model_args`.
106
+
107
+ TODO: Remove this in the future with a unified approach.
108
+ """
109
+ if hasattr(config, "model_args"):
110
+ if arg_name in config.model_args:
111
+ return config.model_args[arg_name] == value
112
+ if hasattr(config, arg_name):
113
+ return config[arg_name] == value
114
+ return False
115
+
116
+
117
+ def get_from_config_or_model_args(config, arg_name):
118
+ """Get the given argument from `config.model_args` if exist or in `config`."""
119
+ if hasattr(config, "model_args"):
120
+ if arg_name in config.model_args:
121
+ return config.model_args[arg_name]
122
+ return config[arg_name]
123
+
124
+
125
+ def get_from_config_or_model_args_with_default(config, arg_name, def_val):
126
+ """Get the given argument from `config.model_args` if exist or in `config`."""
127
+ if hasattr(config, "model_args"):
128
+ if arg_name in config.model_args:
129
+ return config.model_args[arg_name]
130
+ if hasattr(config, arg_name):
131
+ return config[arg_name]
132
+ return def_val
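A short sketch of how these helpers are typically combined; the config path and the `use_speaker_embedding` / `num_speakers` fields are illustrative and not guaranteed to exist in every model config:

```python
from TTS.config import (
    check_config_and_model_args,
    get_from_config_or_model_args_with_default,
    load_config,
)

config = load_config("recipes/ljspeech/glow_tts/config.json")  # placeholder path

# True only if the field exists (in config.model_args or config) and equals the value.
is_multi_speaker = check_config_and_model_args(config, "use_speaker_embedding", True)

# Falls back to the given default when the field is missing in both places.
num_speakers = get_from_config_or_model_args_with_default(config, "num_speakers", 0)
```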
TTS/config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.95 kB). View file
 
TTS/config/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (3.93 kB). View file
 
TTS/config/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (3.89 kB). View file
 
TTS/config/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (3.97 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-310.pyc ADDED
Binary file (9.33 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-37.pyc ADDED
Binary file (9.3 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-38.pyc ADDED
Binary file (9.32 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-39.pyc ADDED
Binary file (9.34 kB). View file
 
TTS/config/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "model": "glow_tts",
3
+ "batch_size": 32,
4
+ "eval_batch_size": 16,
5
+ "num_loader_workers": 4,
6
+ "num_eval_loader_workers": 4,
7
+ "run_eval": true,
8
+ "test_delay_epochs": -1,
9
+ "epochs": 1000,
10
+ "text_cleaner": "english_cleaners",
11
+ "use_phonemes": false,
12
+ "phoneme_language": "en-us",
13
+ "phoneme_cache_path": "phoneme_cache",
14
+ "print_step": 25,
15
+ "print_eval": true,
16
+ "mixed_precision": false,
17
+ "output_path": "recipes/ljspeech/glow_tts/",
18
+ "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
19
+ "datasets":[{"formatter": "infore", "meta_file_train":"scripts.csv", "path":"/Users/saltlux/Code/SpeechSynthesis/Dataset/25hours/"}],
20
+ "characters":{
21
+ "characters": "abcdeghiklmnopqrstuvxyàáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ",
22
+ "phonemes": null,
23
+ "unique": true
24
+ }
25
+ }
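This JSON can be loaded with the `load_config` helper above; it reads the `"model": "glow_tts"` field and resolves the matching config class through `register_config`. A minimal sketch (the path is illustrative and assumes a `glow_tts_config` module is available under one of the searched config packages):

```python
from TTS.config import load_config

config = load_config("TTS/config/config.json")  # the JSON shown above
print(config.model)       # -> "glow_tts"
print(config.batch_size)  # -> 32
```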
TTS/config/shared_configs.py ADDED
@@ -0,0 +1,264 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
+ @dataclass
9
+ class BaseAudioConfig(Coqpit):
10
+ """Base config to definge audio processing parameters. It is used to initialize
11
+ ```TTS.utils.audio.AudioProcessor.```
12
+
13
+ Args:
14
+ fft_size (int):
15
+ Number of STFT frequency levels, aka. size of the linear spectrogram frame. Defaults to 1024.
16
+
17
+ win_length (int):
18
+ Each frame of audio is windowed with a window of length ```win_length``` and then padded with zeros to match
19
+ ```fft_size```. Defaults to 1024.
20
+
21
+ hop_length (int):
22
+ Number of audio samples between adjacent STFT columns. Defaults to 256.
23
+
24
+ frame_shift_ms (int):
25
+ Set ```hop_length``` based on milliseconds and sampling rate.
26
+
27
+ frame_length_ms (int):
28
+ Set ```win_length``` based on milliseconds and sampling rate.
29
+
30
+ stft_pad_mode (str):
31
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
+
33
+ sample_rate (int):
34
+ Audio sampling rate. Defaults to 22050.
35
+
36
+ resample (bool):
37
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
+
39
+ preemphasis (float):
40
+ Preemphasis coefficient. Defaults to 0.0.
41
+
42
+ ref_level_db (int):
43
+ Reference dB level to rebase the audio signal and ignore the level below. 20 dB is assumed to be the sound of air.
44
+ Defaults to 20.
45
+
46
+ do_sound_norm (bool):
47
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
+
49
+ log_func (str):
50
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
+
52
+ do_trim_silence (bool):
53
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
+
55
+ do_amp_to_db_linear (bool, optional):
56
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
+
58
+ do_amp_to_db_mel (bool, optional):
59
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
+
61
+ pitch_fmax (float, optional):
62
+ Maximum frequency of the F0 frames. Defaults to ```640```.
63
+
64
+ pitch_fmin (float, optional):
65
+ Minimum frequency of the F0 frames. Defaults to ```1```.
66
+
67
+ trim_db (int):
68
+ Silence threshold used for silence trimming. Defaults to 45.
69
+
70
+ do_rms_norm (bool, optional):
71
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
+
73
+ db_level (int, optional):
74
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
+
76
+ power (float):
77
+ Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
+ artifacts in the synthesized voice. Defaults to 1.5.
79
+
80
+ griffin_lim_iters (int):
81
+ Number of Griffin-Lim iterations. Defaults to 60.
82
+
83
+ num_mels (int):
84
+ Number of mel-basis filters, i.e. the number of bands in each mel-spectrogram frame. Defaults to 80.
85
+
86
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
+ It needs to be adjusted for a dataset. Defaults to 0.
88
+
89
+ mel_fmax (float):
90
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
+
92
+ spec_gain (int):
93
+ Gain applied when converting amplitude to DB. Defaults to 20.
94
+
95
+ signal_norm (bool):
96
+ enable/disable signal normalization. Defaults to True.
97
+
98
+ min_level_db (int):
99
+ minimum dB threshold for the computed mel-spectrograms. Defaults to -100.
100
+
101
+ symmetric_norm (bool):
102
+ enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k]; otherwise
103
+ [0, k]. Defaults to True.
104
+
105
+ max_norm (float):
106
+ ```k``` defining the normalization range. Defaults to 4.0.
107
+
108
+ clip_norm (bool):
109
+ enable/disable clipping of out-of-range values in the normalized audio signal. Defaults to True.
110
+
111
+ stats_path (str):
112
+ Path to the computed stats file. Defaults to None.
113
+ """
114
+
115
+ # stft parameters
116
+ fft_size: int = 1024
117
+ win_length: int = 1024
118
+ hop_length: int = 256
119
+ frame_shift_ms: int = None
120
+ frame_length_ms: int = None
121
+ stft_pad_mode: str = "reflect"
122
+ # audio processing parameters
123
+ sample_rate: int = 22050
124
+ resample: bool = False
125
+ preemphasis: float = 0.0
126
+ ref_level_db: int = 20
127
+ do_sound_norm: bool = False
128
+ log_func: str = "np.log10"
129
+ # silence trimming
130
+ do_trim_silence: bool = True
131
+ trim_db: int = 45
132
+ # rms volume normalization
133
+ do_rms_norm: bool = False
134
+ db_level: float = None
135
+ # griffin-lim params
136
+ power: float = 1.5
137
+ griffin_lim_iters: int = 60
138
+ # mel-spec params
139
+ num_mels: int = 80
140
+ mel_fmin: float = 0.0
141
+ mel_fmax: float = None
142
+ spec_gain: int = 20
143
+ do_amp_to_db_linear: bool = True
144
+ do_amp_to_db_mel: bool = True
145
+ # f0 params
146
+ pitch_fmax: float = 640.0
147
+ pitch_fmin: float = 1.0
148
+ # normalization params
149
+ signal_norm: bool = True
150
+ min_level_db: int = -100
151
+ symmetric_norm: bool = True
152
+ max_norm: float = 4.0
153
+ clip_norm: bool = True
154
+ stats_path: str = None
155
+
156
+ def check_values(
157
+ self,
158
+ ):
159
+ """Check config fields"""
160
+ c = asdict(self)
161
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
+ check_argument(
165
+ "frame_length_ms",
166
+ c,
167
+ restricted=True,
168
+ min_val=10,
169
+ max_val=1000,
170
+ alternative="win_length",
171
+ )
172
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
+
179
+ # normalization parameters
180
+ check_argument("signal_norm", c, restricted=True)
181
+ check_argument("symmetric_norm", c, restricted=True)
182
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
+ check_argument("clip_norm", c, restricted=True)
184
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
+ check_argument("do_trim_silence", c, restricted=True)
188
+ check_argument("trim_db", c, restricted=True)
189
+
190
+
191
+ @dataclass
192
+ class BaseDatasetConfig(Coqpit):
193
+ """Base config for TTS datasets.
194
+
195
+ Args:
196
+ formatter (str):
197
+ Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
+
199
+ dataset_name (str):
200
+ Unique name for the dataset. Defaults to `""`.
201
+
202
+ path (str):
203
+ Root path to the dataset files. Defaults to `""`.
204
+
205
+ meta_file_train (str):
206
+ Name of the dataset meta file, or a list of speakers to be ignored during training for multi-speaker datasets.
207
+ Defaults to `""`.
208
+
209
+ ignored_speakers (List):
210
+ List of speaker IDs that are not used during training. Defaults to None.
211
+
212
+ language (str):
213
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
+
215
+ meta_file_val (str):
216
+ Name of the dataset meta file that defines the instances used for validation.
217
+
218
+ meta_file_attn_mask (str):
219
+ Path to the file that lists the attention mask files used with models that require attention masks to
220
+ train the duration predictor.
221
+ """
222
+
223
+ formatter: str = ""
224
+ dataset_name: str = ""
225
+ path: str = ""
226
+ meta_file_train: str = ""
227
+ ignored_speakers: List[str] = None
228
+ language: str = ""
229
+ meta_file_val: str = ""
230
+ meta_file_attn_mask: str = ""
231
+
232
+ def check_values(
233
+ self,
234
+ ):
235
+ """Check config fields"""
236
+ c = asdict(self)
237
+ check_argument("formatter", c, restricted=True)
238
+ check_argument("path", c, restricted=True)
239
+ check_argument("meta_file_train", c, restricted=True)
240
+ check_argument("meta_file_val", c, restricted=False)
241
+ check_argument("meta_file_attn_mask", c, restricted=False)
242
+
243
+
244
+ @dataclass
245
+ class BaseTrainingConfig(TrainerConfig):
246
+ """Base config to define the basic 🐸TTS training parameters that are shared
247
+ among all the models. It is based on ```Trainer.TrainingConfig```.
248
+
249
+ Args:
250
+ model (str):
251
+ Name of the model that is used in the training.
252
+
253
+ num_loader_workers (int):
254
+ Number of workers for training time dataloader.
255
+
256
+ num_eval_loader_workers (int):
257
+ Number of workers for evaluation time dataloader.
258
+ """
259
+
260
+ model: str = None
261
+ # dataloading
262
+ num_loader_workers: int = 0
263
+ num_eval_loader_workers: int = 0
264
+ use_noise_augment: bool = False
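For intuition about the normalization fields documented above (`signal_norm`, `min_level_db`, `symmetric_norm`, `max_norm`, `clip_norm`), here is an illustrative sketch of the mapping they describe. This is not the actual `AudioProcessor` implementation, only the transform implied by the docstring:

```python
import numpy as np

def normalize_db_spec(spec_db, min_level_db=-100, max_norm=4.0,
                      symmetric_norm=True, clip_norm=True):
    """Map a dB-scale spectrogram into [-max_norm, max_norm] (symmetric)
    or [0, max_norm] (asymmetric), as described in BaseAudioConfig."""
    spec = (spec_db - min_level_db) / -min_level_db  # roughly 0..1
    if symmetric_norm:
        spec = 2 * max_norm * spec - max_norm
        return np.clip(spec, -max_norm, max_norm) if clip_norm else spec
    spec = max_norm * spec
    return np.clip(spec, 0.0, max_norm) if clip_norm else spec
```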
TTS/encoder/README.md ADDED
@@ -0,0 +1,18 @@
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
+ - Watch training on TensorBoard as in TTS
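To double-check the note above that the encoder's audio parameters should match your TTS model, a hedged sketch using the `load_config` helper from `TTS.config` (both paths are placeholders, and it assumes both configs expose a `BaseAudioConfig` under `.audio`):

```python
from TTS.config import load_config

encoder_cfg = load_config("speaker_encoder/config.json")        # placeholder path
tts_cfg = load_config("recipes/ljspeech/glow_tts/config.json")  # placeholder path

assert encoder_cfg.audio.sample_rate == tts_cfg.audio.sample_rate
assert encoder_cfg.audio.num_mels == tts_cfg.audio.num_mels
```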
TTS/encoder/__init__.py ADDED
File without changes
TTS/encoder/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (145 Bytes). View file
 
TTS/encoder/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (112 Bytes). View file
 
TTS/encoder/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (143 Bytes). View file
 
TTS/encoder/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (166 Bytes). View file