Spaces:
Running
on
Zero
Running
on
Zero
Added TTS: MaskGCT & StyleTTS kokoro; Edge space fixed
Browse files
app.py
CHANGED
@@ -98,16 +98,24 @@ AVAILABLE_MODELS = {
|
|
98 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0 # overly jolly
|
99 |
|
100 |
# # Microsoft Edge TTS
|
101 |
-
|
102 |
|
103 |
# IMS-Toucan
|
104 |
-
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
105 |
|
106 |
# IMS-Toucan English non-artificial
|
107 |
'Flux9665/EnglishToucan': 'Flux9665/EnglishToucan', # 5.1
|
108 |
|
109 |
# StyleTTS v2
|
110 |
-
'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
# HF TTS w issues
|
113 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
@@ -276,10 +284,38 @@ HF_SPACES = {
|
|
276 |
'function': '/synthesize',
|
277 |
'text_param_index': 0,
|
278 |
'return_audio_index': 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
'is_zero_gpu_space': True,
|
280 |
'series': 'StyleTTS',
|
281 |
},
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# TTS w issues
|
284 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
285 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
@@ -411,6 +447,31 @@ OVERRIDE_INPUTS = {
|
|
411 |
3: 8, # lngsteps
|
412 |
},
|
413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
}
|
415 |
|
416 |
hf_clients: Tuple[Client] = {}
|
|
|
98 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0 # overly jolly
|
99 |
|
100 |
# # Microsoft Edge TTS
|
101 |
+
'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
|
102 |
|
103 |
# IMS-Toucan
|
104 |
+
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1 # randomly changes pitch
|
105 |
|
106 |
# IMS-Toucan English non-artificial
|
107 |
'Flux9665/EnglishToucan': 'Flux9665/EnglishToucan', # 5.1
|
108 |
|
109 |
# StyleTTS v2
|
110 |
+
# 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2',
|
111 |
+
# StyleTTS kokoro
|
112 |
+
'hexgrad/kokoro': 'hexgrad/kokoro',
|
113 |
+
|
114 |
+
# MaskGCT (by Amphion)
|
115 |
+
# DEMANDS 300 seconds of ZeroGPU
|
116 |
+
# 'amphion/maskgct': 'amphion/maskgct',
|
117 |
+
# default ZeroGPU borrow time
|
118 |
+
# 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
|
119 |
|
120 |
# HF TTS w issues
|
121 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
|
|
284 |
'function': '/synthesize',
|
285 |
'text_param_index': 0,
|
286 |
'return_audio_index': 0,
|
287 |
+
# 'is_zero_gpu_space': True,
|
288 |
+
'series': 'StyleTTS',
|
289 |
+
},
|
290 |
+
|
291 |
+
# StyleTTS v2 kokoro fine tune
|
292 |
+
'hexgrad/kokoro': {
|
293 |
+
'name': 'StyleTTS kokoro',
|
294 |
+
'function': '/generate',
|
295 |
+
'text_param_index': 0,
|
296 |
+
'return_audio_index': 0,
|
297 |
'is_zero_gpu_space': True,
|
298 |
'series': 'StyleTTS',
|
299 |
},
|
300 |
|
301 |
+
# MaskGCT (by Amphion)
|
302 |
+
'amphion/maskgct': {
|
303 |
+
'name': 'MaskGCT',
|
304 |
+
'function': '/predict',
|
305 |
+
'text_param_index': 1,
|
306 |
+
'return_audio_index': 0,
|
307 |
+
'is_zero_gpu_space': True,
|
308 |
+
'series': 'MaskGCT',
|
309 |
+
},
|
310 |
+
'Svngoku/maskgct-audio-lab': {
|
311 |
+
'name': 'MaskGCT',
|
312 |
+
'function': '/predict',
|
313 |
+
'text_param_index': 1,
|
314 |
+
'return_audio_index': 0,
|
315 |
+
'is_zero_gpu_space': True,
|
316 |
+
'series': 'MaskGCT',
|
317 |
+
},
|
318 |
+
|
319 |
# TTS w issues
|
320 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
321 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
|
|
447 |
3: 8, # lngsteps
|
448 |
},
|
449 |
|
450 |
+
# StyleTTS 2 kokoro
|
451 |
+
'hexgrad/kokoro': {
|
452 |
+
1: "af_0", #voice
|
453 |
+
2: None, #ps
|
454 |
+
3: 1, #speed
|
455 |
+
4: 0.5, #reduce_noise
|
456 |
+
5: 4000, #opening_cut
|
457 |
+
6: 2000, #closing_cut
|
458 |
+
7: 3000, #ease_in
|
459 |
+
8: 1000, #ease_out
|
460 |
+
9: 5000, #pad_before
|
461 |
+
10: 5000, #pad_after
|
462 |
+
},
|
463 |
+
|
464 |
+
# MaskGCT (by Amphion)
|
465 |
+
'amphion/maskgct': {
|
466 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
467 |
+
2: -1, #target_len
|
468 |
+
3: 25, #n_timesteps
|
469 |
+
},
|
470 |
+
'Svngoku/maskgct-audio-lab': {
|
471 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
472 |
+
2: -1, #target_len
|
473 |
+
3: 25, #n_timesteps
|
474 |
+
},
|
475 |
}
|
476 |
|
477 |
hf_clients: Tuple[Client] = {}
|