add maskGCT api option
app.py
@@ -190,6 +190,22 @@ def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
     print(result)
     return result, gr.update(value=result, visible=True)
 
+def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
+    try:
+        client = Client("amphion/maskgct")
+    except:
+        raise gr.Error(f"amphion/maskgct space's api might not be ready, please wait, or upload an audio instead.")
+
+    result = client.predict(
+        prompt_wav = handle_file(audio_to_clone),
+        target_text = prompt_audio_maskGCT,
+        target_len=-1,
+        n_timesteps=25,
+        api_name="/predict"
+    )
+    print(result)
+    return result, gr.update(value=result, visible=True)
+
 
 ########################
 # TALKING PORTRAIT GEN #
@@ -264,7 +280,7 @@ css = '''
 #video-block {
   flex: 9;
 }
-#audio-block, #audio-clone-elm {
+#audio-block, #audio-clone-elm, audio-clone-elm-maskGCT {
   flex: 1;
 }
 div#audio-clone-elm > .audio-container > button {
@@ -273,6 +289,12 @@ div#audio-clone-elm > .audio-container > button {
 div#audio-clone-elm > .audio-container > button > .wrap {
   font-size: 0.9em;
 }
+div#audio-clone-elm-maskGCT > .audio-container > button {
+  height: 180px!important;
+}
+div#audio-clone-elm-maskGCT > .audio-container > button > .wrap {
+  font-size: 0.9em;
+}
 #text-synth, #voice-desc{
   height: 130px;
 }
@@ -285,7 +307,7 @@ div#audio-clone-elm > .audio-container > button > .wrap {
 #gen-voice-btn {
   flex: 1;
 }
-#parler-tab, #whisperspeech-tab {
+#parler-tab, #whisperspeech-tab, maskGCT-tab {
   padding: 0;
 }
 #main-submit{
@@ -405,6 +427,20 @@ with gr.Blocks(css=css) as demo:
                     elem_id = "audio-clone-elm"
                 )
                 gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
+
+            with gr.Tab("MaskGCT TTS", elem_id="maskGCT-tab"):
+                prompt_audio_maskGCT = gr.Textbox(
+                    label = "Text to synthetize",
+                    lines = 2,
+                    max_lines = 2,
+                    elem_id = "text-synth-maskGCT"
+                )
+                audio_to_clone_maskGCT = gr.Audio(
+                    label = "Voice to clone",
+                    type = "filepath",
+                    elem_id = "audio-clone-elm-maskGCT"
+                )
+                gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
 
         with gr.Column(elem_id="result-column"):
 
@@ -501,6 +537,14 @@ with gr.Blocks(css=css) as demo:
         show_api = False
     )
 
+    gen_maskGCT_voice_btn.click(
+        fn = get_maskGCT_TTS,
+        inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
+        outputs = [voice, preprocess_audio_file],
+        queue = False,
+        show_api = False
+    )
+
     submit_btn.click(
        fn = generate_talking_portrait,
        inputs = [portrait, voice],
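
For reference, the new get_maskGCT_TTS handler boils down to one gradio_client call against the amphion/maskgct Space. The sketch below shows that call in isolation, using the same parameter names and values as the diff; the example text and the local reference-audio path are placeholders, and it assumes the Space is awake and its /predict endpoint is reachable.

    # Standalone sketch of the API call added in get_maskGCT_TTS (not part of app.py).
    # Parameter names and values mirror the diff; inputs below are placeholders.
    from gradio_client import Client, handle_file

    client = Client("amphion/maskgct")  # may fail while the Space is asleep or rebuilding
    result = client.predict(
        prompt_wav=handle_file("my_reference_voice.wav"),  # placeholder: voice sample to clone
        target_text="Hello there, this is a quick MaskGCT cloning test.",  # placeholder text
        target_len=-1,        # same value as in the diff
        n_timesteps=25,       # same value as in the diff
        api_name="/predict",
    )
    print(result)  # filepath of the generated audio returned by the Space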