Update app.py
app.py
CHANGED
@@ -1,5 +1,10 @@
 import os
 import shutil
 from huggingface_hub import snapshot_download
 import gradio as gr
 from gradio_client import Client, handle_file
@@ -7,87 +12,58 @@ from mutagen.mp3 import MP3
 from pydub import AudioSegment
 from PIL import Image
 import ffmpeg
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
-from scripts.inference import inference_process
-import argparse
-import uuid

-

 hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

-
-
-#############
-# UTILITIES #
-#############
-
 def is_mp3(file_path):
     try:
-
         return True
-    except Exception
         return False

 def convert_mp3_to_wav(mp3_file_path, wav_file_path):
-    # Load the MP3 file
     audio = AudioSegment.from_mp3(mp3_file_path)
-    # Export as WAV file
     audio.export(wav_file_path, format="wav")
     return wav_file_path

-
 def trim_audio(file_path, output_path, max_duration):
-    # Load the audio file
     audio = AudioSegment.from_wav(file_path)
-
-
-
-
-    # If the audio is longer than the maximum duration, trim it
-    if audio_length > max_duration:
-        trimmed_audio = audio[:max_duration]
-    else:
-        trimmed_audio = audio
-
-    # Export the trimmed audio to a new file
-    trimmed_audio.export(output_path, format="wav")
-
     return output_path

-
 def add_silence_to_wav(wav_file_path, duration_s=1):
-    # Load the WAV file
     audio = AudioSegment.from_wav(wav_file_path)
-
-
-    # Add silence to the end of the audio file
-    audio_with_silence = audio + silence
-    # Export the modified audio
-    audio_with_silence.export(wav_file_path, format="wav")
     return wav_file_path

 def check_mp3(file_path):
-
     if is_mp3(file_path):
         unique_id = uuid.uuid4()
         wav_file_path = f"{os.path.splitext(file_path)[0]}-{unique_id}.wav"
         converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
         print(f"File converted to {wav_file_path}")
-
         return converted_audio, gr.update(value=converted_audio, visible=True)
     else:
         print("The file is not an MP3 file.")
-
         return file_path, gr.update(value=file_path, visible=True)

 def check_and_convert_webp_to_png(input_path, output_path):
     try:
-        # Open the image file
         with Image.open(input_path) as img:
-            # Check if the image is in WebP format
             if img.format == 'WEBP':
-                # Convert and save as PNG
                 img.save(output_path, 'PNG')
                 print(f"Converted {input_path} to {output_path}")
                 return output_path
@@ -97,13 +73,10 @@ def check_and_convert_webp_to_png(input_path, output_path):
     except IOError:
         print(f"Cannot open {input_path}. The file might not exist or is not an image.")

-def
-
-    # convert to png if necessary
-    input_file = input_path
     unique_id = uuid.uuid4()
     output_file = f"converted_to_png_portrait-{unique_id}.png"
-    ready_png = check_and_convert_webp_to_png(
     print(f"PORTRAIT PNG FILE: {ready_png}")
     return ready_png

@@ -112,263 +85,102 @@ def clear_audio_elms():

 def change_video_codec(input_file, output_file, codec='libx264', audio_codec='aac'):
     try:
-        (
-            ffmpeg
-            .input(input_file)
-            .output(output_file, vcodec=codec, acodec=audio_codec)
-            .run(overwrite_output=True)
-        )
         print(f'Successfully changed codec of {input_file} and saved as {output_file}')
     except ffmpeg.Error as e:
         print(f'Error occurred: {e.stderr.decode()}')

-
-#######################################################
-# Gradio APIs for optional image and voice generation #
-#######################################################
-
 def generate_portrait(prompt_image):
-    if
-        raise gr.Error("Can't generate a portrait without a prompt

     try:
         client = Client("ByteDance/SDXL-Lightning")
-    except:
-        raise gr.Error(

-    result = client.predict(
-
-        ckpt = "4-Step",
-        api_name = "/generate_image"
-    )
-    print(result)
-
-    # convert to png if necessary
-    input_file = result
-    unique_id = uuid.uuid4()
-    output_file = f"converted_to_png_portrait-{unique_id}.png"
-    ready_png = check_and_convert_webp_to_png(input_file, output_file)
-    print(f"PORTRAIT PNG FILE: {ready_png}")
-
-    return ready_png

 def generate_voice_with_parler(prompt_audio, voice_description):
-    if
-        raise gr.Error(
-
-
-
-
-        visible = True
-    )
     try:
         client = Client("parler-tts/parler_tts_mini")
-    except:
-        raise gr.Error(

-    result = client.predict(
-        text = prompt_audio,
-        description = voice_description,
-        api_name = "/gen_tts"
-    )
-    print(result)
     return result, gr.update(value=result, visible=True)

 def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
     try:
         client = Client("collabora/WhisperSpeech")
-    except:
-        raise gr.Error(

-    result = client.predict(
-        multilingual_text = prompt_audio_whisperspeech,
-        speaker_audio = handle_file(audio_to_clone),
-        speaker_url = "",
-        cps = 14,
-        api_name = "/whisper_speech_demo"
-    )
-    print(result)
     return result, gr.update(value=result, visible=True)

 def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
     try:
         client = Client("amphion/maskgct")
-    except:
-        raise gr.Error(

-    result = client.predict(
-        prompt_wav = handle_file(audio_to_clone),
-        target_text = prompt_audio_maskGCT,
-        target_len=-1,
-        n_timesteps=25,
-        api_name="/predict"
-    )
-    print(result)
     return result, gr.update(value=result, visible=True)

-
-########################
-# TALKING PORTRAIT GEN #
-########################
-
 def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
-
     unique_id = uuid.uuid4()
-
     args = argparse.Namespace(
-        config
-        source_image
-        driving_audio
-        output
-        pose_weight
-        face_weight
-        lip_weight
-        face_expand_ratio
-        checkpoint
     )
-
     inference_process(args)
-    return f'output-{unique_id}.mp4'

 def generate_talking_portrait(portrait, voice, progress=gr.Progress(track_tqdm=True)):
-
-    if portrait is None:
         raise gr.Error("Please provide a portrait to animate.")
-
-    if voice is None:
         raise gr.Error("Please provide audio (4 seconds max).")

-    if is_shared_ui
-        # Trim audio to AUDIO_MAX_DURATION for better shared experience with community
-        input_file = voice
         unique_id = uuid.uuid4()
         trimmed_output_file = f"-{unique_id}.wav"
-
-
-
-        # Add 1 second of silence at the end to avoid last word being cut by hallo
     ready_audio = add_silence_to_wav(voice)
     print(f"1 second of silence added to {voice}")

-    # Call hallo
     talking_portrait_vid = run_hallo(portrait, ready_audio)
-
-    # Convert video to readable format
-
     final_output_file = f"converted_{talking_portrait_vid}"
     change_video_codec(talking_portrait_vid, final_output_file)

     return final_output_file

-
 css = '''
-
-    margin: 0 auto;
-}
-#column-names {
-    margin-top: 50px;
-}
-#main-group {
-    background-color: none;
-}
-.tabs {
-    background-color: unset;
-}
-#image-block {
-    flex: 1;
-}
-#video-block {
-    flex: 9;
-}
-#audio-block, #audio-clone-elm, audio-clone-elm-maskGCT {
-    flex: 1;
-}
-div#audio-clone-elm > .audio-container > button {
-    height: 180px!important;
-}
-div#audio-clone-elm > .audio-container > button > .wrap {
-    font-size: 0.9em;
-}
-div#audio-clone-elm-maskGCT > .audio-container > button {
-    height: 180px!important;
-}
-div#audio-clone-elm-maskGCT > .audio-container > button > .wrap {
-    font-size: 0.9em;
-}
-#text-synth, #voice-desc{
-    height: 130px;
-}
-#text-synth-wsp {
-    height: 120px;
-}
-#text-synth-maskGCT {
-    height: 120px;
-}
-#audio-column, #result-column {
-    display: flex;
-}
-#gen-voice-btn {
-    flex: 1;
-}
-#parler-tab, #whisperspeech-tab, #maskGCT-tab {
-    padding: 0;
-}
-#main-submit{
-    flex: 1;
-}
-#pro-tips {
-    margin-top: 50px;
-}
-div#warning-ready {
-    background-color: #ecfdf5;
-    padding: 0 16px 16px;
-    margin: 20px 0;
-    color: #030303!important;
-}
-div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
-    color: #057857!important;
-}
-div#warning-duplicate {
-    background-color: #ebf5ff;
-    padding: 0 16px 16px;
-    margin: 20px 0;
-    color: #030303!important;
-}
-div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
-    color: #0f4592!important;
-}
-div#warning-duplicate strong {
-    color: #0f4592;
-}
-p.actions {
-    display: flex;
-    align-items: center;
-    margin: 20px 0;
-}
-div#warning-duplicate .actions a {
-    display: inline-block;
-    margin-right: 10px;
-}
-.dark #warning-duplicate {
-    background-color: #0c0c0c !important;
-    border: 1px solid white !important;
-}
-div#component-8 {
-    align-items: stretch;
-}
 '''

 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
-        gr.Markdown(""
-        # TTS x Hallo Talking Portrait Generator
-
-        This demo allows you to generate a talking portrait with the help of several open-source projects: SDXL Lightning | Parler TTS | WhisperSpeech | Hallo
-
-        To let the community try and enjoy this demo, video length is limited to 4 seconds audio maximum.
-
-        Duplicate this space to skip the queue and get unlimited video duration. 4-5 seconds of audio will take ~5 minutes per inference, please be patient.
-        """)
         with gr.Row(elem_id="column-names"):
             gr.Markdown("## 1. Load Portrait")
             gr.Markdown("## 2. Load Voice")
@@ -376,187 +188,38 @@ with gr.Blocks(css=css) as demo:
         with gr.Group(elem_id="main-group"):
             with gr.Row():
                 with gr.Column():
-
-
-                        sources = ["upload"],
-                        type = "filepath",
-                        format = "png",
-                        elem_id = "image-block"
-                    )
-
-                    prompt_image = gr.Textbox(
-                        label = "Generate image",
-                        lines = 2,
-                        max_lines = 2
-                    )
-
                     gen_image_btn = gr.Button("Generate portrait (optional)")
-
                 with gr.Column(elem_id="audio-column"):
-
-                    voice = gr.Audio(
-                        type = "filepath",
-                        elem_id = "audio-block"
-                    )
-
                     preprocess_audio_file = gr.File(visible=False)
-
-
                     with gr.Tab("Parler TTS", elem_id="parler-tab"):
-
-
-                            label = "Text to synthetize",
-                            lines = 3,
-                            max_lines = 3,
-                            elem_id = "text-synth"
-                        )
-
-                        voice_description = gr.Textbox(
-                            label = "Voice description",
-                            lines = 3,
-                            max_lines = 3,
-                            elem_id = "voice-desc"
-                        )
-
                         gen_voice_btn = gr.Button("Generate voice (optional)")
-
                     with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
-                        prompt_audio_whisperspeech = gr.Textbox(
-
-                            lines = 2,
-                            max_lines = 2,
-                            elem_id = "text-synth-wsp"
-                        )
-                        audio_to_clone = gr.Audio(
-                            label = "Voice to clone",
-                            type = "filepath",
-                            elem_id = "audio-clone-elm"
-                        )
                         gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
-
                     with gr.Tab("MaskGCT TTS", elem_id="maskGCT-tab"):
-                        prompt_audio_maskGCT = gr.Textbox(
-
-                            lines = 2,
-                            max_lines = 2,
-                            elem_id = "text-synth-maskGCT"
-                        )
-                        audio_to_clone_maskGCT = gr.Audio(
-                            label = "Voice to clone",
-                            type = "filepath",
-                            elem_id = "audio-clone-elm-maskGCT"
-                        )
                         gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
-
-
-
-                    result = gr.Video(
-                        elem_id="video-block"
-                    )
-
                     submit_btn = gr.Button("Go talking Portrait !", elem_id="main-submit")
-
         with gr.Row(elem_id="pro-tips"):
-            gr.Markdown(""
-            #
-
-
-
-
-
-
-
-
-
-            For the driving audio:
-
-            1. It must be in WAV format.
-            2. It must be in English since our training datasets are only in this language.
-            3. Ensure the vocals are clear; background music is acceptable.
-
-
-            """)
-
-            gr.Markdown("""
-            # TTS Pro Tips:
-
-            For Parler TTS:
-
-            - Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise
-            - Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech
-            - The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt
-
-            For WhisperSpeech:
-
-            WhisperSpeech is able to quickly clone a voice from an audio sample.
-
-            - Upload a voice sample in the WhisperSpeech tab
-            - Add text to synthetize, hit Generate voice clone button
-
-            """)
-
-    portrait.upload(
-        fn = convert_user_uploded_webp,
-        inputs = [portrait],
-        outputs = [portrait],
-        queue = False,
-        show_api = False
-    )
-
-    voice.upload(
-        fn = check_mp3,
-        inputs = [voice],
-        outputs = [voice, preprocess_audio_file],
-        queue = False,
-        show_api = False
-    )
-
-    voice.clear(
-        fn = clear_audio_elms,
-        inputs = None,
-        outputs = [preprocess_audio_file],
-        queue = False,
-        show_api = False
-    )
-
-    gen_image_btn.click(
-        fn = generate_portrait,
-        inputs = [prompt_image],
-        outputs = [portrait],
-        queue = False,
-        show_api = False
-    )
-
-    gen_voice_btn.click(
-        fn = generate_voice_with_parler,
-        inputs = [prompt_audio, voice_description],
-        outputs = [voice, preprocess_audio_file],
-        queue = False,
-        show_api = False
-    )
-
-    gen_wsp_voice_btn.click(
-        fn = get_whisperspeech,
-        inputs = [prompt_audio_whisperspeech, audio_to_clone],
-        outputs = [voice, preprocess_audio_file],
-        queue = False,
-        show_api = False
-    )
-
-    gen_maskGCT_voice_btn.click(
-        fn = get_maskGCT_TTS,
-        inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT],
-        outputs = [voice, preprocess_audio_file],
-        queue = False,
-        show_api = False
-    )
-
-    submit_btn.click(
-        fn = generate_talking_portrait,
-        inputs = [portrait, voice],
-        outputs = [result],
-        show_api = False
-    )
-

 demo.queue(max_size=2).launch(show_error=True, show_api=False)

 import os
 import shutil
+import uuid
+import argparse
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+
 from huggingface_hub import snapshot_download
 import gradio as gr
 from gradio_client import Client, handle_file
 from mutagen.mp3 import MP3
 from pydub import AudioSegment
 from PIL import Image
 import ffmpeg
+
+# Set working directory
 os.chdir(os.path.dirname(os.path.abspath(__file__)))

+from scripts.inference import inference_process

+# Constants
+AUDIO_MAX_DURATION = 4000
+is_shared_ui = "fffiloni/tts-hallo-talking-portrait" in os.environ.get('SPACE_ID', '')
 hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

+# Utility Functions
 def is_mp3(file_path):
     try:
+        MP3(file_path)
         return True
+    except Exception:
         return False

 def convert_mp3_to_wav(mp3_file_path, wav_file_path):
     audio = AudioSegment.from_mp3(mp3_file_path)
     audio.export(wav_file_path, format="wav")
     return wav_file_path

 def trim_audio(file_path, output_path, max_duration):
     audio = AudioSegment.from_wav(file_path)
+    if len(audio) > max_duration:
+        audio = audio[:max_duration]
+    audio.export(output_path, format="wav")
     return output_path

 def add_silence_to_wav(wav_file_path, duration_s=1):
     audio = AudioSegment.from_wav(wav_file_path)
+    silence = AudioSegment.silent(duration=duration_s * 1000)
+    (audio + silence).export(wav_file_path, format="wav")
     return wav_file_path

 def check_mp3(file_path):
     if is_mp3(file_path):
         unique_id = uuid.uuid4()
         wav_file_path = f"{os.path.splitext(file_path)[0]}-{unique_id}.wav"
         converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
         print(f"File converted to {wav_file_path}")
         return converted_audio, gr.update(value=converted_audio, visible=True)
     else:
         print("The file is not an MP3 file.")
         return file_path, gr.update(value=file_path, visible=True)

 def check_and_convert_webp_to_png(input_path, output_path):
     try:
         with Image.open(input_path) as img:
             if img.format == 'WEBP':
                 img.save(output_path, 'PNG')
                 print(f"Converted {input_path} to {output_path}")
                 return output_path
     except IOError:
         print(f"Cannot open {input_path}. The file might not exist or is not an image.")

+def convert_user_uploaded_webp(input_path):
     unique_id = uuid.uuid4()
     output_file = f"converted_to_png_portrait-{unique_id}.png"
+    ready_png = check_and_convert_webp_to_png(input_path, output_file)
     print(f"PORTRAIT PNG FILE: {ready_png}")
     return ready_png

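Worth noting when reading the trimming logic above: pydub's AudioSegment measures length and slices in milliseconds, so AUDIO_MAX_DURATION = 4000 corresponds to 4 seconds of audio. A minimal sketch of the same trim-then-pad flow outside the app, with hypothetical file names:

from pydub import AudioSegment

MAX_MS = 4000  # pydub works in milliseconds, so 4000 == 4 seconds

audio = AudioSegment.from_wav("input.wav")       # hypothetical input file
if len(audio) > MAX_MS:                          # len() is the duration in ms
    audio = audio[:MAX_MS]                       # keep only the first 4 seconds
audio += AudioSegment.silent(duration=1000)      # pad 1 second of trailing silence
audio.export("ready.wav", format="wav")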

 def change_video_codec(input_file, output_file, codec='libx264', audio_codec='aac'):
     try:
+        ffmpeg.input(input_file).output(output_file, vcodec=codec, acodec=audio_codec).run(overwrite_output=True)
         print(f'Successfully changed codec of {input_file} and saved as {output_file}')
     except ffmpeg.Error as e:
         print(f'Error occurred: {e.stderr.decode()}')

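The one-line ffmpeg-python chain above re-encodes the video to H.264 with AAC audio so it plays in the browser. If you want to confirm the re-encode worked, ffmpeg.probe from the same package returns the container metadata; a small sketch with a hypothetical file name:

import ffmpeg

info = ffmpeg.probe("converted_output.mp4")  # hypothetical file produced by change_video_codec
codecs = {s["codec_type"]: s["codec_name"] for s in info["streams"]}
print(codecs)  # expected: {'video': 'h264', 'audio': 'aac'}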
+# Gradio APIs
 def generate_portrait(prompt_image):
+    if not prompt_image:
+        raise gr.Error("Can't generate a portrait without a prompt!")

     try:
         client = Client("ByteDance/SDXL-Lightning")
+    except Exception:
+        raise gr.Error("ByteDance/SDXL-Lightning space's API might not be ready, please wait, or upload an image instead.")

+    result = client.predict(prompt=prompt_image, ckpt="4-Step", api_name="/generate_image")
+    return convert_user_uploaded_webp(result)

 def generate_voice_with_parler(prompt_audio, voice_description):
+    if not prompt_audio:
+        raise gr.Error("Can't generate a voice without text to synthesize!")
+
+    if not voice_description:
+        gr.Info("For better control, you may want to provide a voice character description next time.", duration=10, visible=True)
+
     try:
         client = Client("parler-tts/parler_tts_mini")
+    except Exception:
+        raise gr.Error("parler-tts/parler_tts_mini space's API might not be ready, please wait, or upload an audio instead.")

+    result = client.predict(text=prompt_audio, description=voice_description, api_name="/gen_tts")
     return result, gr.update(value=result, visible=True)

 def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
     try:
         client = Client("collabora/WhisperSpeech")
+    except Exception:
+        raise gr.Error("collabora/WhisperSpeech space's API might not be ready, please wait, or upload an audio instead.")

+    result = client.predict(multilingual_text=prompt_audio_whisperspeech, speaker_audio=handle_file(audio_to_clone), speaker_url="", cps=14, api_name="/whisper_speech_demo")
     return result, gr.update(value=result, visible=True)

 def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone):
     try:
         client = Client("amphion/maskgct")
+    except Exception:
+        raise gr.Error("amphion/maskgct space's API might not be ready, please wait, or upload an audio instead.")

+    result = client.predict(prompt_wav=handle_file(audio_to_clone), target_text=prompt_audio_maskGCT, target_len=-1, n_timesteps=25, api_name="/predict")
     return result, gr.update(value=result, visible=True)

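All four generators above share the same gradio_client pattern: open a Client on a public Space, call predict with keyword arguments matching the endpoint named by api_name, and get back a local file path; handle_file wraps a local file for upload. A standalone sketch against the Parler TTS Space used above (the prompt text is made up, and the Space must be reachable for this to run):

from gradio_client import Client

client = Client("parler-tts/parler_tts_mini")
wav_path = client.predict(
    text="Hello there, this is a test sentence.",
    description="A calm female voice, very clear audio.",
    api_name="/gen_tts",
)
print(wav_path)  # local path to the WAV file returned by the Space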
+# Talking Portrait Generation
 def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     unique_id = uuid.uuid4()
     args = argparse.Namespace(
+        config='configs/inference/default.yaml',
+        source_image=source_image,
+        driving_audio=driving_audio,
+        output=f'output-{unique_id}.mp4',
+        pose_weight=1.0,
+        face_weight=1.0,
+        lip_weight=1.0,
+        face_expand_ratio=1.2,
+        checkpoint=None
     )
     inference_process(args)
+    return f'output-{unique_id}.mp4'

 def generate_talking_portrait(portrait, voice, progress=gr.Progress(track_tqdm=True)):
+    if not portrait:
         raise gr.Error("Please provide a portrait to animate.")
+    if not voice:
         raise gr.Error("Please provide audio (4 seconds max).")

+    if is_shared_ui:
         unique_id = uuid.uuid4()
         trimmed_output_file = f"-{unique_id}.wav"
+        voice = trim_audio(voice, trimmed_output_file, AUDIO_MAX_DURATION)
+
     ready_audio = add_silence_to_wav(voice)
     print(f"1 second of silence added to {voice}")

     talking_portrait_vid = run_hallo(portrait, ready_audio)
     final_output_file = f"converted_{talking_portrait_vid}"
     change_video_codec(talking_portrait_vid, final_output_file)

     return final_output_file

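run_hallo drives Hallo's CLI-oriented inference_process programmatically: argparse.Namespace is just an attribute container, so building one by hand is equivalent to what a parser would return from command-line flags. A short sketch of that equivalence (the parser and its flags here are hypothetical, for illustration only):

import argparse

# Hand-built namespace, as run_hallo does above.
manual = argparse.Namespace(source_image="face.png", lip_weight=1.0)

# The same object produced by a hypothetical CLI parser.
parser = argparse.ArgumentParser()
parser.add_argument("--source_image")
parser.add_argument("--lip_weight", type=float)
parsed = parser.parse_args(["--source_image", "face.png", "--lip_weight", "1.0"])

assert manual == parsed  # Namespace compares by attribute values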
+# Gradio Interface
 css = '''
+/* Your CSS here */
 '''

 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
+        gr.Markdown("# TTS x Hallo Talking Portrait Generator")
         with gr.Row(elem_id="column-names"):
             gr.Markdown("## 1. Load Portrait")
             gr.Markdown("## 2. Load Voice")
         with gr.Group(elem_id="main-group"):
             with gr.Row():
                 with gr.Column():
+                    portrait = gr.Image(sources=["upload"], type="filepath", format="png", elem_id="image-block")
+                    prompt_image = gr.Textbox(label="Generate image", lines=2, max_lines=2)
                     gen_image_btn = gr.Button("Generate portrait (optional)")
                 with gr.Column(elem_id="audio-column"):
+                    voice = gr.Audio(type="filepath", elem_id="audio-block")
                     preprocess_audio_file = gr.File(visible=False)
                     with gr.Tab("Parler TTS", elem_id="parler-tab"):
+                        prompt_audio = gr.Textbox(label="Text to synthesize", lines=3, max_lines=3, elem_id="text-synth")
+                        voice_description = gr.Textbox(label="Voice description", lines=3, max_lines=3, elem_id="voice-desc")
                         gen_voice_btn = gr.Button("Generate voice (optional)")
                     with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
+                        prompt_audio_whisperspeech = gr.Textbox(label="Text to synthesize", lines=2, max_lines=2, elem_id="text-synth-wsp")
+                        audio_to_clone = gr.Audio(label="Voice to clone", type="filepath", elem_id="audio-clone-elm")
                         gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
                     with gr.Tab("MaskGCT TTS", elem_id="maskGCT-tab"):
+                        prompt_audio_maskGCT = gr.Textbox(label="Text to synthesize", lines=2, max_lines=2, elem_id="text-synth-maskGCT")
+                        audio_to_clone_maskGCT = gr.Audio(label="Voice to clone", type="filepath", elem_id="audio-clone-elm-maskGCT")
                         gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)")
+                with gr.Column(elem_id="result-column"):
+                    result = gr.Video(elem_id="video-block")
                     submit_btn = gr.Button("Go talking Portrait !", elem_id="main-submit")
         with gr.Row(elem_id="pro-tips"):
+            gr.Markdown("# Hallo Pro Tips:")
+            gr.Markdown("# TTS Pro Tips:")
+
+    portrait.upload(convert_user_uploaded_webp, inputs=[portrait], outputs=[portrait], queue=False, show_api=False)
+    voice.upload(check_mp3, inputs=[voice], outputs=[voice, preprocess_audio_file], queue=False, show_api=False)
+    voice.clear(clear_audio_elms, inputs=None, outputs=[preprocess_audio_file], queue=False, show_api=False)
+    gen_image_btn.click(generate_portrait, inputs=[prompt_image], outputs=[portrait], queue=False, show_api=False)
+    gen_voice_btn.click(generate_voice_with_parler, inputs=[prompt_audio, voice_description], outputs=[voice, preprocess_audio_file], queue=False, show_api=False)
+    gen_wsp_voice_btn.click(get_whisperspeech, inputs=[prompt_audio_whisperspeech, audio_to_clone], outputs=[voice, preprocess_audio_file], queue=False, show_api=False)
+    gen_maskGCT_voice_btn.click(get_maskGCT_TTS, inputs=[prompt_audio_maskGCT, audio_to_clone_maskGCT], outputs=[voice, preprocess_audio_file], queue=False, show_api=False)
+    submit_btn.click(generate_talking_portrait, inputs=[portrait, voice], outputs=[result], show_api=False)

 demo.queue(max_size=2).launch(show_error=True, show_api=False)
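The handlers wired above return a second value of gr.update(value=..., visible=True) so that the hidden preprocess_audio_file component is revealed once audio is ready. A minimal, self-contained sketch of that pattern, assuming a Gradio 4.x API and a hypothetical example.wav on disk:

import gradio as gr

def provide_audio():
    path = "example.wav"  # hypothetical file a real handler would generate
    return path, gr.update(value=path, visible=True)

with gr.Blocks() as mini:
    audio = gr.Audio(type="filepath")
    hidden_file = gr.File(visible=False)  # starts hidden, like preprocess_audio_file
    gr.Button("Generate").click(provide_audio, inputs=None, outputs=[audio, hidden_file])

# mini.launch()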