import gradio as gr
import os
import sys
import shutil
import uuid
import subprocess
from glob import glob
from huggingface_hub import snapshot_download

# Download model weights into ./checkpoints
os.makedirs("checkpoints", exist_ok=True)
snapshot_download(
    repo_id="chunyu-li/LatentSync",
    local_dir="./checkpoints"
)

import argparse
from omegaconf import OmegaConf
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from latentsync.models.unet import UNet3DConditionModel
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from diffusers.utils.import_utils import is_xformers_available
from accelerate.utils import set_seed
from latentsync.whisper.audio2feature import Audio2Feature


def main(video_path, audio_path, progress=gr.Progress(track_tqdm=True)):
    inference_ckpt_path = "checkpoints/latentsync_unet.pt"
    unet_config_path = "configs/unet/second_stage.yaml"
    config = OmegaConf.load(unet_config_path)

    print(f"Input video path: {video_path}")
    print(f"Input audio path: {audio_path}")
    print(f"Loaded checkpoint path: {inference_ckpt_path}")

    scheduler = DDIMScheduler.from_pretrained("configs")

    # Pick the Whisper audio encoder that matches the UNet's cross-attention width
    if config.model.cross_attention_dim == 768:
        whisper_model_path = "checkpoints/whisper/small.pt"
    elif config.model.cross_attention_dim == 384:
        whisper_model_path = "checkpoints/whisper/tiny.pt"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

    audio_encoder = Audio2Feature(model_path=whisper_model_path, device="cuda", num_frames=config.data.num_frames)

    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
    vae.config.scaling_factor = 0.18215
    vae.config.shift_factor = 0

    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        inference_ckpt_path,  # load checkpoint
        device="cpu",
    )

    unet = unet.to(dtype=torch.float16)

    # set xformers
    if is_xformers_available():
        unet.enable_xformers_memory_efficient_attention()

    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    # No fixed seed is passed in; -1 means draw a random torch seed for each run
    seed = -1
    if seed != -1:
        set_seed(seed)
    else:
        torch.seed()

    print(f"Initial seed: {torch.initial_seed()}")

    # Unique output name so concurrent requests don't overwrite each other
    unique_id = str(uuid.uuid4())
    video_out_path = f"video_out{unique_id}.mp4"

    pipeline(
        video_path=video_path,
        audio_path=audio_path,
        video_out_path=video_out_path,
        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
        num_frames=config.data.num_frames,
        num_inference_steps=config.run.inference_steps,
        guidance_scale=1.0,
        weight_dtype=torch.float16,
        width=config.data.resolution,
        height=config.data.resolution,
    )

    return video_out_path


css = """
div#col-container{
    margin: 0 auto;
    max-width: 982px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# LatentSync: Audio Conditioned Latent Diffusion Models for Lip Sync")
        gr.Markdown("LatentSync is an end-to-end lip sync framework based on audio-conditioned latent diffusion models without any intermediate motion representation, diverging from previous diffusion-based lip sync methods based on pixel-space diffusion or two-stage generation.")
        gr.HTML("""
        """)
        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Video Control", format="mp4")
                audio_input = gr.Audio(label="Audio Input", type="filepath")
                submit_btn = gr.Button("Submit")
            with gr.Column():
                video_result = gr.Video(label="Result")

                gr.Examples(
                    examples=[
                        ["assets/demo1_video.mp4", "assets/demo1_audio.wav"],
                        ["assets/demo2_video.mp4", "assets/demo2_audio.wav"],
                        ["assets/demo3_video.mp4", "assets/demo3_audio.wav"],
                    ],
                    inputs=[video_input, audio_input]
                )

        submit_btn.click(
            fn=main,
            inputs=[video_input, audio_input],
            outputs=[video_result]
        )

demo.queue().launch(show_api=False, show_error=True)