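"""ComfyUI wrapper nodes for MimicMotion: (down)loading the model, extracting DWPose
pose images, sampling pose-guided video latents, and decoding them to frames."""
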
import os
import gc

import torch
import numpy as np

import folder_paths
import comfy.model_management as mm
import comfy.utils

from diffusers.models import AutoencoderKLTemporalDecoder
from diffusers.schedulers import EulerDiscreteScheduler
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

from .mimicmotion.pipelines.pipeline_mimicmotion import MimicMotionPipeline, tensor2vid
from .mimicmotion.modules.unet import UNetSpatioTemporalConditionModel
from .mimicmotion.modules.pose_net import PoseNet
from .lcm_scheduler import AnimateLCMSVDStochasticIterativeScheduler

script_directory = os.path.dirname(os.path.abspath(__file__))


def loglinear_interp(t_steps, num_steps):
    """
    Performs log-linear interpolation of a given array of decreasing numbers.
    """
    # Interpolate in log space over the reversed (ascending) schedule...
    xs = np.linspace(0, 1, len(t_steps))
    ys = np.log(t_steps[::-1])

    new_xs = np.linspace(0, 1, num_steps)
    new_ys = np.interp(new_xs, xs, ys)

    # ...then map back to linear space and restore descending order.
    interped_ys = np.exp(new_ys)[::-1].copy()
    return interped_ys
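
# Illustrative usage (values are the AlignYourSteps SVD sigmas used in
# DiffusersScheduler below): resample the 11-entry schedule to 21 entries
# for a 20-step sampling run:
#   ays_sigmas = [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]
#   sigmas = loglinear_interp(ays_sigmas, 21)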


class MimicMotionModel(torch.nn.Module):
    def __init__(self, base_model_path, lcm=False):
        """Construct the base model components and load the pretrained SVD weights, except for the pose-net.

        Args:
            base_model_path (str): path to the pretrained SVD model
            lcm (bool): load the AnimateLCM UNet config instead of the standard UNet config
        """
        super().__init__()
        unet_subfolder = "unet_lcm" if lcm else "unet"
        # Only the config is loaded for the UNet; its weights come from the MimicMotion
        # checkpoint applied later via load_state_dict.
        self.unet = UNetSpatioTemporalConditionModel.from_config(
            UNetSpatioTemporalConditionModel.load_config(base_model_path, subfolder=unet_subfolder, variant="fp16"))
        self.vae = AutoencoderKLTemporalDecoder.from_pretrained(
            base_model_path, subfolder="vae", variant="fp16")
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(
            base_model_path, subfolder="image_encoder", variant="fp16")
        self.noise_scheduler = EulerDiscreteScheduler.from_pretrained(
            base_model_path, subfolder="scheduler")
        self.feature_extractor = CLIPImageProcessor.from_pretrained(
            base_model_path, subfolder="feature_extractor")

        self.pose_net = PoseNet(noise_latent_channels=self.unet.config.block_out_channels[0])


class DownloadAndLoadMimicMotionModel:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "model": (
                [
                    'MimicMotion-fp16.safetensors',
                ],
            ),
            "precision": (
                [
                    'fp32',
                    'fp16',
                    'bf16',
                ], {
                    "default": 'fp16'
                }),
            "lcm": ("BOOLEAN", {"default": False}),
            },
        }

    RETURN_TYPES = ("MIMICPIPE",)
    RETURN_NAMES = ("mimic_pipeline",)
    FUNCTION = "loadmodel"
    CATEGORY = "MimicMotionWrapper"

    def loadmodel(self, precision, model, lcm):
        device = mm.get_torch_device()
        mm.soft_empty_cache()
        dtype = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}[precision]

        pbar = comfy.utils.ProgressBar(3)

        download_path = os.path.join(folder_paths.models_dir, "mimicmotion")
        model_path = os.path.join(download_path, model)

        if not os.path.exists(model_path):
            print(f"Downloading model to: {model_path}")
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="Kijai/MimicMotion_pruned",
                              allow_patterns=[f"*{model}*"],
                              local_dir=download_path,
                              local_dir_use_symlinks=False)

        print(f"Loading model from: {model_path}")
        pbar.update(1)

        svd_path = os.path.join(folder_paths.models_dir, "diffusers", "stable-video-diffusion-img2vid-xt-1-1")
        # The LCM UNet lives in the "unet_lcm" subfolder of the SVD directory, which is
        # where MimicMotionModel looks for it when lcm=True.
        svd_lcm_path = os.path.join(svd_path, "unet_lcm")

        if lcm and not os.path.exists(svd_lcm_path):
            print(f"Downloading AnimateLCM SVD model to: {svd_path}")
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="Kijai/AnimateLCM-SVD-Comfy",
                              allow_patterns=["*.json", "*diffusion_pytorch_model.fp16.safetensors*"],
                              local_dir=svd_path,
                              local_dir_use_symlinks=False)
        elif not os.path.exists(svd_path):
            print(f"Downloading SVD model to: {svd_path}")
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="vdo/stable-video-diffusion-img2vid-xt-1-1",
                              allow_patterns=["*.json", "*fp16*"],
                              local_dir=svd_path,
                              local_dir_use_symlinks=False)
        pbar.update(1)

        mimicmotion_models = MimicMotionModel(svd_path, lcm=lcm).to(device=device).eval()
        mimic_motion_sd = comfy.utils.load_torch_file(model_path)
        mimicmotion_models.load_state_dict(mimic_motion_sd, strict=False)

        if lcm:
            scheduler = AnimateLCMSVDStochasticIterativeScheduler(
                num_train_timesteps=40,
                sigma_min=0.002,
                sigma_max=700.0,
                sigma_data=1.0,
                s_noise=1.0,
                rho=7,
                clip_denoised=False,
            )
        else:
            scheduler = mimicmotion_models.noise_scheduler

        pipeline = MimicMotionPipeline(
            vae=mimicmotion_models.vae,
            image_encoder=mimicmotion_models.image_encoder,
            unet=mimicmotion_models.unet,
            scheduler=scheduler,
            feature_extractor=mimicmotion_models.feature_extractor,
            pose_net=mimicmotion_models.pose_net,
        )

        pipeline.unet.to(dtype)
        pipeline.pose_net.to(dtype)
        pipeline.vae.to(dtype)
        pipeline.image_encoder.to(dtype)

        mimic_model = {
            'pipeline': pipeline,
            'dtype': dtype
        }
        pbar.update(1)
        return (mimic_model,)


class DiffusersScheduler:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "scheduler": (
                [
                    'EulerDiscreteScheduler',
                    'AnimateLCM_SVD'
                ],
            ),
            "sigma_min": ("FLOAT", {"default": 0.002, "min": 0.0, "max": 700.0, "step": 0.001}),
            "sigma_max": ("FLOAT", {"default": 700.0, "min": 0.0, "max": 700.0, "step": 0.001}),
            "align_your_steps": ("BOOLEAN", {"default": False}),
            },
        }

    RETURN_TYPES = ("DIFFUSERS_SCHEDULER",)
    RETURN_NAMES = ("scheduler",)
    FUNCTION = "loadmodel"
    CATEGORY = "MimicMotionWrapper"

    def loadmodel(self, scheduler, sigma_min, sigma_max, align_your_steps):
        scheduler_config = {
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "beta_start": 0.00085,
            "clip_sample": False,
            "interpolation_type": "linear",
            "num_train_timesteps": 1000,
            "prediction_type": "v_prediction",
            "set_alpha_to_one": False,
            "sigma_max": sigma_max,
            "sigma_min": sigma_min,
            "skip_prk_steps": True,
            "steps_offset": 1,
            "timestep_spacing": "leading",
            "timestep_type": "continuous",
            "trained_betas": None,
            "use_karras_sigmas": True
        }
        if scheduler == 'EulerDiscreteScheduler':
            noise_scheduler = EulerDiscreteScheduler.from_config(scheduler_config)
        elif scheduler == 'AnimateLCM_SVD':
            noise_scheduler = AnimateLCMSVDStochasticIterativeScheduler(
                num_train_timesteps=40,
                sigma_min=sigma_min,
                sigma_max=sigma_max,
                sigma_data=1.0,
                s_noise=1.0,
                rho=7,
                clip_denoised=False,
            )

        sigmas = None
        if align_your_steps:
            # AlignYourSteps sigma schedule for SVD (10 steps plus the terminal sigma)
            sigmas = [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.002]

        scheduler_options = {
            "noise_scheduler": noise_scheduler,
            "sigmas": sigmas,
        }

        return (scheduler_options,)
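
# The scheduler_options dict returned above plugs into MimicMotionSampler's
# optional_scheduler input; when "sigmas" is set, the sampler resamples the schedule
# to the requested step count with loglinear_interp before handing it to the pipeline.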


class MimicMotionSampler:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "mimic_pipeline": ("MIMICPIPE",),
            "ref_image": ("IMAGE",),
            "pose_images": ("IMAGE",),
            "steps": ("INT", {"default": 25, "min": 1, "max": 200, "step": 1}),
            "cfg_min": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 20.0, "step": 0.01}),
            "cfg_max": ("FLOAT", {"default": 2.0, "min": 0.0, "max": 20.0, "step": 0.01}),
            "seed": ("INT", {"default": 0, "min": 0, "max": 0xffffffffffffffff}),
            "fps": ("INT", {"default": 15, "min": 2, "max": 100, "step": 1}),
            "noise_aug_strength": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            "context_size": ("INT", {"default": 16, "min": 1, "max": 128, "step": 1}),
            "context_overlap": ("INT", {"default": 6, "min": 1, "max": 128, "step": 1}),
            "keep_model_loaded": ("BOOLEAN", {"default": True}),
            },
            "optional": {
                "optional_scheduler": ("DIFFUSERS_SCHEDULER",),
                "pose_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
                "pose_start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.01}),
                "pose_end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
                "image_embed_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
            }
        }

    RETURN_TYPES = ("LATENT",)
    RETURN_NAMES = ("samples",)
    FUNCTION = "process"
    CATEGORY = "MimicMotionWrapper"

    def process(self, mimic_pipeline, ref_image, pose_images, cfg_min, cfg_max, steps, seed, noise_aug_strength, fps, keep_model_loaded,
                context_size, context_overlap, optional_scheduler=None, pose_strength=1.0, image_embed_strength=1.0, pose_start_percent=0.0, pose_end_percent=1.0):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        mm.unload_all_models()
        mm.soft_empty_cache()
        dtype = mimic_pipeline['dtype']
        pipeline = mimic_pipeline['pipeline']

        original_scheduler = pipeline.scheduler

        if optional_scheduler is not None:
            print("Using optional scheduler: ", optional_scheduler)
            pipeline.scheduler = optional_scheduler['noise_scheduler']
            sigmas = optional_scheduler['sigmas']

            # If a fixed sigma schedule doesn't match the requested step count,
            # resample it log-linearly to steps + 1 entries (the final entry is sigma = 0).
            if sigmas is not None and (steps + 1) != len(sigmas):
                sigmas = loglinear_interp(sigmas, steps + 1)
                sigmas = sigmas[-(steps + 1):]
                sigmas[-1] = 0
                print("Using sigmas: ", sigmas)
        else:
            pipeline.scheduler = original_scheduler
            sigmas = None

        B, H, W, C = pose_images.shape

        assert B >= context_size, "The number of pose frames must be at least the context size"

        # B, H, W, C -> B, C, H, W
        ref_image = ref_image.permute(0, 3, 1, 2)
        pose_images = pose_images.permute(0, 3, 1, 2)

        # Normalize pose images from [0, 1] to [-1, 1]
        pose_images = pose_images * 2 - 1

        ref_image = ref_image.to(device).to(dtype)
        pose_images = pose_images.to(device).to(dtype)

        generator = torch.Generator(device=device)
        generator.manual_seed(seed)

        frames = pipeline(
            ref_image,
            image_pose=pose_images,
            num_frames=B,
            tile_size=context_size,
            tile_overlap=context_overlap,
            height=H,
            width=W,
            fps=fps,
            noise_aug_strength=noise_aug_strength,
            num_inference_steps=steps,
            generator=generator,
            min_guidance_scale=cfg_min,
            max_guidance_scale=cfg_max,
            decode_chunk_size=4,
            output_type="latent",
            device=device,
            sigmas=sigmas,
            pose_strength=pose_strength,
            pose_start_percent=pose_start_percent,
            pose_end_percent=pose_end_percent,
            image_embed_strength=image_embed_strength
        ).frames

        if not keep_model_loaded:
            pipeline.unet.to(offload_device)
            pipeline.vae.to(offload_device)
            mm.soft_empty_cache()
            gc.collect()

        return ({"samples": frames},)


class MimicMotionDecode:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "mimic_pipeline": ("MIMICPIPE",),
            "samples": ("LATENT",),
            "decode_chunk_size": ("INT", {"default": 4, "min": 1, "max": 200, "step": 1})
            },
        }

    RETURN_TYPES = ("IMAGE",)
    RETURN_NAMES = ("images",)
    FUNCTION = "process"
    CATEGORY = "MimicMotionWrapper"

    def process(self, mimic_pipeline, samples, decode_chunk_size):
        mm.soft_empty_cache()

        pipeline = mimic_pipeline['pipeline']
        num_frames = samples['samples'].shape[0]
        try:
            frames = pipeline.decode_latents(samples['samples'], num_frames, decode_chunk_size)
        except Exception:
            # Decoding can run out of memory at larger chunk sizes; retry one frame at a time.
            frames = pipeline.decode_latents(samples['samples'], num_frames, 1)
        frames = tensor2vid(frames, pipeline.image_processor, output_type="pt")

        # Drop the first frame (it corresponds to the reference pose) and convert to B, H, W, C.
        frames = frames.squeeze(1)[1:].permute(0, 2, 3, 1).cpu().float()

        return (frames,)


class MimicMotionGetPoses:
    @classmethod
    def INPUT_TYPES(s):
        return {"required": {
            "ref_image": ("IMAGE",),
            "pose_images": ("IMAGE",),
            "include_body": ("BOOLEAN", {"default": True}),
            "include_hand": ("BOOLEAN", {"default": True}),
            "include_face": ("BOOLEAN", {"default": True}),
            },
        }

    RETURN_TYPES = ("IMAGE", "IMAGE",)
    RETURN_NAMES = ("poses_with_ref", "pose_images")
    FUNCTION = "process"
    CATEGORY = "MimicMotionWrapper"

    def process(self, ref_image, pose_images, include_body, include_hand, include_face):
        device = mm.get_torch_device()
        from .mimicmotion.dwpose.util import draw_pose
        from .mimicmotion.dwpose.dwpose_detector import DWposeDetector

        assert ref_image.shape[1:3] == pose_images.shape[1:3], "ref_image and pose_images must have the same resolution"

        yolo_model = "yolox_l.onnx"
        dw_pose_model = "dw-ll_ucoco_384.onnx"
        model_base_path = os.path.join(script_directory, "models", "DWPose")

        model_det = os.path.join(model_base_path, yolo_model)
        model_pose = os.path.join(model_base_path, dw_pose_model)

        if not os.path.exists(model_det):
            print(f"Downloading yolo model to: {model_base_path}")
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="yzd-v/DWPose",
                              allow_patterns=[f"*{yolo_model}*"],
                              local_dir=model_base_path,
                              local_dir_use_symlinks=False)

        if not os.path.exists(model_pose):
            print(f"Downloading dwpose model to: {model_base_path}")
            from huggingface_hub import snapshot_download
            snapshot_download(repo_id="yzd-v/DWPose",
                              allow_patterns=[f"*{dw_pose_model}*"],
                              local_dir=model_base_path,
                              local_dir_use_symlinks=False)

        dwprocessor = DWposeDetector(
            model_det=model_det,
            model_pose=model_pose,
            device=device)

        ref_image = ref_image.squeeze(0).cpu().numpy() * 255

        ref_pose = dwprocessor(ref_image)
        ref_keypoint_id = [0, 1, 2, 5, 8, 11, 14, 15, 16, 17]
        # Keep only the reference keypoints detected with sufficient confidence
        ref_keypoint_id = [i for i in ref_keypoint_id
                           if ref_pose['bodies']['score'].shape[0] > 0 and ref_pose['bodies']['score'][0][i] > 0.3]
        ref_body = ref_pose['bodies']['candidate'][ref_keypoint_id]

        height, width, _ = ref_image.shape
        pose_images_np = pose_images.cpu().numpy() * 255

        pbar = comfy.utils.ProgressBar(len(pose_images_np))
        detected_poses_np_list = []
        for img_np in pose_images_np:
            detected_poses_np_list.append(dwprocessor(img_np))
            pbar.update(1)

        # Stack the bodies from frames where all 18 keypoints were detected,
        # restricted to the reference keypoint set
        detected_bodies = np.stack(
            [p['bodies']['candidate'] for p in detected_poses_np_list if p['bodies']['candidate'].shape[0] == 18])[:,
            ref_keypoint_id]

        # Least-squares fit of a linear map from the detected y-coordinates to the
        # reference body's y-coordinates; the x-scale is derived from the y-scale,
        # corrected for the aspect ratios of the pose frames and the reference image.
        ay, by = np.polyfit(detected_bodies[:, :, 1].flatten(), np.tile(ref_body[:, 1], len(detected_bodies)), 1)
        fh, fw, _ = pose_images_np[0].shape
        ax = ay / (fh / fw / height * width)
        bx = np.mean(np.tile(ref_body[:, 0], len(detected_bodies)) - detected_bodies[:, :, 0].flatten() * ax)
        a = np.array([ax, ay])
        b = np.array([bx, by])
        output_pose = []

        # Rescale every detected pose into the reference image's coordinate frame.
        # Note: hands are rescaled under include_hand and faces under include_face
        # (the original code had the two keys swapped).
        for detected_pose in detected_poses_np_list:
            if include_body:
                detected_pose['bodies']['candidate'] = detected_pose['bodies']['candidate'] * a + b
            if include_hand:
                detected_pose['hands'] = detected_pose['hands'] * a + b
            if include_face:
                detected_pose['faces'] = detected_pose['faces'] * a + b
            im = draw_pose(detected_pose, height, width, include_body=include_body, include_hand=include_hand, include_face=include_face)
            output_pose.append(np.array(im))

        output_pose_tensors = [torch.from_numpy(im) for im in output_pose]
        output_tensor = torch.stack(output_pose_tensors) / 255

        # Prepend the drawn reference pose as frame 0, then convert to B, H, W, C
        ref_pose_img = draw_pose(ref_pose, height, width, include_body=include_body, include_hand=include_hand, include_face=include_face)
        ref_pose_tensor = torch.tensor(np.array(ref_pose_img)) / 255
        output_tensor = torch.cat((ref_pose_tensor.unsqueeze(0), output_tensor))
        output_tensor = output_tensor.permute(0, 2, 3, 1).cpu().float()

        return (output_tensor, output_tensor[1:])


NODE_CLASS_MAPPINGS = {
    "DownloadAndLoadMimicMotionModel": DownloadAndLoadMimicMotionModel,
    "MimicMotionSampler": MimicMotionSampler,
    "MimicMotionGetPoses": MimicMotionGetPoses,
    "MimicMotionDecode": MimicMotionDecode,
    "DiffusersScheduler": DiffusersScheduler,
}
NODE_DISPLAY_NAME_MAPPINGS = {
    "DownloadAndLoadMimicMotionModel": "(Down)Load MimicMotionModel",
    "MimicMotionSampler": "MimicMotion Sampler",
    "MimicMotionGetPoses": "MimicMotion GetPoses",
    "MimicMotionDecode": "MimicMotion Decode",
    "DiffusersScheduler": "Diffusers Scheduler",
}