from typing import Optional, Tuple, Union
import torch
from diffusers import DDIMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput
from diffusers.utils.torch_utils import randn_tensor
class CustomDDIMScheduler(DDIMScheduler):
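    """A `DDIMScheduler` whose `step` can steer the predicted noise with an
    external `score` (e.g. a gradient-based guidance signal), applied to the
    batch entries selected by `indices`.
    """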
@torch.no_grad()
def step(
self,
model_output: torch.FloatTensor,
timestep: int,
sample: torch.FloatTensor,
eta: float = 0.0,
use_clipped_model_output: bool = False,
generator=None,
variance_noise: Optional[torch.FloatTensor] = None,
return_dict: bool = True,
# Guidance parameters
score=None,
guidance_scale=0.0,
indices=None, # [0]
) -> Union[DDIMSchedulerOutput, Tuple]:
"""
Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
process from the learned model outputs (most often the predicted noise).
Args:
model_output (`torch.FloatTensor`): direct output from learned diffusion model.
timestep (`int`): current discrete timestep in the diffusion chain.
sample (`torch.FloatTensor`):
current instance of sample being created by diffusion process.
eta (`float`): weight of noise for added noise in diffusion step.
use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
`self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
                coincide with the one provided as input and `use_clipped_model_output` will have no effect.
generator: random number generator.
variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
can directly provide the noise for the variance itself. This is useful for methods such as
CycleDiffusion. (https://arxiv.org/abs/2210.05559)
            return_dict (`bool`): if `True`, return a `DDIMSchedulerOutput`; otherwise return a plain `tuple`.
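            score (`torch.FloatTensor`, *optional*): external guidance signal (e.g. a score/gradient w.r.t. the
                sample) subtracted from the predicted noise.
            guidance_scale (`float`): strength of the guidance; `0.0` disables it.
            indices (`list`, *optional*): batch indices to which guidance is applied (e.g. `[0]`); if `None`,
                guidance is applied to the whole batch.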
Returns:
[`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`:
[`~schedulers.scheduling_utils.DDIMSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
returning a tuple, the first element is the sample tensor.
"""
if self.num_inference_steps is None:
raise ValueError(
"Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
)
# See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
        # Ideally, read the DDIM paper for an in-depth understanding.
        # Notation (<variable name> -> <name in paper>):
# - pred_noise_t -> e_theta(x_t, t)
# - pred_original_sample -> f_theta(x_t, t) or x_0
# - std_dev_t -> sigma_t
# - eta -> η
# - pred_sample_direction -> "direction pointing to x_t"
# - pred_prev_sample -> "x_t-1"
# Support IF models
if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]:
model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1)
else:
predicted_variance = None
# 1. get previous step value (=t-1)
prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps
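        # e.g. num_train_timesteps=1000 with 50 inference steps -> each step jumps back 20 timesteps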
# 2. compute alphas, betas
alpha_prod_t = self.alphas_cumprod[timestep]
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
# 3. compute predicted original sample from predicted noise also called
# "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
if self.config.prediction_type == "epsilon":
pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
pred_epsilon = model_output
elif self.config.prediction_type == "sample":
pred_original_sample = model_output
pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
elif self.config.prediction_type == "v_prediction":
pred_original_sample = (alpha_prod_t ** 0.5) * sample - (beta_prod_t ** 0.5) * model_output
pred_epsilon = (alpha_prod_t ** 0.5) * model_output + (beta_prod_t ** 0.5) * sample
else:
raise ValueError(
f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or"
" `v_prediction`"
)
# 4. Clip or threshold "predicted x_0"
if self.config.thresholding:
pred_original_sample = self._threshold_sample(pred_original_sample)
elif self.config.clip_sample:
pred_original_sample = pred_original_sample.clamp(
-self.config.clip_sample_range, self.config.clip_sample_range
)
# 5. compute variance: "sigma_t(η)" -> see formula (16)
# σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
variance = self._get_variance(timestep, prev_timestep)
std_dev_t = eta * variance ** (0.5)
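        # eta = 0.0 gives deterministic DDIM sampling; eta = 1.0 matches the DDPM noise level (Eq. (16))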
if use_clipped_model_output:
# the pred_epsilon is always re-derived from the clipped x_0 in Glide
pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) # [2, 4, 64, 64]
# 6. apply guidance following the formula (14) from https://arxiv.org/pdf/2105.05233.pdf
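        # Classifier-guidance-style update on the predicted noise:
        #   eps_hat = eps - guidance_scale * sqrt(1 - alpha_bar_t) * score,
        # where `score` plays the role of grad_{x_t} log p(y | x_t) in Eq. (14)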
        if score is not None and guidance_scale > 0.0:  # `indices` selects which batch entries receive guidance; here indices = [0]
if indices is not None:
assert pred_epsilon[indices].shape == score.shape, "pred_epsilon[indices].shape != score.shape"
                pred_epsilon[indices] = pred_epsilon[indices] - guidance_scale * (1 - alpha_prod_t) ** (0.5) * score  # only the selected [1, 4, 64, 64] slice is modified
else:
assert pred_epsilon.shape == score.shape
pred_epsilon = pred_epsilon - guidance_scale * (1 - alpha_prod_t) ** (0.5) * score
# 7. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t ** 2) ** (0.5) * pred_epsilon # [2, 4, 64, 64]
# 8. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction # [2, 4, 64, 64]
if eta > 0:
if variance_noise is not None and generator is not None:
raise ValueError(
"Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
" `variance_noise` stays `None`."
)
if variance_noise is None:
variance_noise = randn_tensor(
model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
)
            variance = std_dev_t * variance_noise  # finally, add a bit of random noise scaled by sigma_t
prev_sample = prev_sample + variance # [2, 4, 64, 64]
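        # Cache the (possibly guided) noise prediction so callers can inspect it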
self.pred_epsilon = pred_epsilon
if not return_dict:
return (prev_sample,)
return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
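
if __name__ == "__main__":
    # Minimal usage sketch: drive the guided `step` with random tensors. The
    # random `model_output` stands in for a real denoising model's prediction
    # and `score` for an external guidance signal (e.g. a classifier gradient);
    # shapes follow the [2, 4, 64, 64] comments above, with guidance applied
    # to batch index 0 via indices=[0].
    scheduler = CustomDDIMScheduler(num_train_timesteps=1000)
    scheduler.set_timesteps(50)
    sample = torch.randn(2, 4, 64, 64)
    for t in scheduler.timesteps:
        model_output = torch.randn_like(sample)  # stand-in for the model's noise prediction
        score = torch.randn(1, 4, 64, 64)        # stand-in guidance signal for indices=[0]
        sample = scheduler.step(
            model_output,
            t,
            sample,
            score=score,
            guidance_scale=1.0,
            indices=[0],
        ).prev_sample
    print("final sample shape:", tuple(sample.shape))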