Spaces:

noamelata
/

Nested-Diffusion

Runtime error

Nested-Diffusion / NestedScheduler.py

noamelata

initial commit

82ad0f2 over 1 year ago

8.83 kB


	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import numpy as np
	import torch
	from diffusers import DDIMScheduler

	from diffusers.utils import BaseOutput


	@dataclass
	class NestedSchedulerOutput(BaseOutput):
	    """
	    Container for the values produced by one call to `NestedScheduler.step`.

	    Args:
	        prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
	            The sample at the previous timestep (x_{t-1}). Feed this back to the model as the next input of
	            the denoising loop.
	        pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images):
	            The model's current estimate of the fully denoised sample (x_{0}). Optional; handy for previewing
	            progress or for guidance.
	    """

	    # x_{t-1}: the only required field.
	    prev_sample: torch.FloatTensor
	    # x_0 estimate; defaults to None when not provided.
	    pred_original_sample: Optional[torch.FloatTensor] = None



	class NestedScheduler(DDIMScheduler):
	    """
	    DDIM scheduler whose inference timesteps are spread between 1 and a caller-chosen maximum timestep.

	    `set_timesteps` builds a descending timestep grid with `np.linspace`, and `step` finds the previous
	    timestep by looking up the current one in that grid rather than subtracting a fixed stride.
	    """

	    def set_timesteps(
	        self,
	        num_inference_steps: int,
	        max_timestep: int = 1000,
	        device: Optional[Union[str, torch.device]] = None,
	    ):
	        """
	        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.

	        Args:
	            num_inference_steps (`int`):
	                the number of diffusion steps used when generating samples with a pre-trained model.
	            max_timestep (`int`):
	                the highest timestep to use when choosing the timesteps. Values above
	                `self.config.num_train_timesteps - 1` are clamped to that value.
	            device (`str` or `torch.device`, optional):
	                device the resulting `self.timesteps` tensor is moved to.

	        Raises:
	            ValueError: if `num_inference_steps` exceeds `self.config.num_train_timesteps`.
	        """
	        if num_inference_steps > self.config.num_train_timesteps:
	            raise ValueError(
	                f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:"
	                f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle"
	                f" maximal {self.config.num_train_timesteps} timesteps."
	            )

	        self.num_inference_steps = num_inference_steps

	        # Never go past the highest trained timestep index.
	        max_timestep = min(self.config.num_train_timesteps - 1, max_timestep)

	        # Ask for at most `max_timestep` points: [1, max_timestep] holds only that many distinct integers,
	        # so more points would round to duplicates.
	        # NOTE: `np.linspace` returns just the start value when a single point is requested, so a
	        # one-step schedule is `[1]`, not `[max_timestep]`.
	        num_points = min(num_inference_steps, max_timestep)
	        ascending = np.linspace(1, max_timestep, num_points).round()

	        # Reverse so the chain runs from the noisiest timestep down; `.copy()` makes the reversed view
	        # contiguous before handing it to torch.
	        timesteps = ascending[::-1].copy().astype(np.int64)
	        self.timesteps = torch.from_numpy(timesteps).to(device)

	    def step(
	        self,
	        model_output: torch.FloatTensor,
	        timestep: int,
	        sample: torch.FloatTensor,
	        eta: float = 0.0,
	        use_clipped_model_output: bool = False,
	        generator=None,
	        variance_noise: Optional[torch.FloatTensor] = None,
	        return_dict: bool = True,
	        override_prediction_type: str = "",
	    ) -> Union[NestedSchedulerOutput, Tuple]:
	        """
	        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
	        process from the learned model outputs (most often the predicted noise).

	        Args:
	            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
	            timestep (`int`): current discrete timestep in the diffusion chain. It must be one of the values in
	                `self.timesteps`, because the previous timestep is found by locating it in that grid.
	            sample (`torch.FloatTensor`):
	                current instance of sample being created by diffusion process.
	            eta (`float`): weight of noise for added noise in diffusion step.
	            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
	                predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
	                `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
	                coincide with the one provided as input and `use_clipped_model_output` will have no effect.
	            generator: random number generator.
	            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
	                can directly provide the noise for the variance itself. This is useful for methods such as
	                CycleDiffusion. (https://arxiv.org/abs/2210.05559)
	            return_dict (`bool`): option for returning tuple rather than NestedSchedulerOutput class
	            override_prediction_type (`str`): when non-empty, used instead of `self.config.prediction_type` to
	                interpret `model_output`. Must be one of `epsilon`, `sample` or `v_prediction`.

	        Returns:
	            [`NestedSchedulerOutput`] or `tuple`:
	            [`NestedSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When
	            returning a tuple, the first element is the sample tensor.

	        Raises:
	            ValueError: if `set_timesteps` has not been called, if the effective prediction type is unknown, or
	                if both `generator` and `variance_noise` are given while `eta > 0`.
	        """
	        if self.num_inference_steps is None:
	            raise ValueError(
	                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
	            )

	        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
	        # Ideally, read DDIM paper in-detail understanding

	        # Notation (<variable name> -> <name in paper>)
	        # - pred_noise_t -> e_theta(x_t, t)
	        # - pred_original_sample -> f_theta(x_t, t) or x_0
	        # - std_dev_t -> sigma_t
	        # - eta -> η
	        # - pred_sample_direction -> "direction pointing to x_t"
	        # - pred_prev_sample -> "x_t-1"

	        # 1. get previous step value: the next entry of the descending grid, or 0 after the last entry.
	        # `.item()` needs exactly one match, so `timestep` has to appear in `self.timesteps`.
	        cur_idx = (self.timesteps == timestep).nonzero().item()
	        prev_timestep = self.timesteps[cur_idx + 1] if cur_idx < len(self.timesteps) - 1 else 0

	        # 2. compute alphas, betas
	        alpha_prod_t = self.alphas_cumprod[timestep]
	        # `prev_timestep` is a grid value or 0 here, so the `final_alpha_cumprod` fallback looks unreachable.
	        alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod

	        beta_prod_t = 1 - alpha_prod_t
	        sqrt_alpha_prod_t = alpha_prod_t ** 0.5
	        sqrt_beta_prod_t = beta_prod_t ** 0.5

	        # 3. compute predicted original sample from predicted noise also called
	        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
	        prediction_type = override_prediction_type if override_prediction_type else self.config.prediction_type
	        if prediction_type == "epsilon":
	            pred_original_sample = (sample - sqrt_beta_prod_t * model_output) / sqrt_alpha_prod_t
	            pred_epsilon = model_output
	        elif prediction_type == "sample":
	            pred_original_sample = model_output
	            pred_epsilon = (sample - sqrt_alpha_prod_t * pred_original_sample) / sqrt_beta_prod_t
	        elif prediction_type == "v_prediction":
	            # v = sqrt(alpha) * eps - sqrt(beta) * x_0, solved for x_0 and eps.
	            pred_original_sample = sqrt_alpha_prod_t * sample - sqrt_beta_prod_t * model_output
	            pred_epsilon = sqrt_alpha_prod_t * model_output + sqrt_beta_prod_t * sample
	        else:
	            # Report the value actually used, which may come from `override_prediction_type`.
	            raise ValueError(
	                f"prediction_type given as {prediction_type} must be one of `epsilon`, `sample`, or"
	                " `v_prediction`"
	            )

	        # 4. Clip or threshold "predicted x_0"
	        if self.config.thresholding:
	            pred_original_sample = self._threshold_sample(pred_original_sample)
	        elif self.config.clip_sample:
	            pred_original_sample = pred_original_sample.clamp(
	                -self.config.clip_sample_range, self.config.clip_sample_range
	            )

	        # 5. compute variance: "sigma_t(η)" -> see formula (16)
	        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
	        variance = self._get_variance(timestep, prev_timestep)
	        std_dev_t = eta * variance ** 0.5

	        if use_clipped_model_output:
	            # the pred_epsilon is always re-derived from the clipped x_0 in Glide
	            pred_epsilon = (sample - sqrt_alpha_prod_t * pred_original_sample) / sqrt_beta_prod_t

	        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
	        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t ** 2) ** 0.5 * pred_epsilon

	        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
	        prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction

	        # 8. add the stochastic term sigma_t * noise; skipped entirely for deterministic DDIM (eta == 0)
	        if eta > 0:
	            if variance_noise is not None and generator is not None:
	                raise ValueError(
	                    "Cannot pass both generator and variance_noise. Please make sure that either `generator` or"
	                    " `variance_noise` stays `None`."
	                )

	            if variance_noise is None:
	                variance_noise = torch.randn(
	                    model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype
	                )

	            prev_sample = prev_sample + std_dev_t * variance_noise

	        if not return_dict:
	            return (prev_sample,)

	        return NestedSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)