wooyeolbaek committed
Commit c71e173 · verified · 1 Parent(s): 5cb0966

Create modules.py

Files changed (1):
  1. modules.py +1757 -0
modules.py ADDED
@@ -0,0 +1,1757 @@
1
+ import math
2
+ import inspect
3
+ import numpy as np
4
+ from typing import Any, Dict, Optional, Tuple, Union, List, Callable
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+
10
+ from diffusers.models.attention import _chunked_feed_forward
11
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput
12
+ from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput
13
+ from diffusers.pipelines.flux.pipeline_flux import (
14
+ retrieve_timesteps,
15
+ replace_example_docstring,
16
+ EXAMPLE_DOC_STRING,
17
+ calculate_shift,
18
+ XLA_AVAILABLE,
19
+ FluxPipelineOutput
20
+ )
21
+ # from diffusers.models.transformers import FLUXTransformer2DModel
22
+ from diffusers.utils import (
23
+ deprecate,
24
+ BaseOutput,
25
+ is_torch_version,
26
+ logging,
27
+ USE_PEFT_BACKEND,
28
+ scale_lora_layers,
29
+ unscale_lora_layers,
30
+ )
31
+ from diffusers.models.attention_processor import (
32
+ Attention,
33
+ AttnProcessor,
34
+ AttnProcessor2_0,
35
+ )
+ 
+ # `xm.mark_step()` in the denoising loop needs torch_xla; import it only when available.
+ if XLA_AVAILABLE:
+ import torch_xla.core.xla_model as xm
36
+
37
+
38
+ logger = logging.get_logger(__name__)
39
+
40
+
41
+ attn_maps = {}
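+ # Global store for attention maps. The patched attention processors used alongside
+ # this module are expected to write into this dict during sampling (e.g. keyed by
+ # denoising timestep and layer name) so the maps can be inspected after generation.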
42
+
43
+
44
+ @torch.no_grad()
45
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
46
+ def FluxPipeline_call(
47
+ self,
48
+ prompt: Union[str, List[str]] = None,
49
+ prompt_2: Optional[Union[str, List[str]]] = None,
50
+ height: Optional[int] = None,
51
+ width: Optional[int] = None,
52
+ num_inference_steps: int = 28,
53
+ timesteps: List[int] = None,
54
+ guidance_scale: float = 3.5,
55
+ num_images_per_prompt: Optional[int] = 1,
56
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
57
+ latents: Optional[torch.FloatTensor] = None,
58
+ prompt_embeds: Optional[torch.FloatTensor] = None,
59
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
60
+ output_type: Optional[str] = "pil",
61
+ return_dict: bool = True,
62
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
63
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
64
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
65
+ max_sequence_length: int = 512,
66
+ ):
67
+ r"""
68
+ Function invoked when calling the pipeline for generation.
69
+
70
+ Args:
71
+ prompt (`str` or `List[str]`, *optional*):
72
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
73
+ instead.
74
+ prompt_2 (`str` or `List[str]`, *optional*):
75
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
76
+ will be used instead
77
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
78
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
79
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
80
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
81
+ num_inference_steps (`int`, *optional*, defaults to 28):
82
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
83
+ expense of slower inference.
84
+ timesteps (`List[int]`, *optional*):
85
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
86
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
87
+ passed will be used. Must be in descending order.
88
+ guidance_scale (`float`, *optional*, defaults to 3.5):
89
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
90
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
91
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
92
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
93
+ usually at the expense of lower image quality.
94
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
95
+ The number of images to generate per prompt.
96
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
97
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
98
+ to make generation deterministic.
99
+ latents (`torch.FloatTensor`, *optional*):
100
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
101
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
102
+ tensor will be generated by sampling using the supplied random `generator`.
103
+ prompt_embeds (`torch.FloatTensor`, *optional*):
104
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
105
+ provided, text embeddings will be generated from `prompt` input argument.
106
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
107
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
108
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
109
+ output_type (`str`, *optional*, defaults to `"pil"`):
110
+ The output format of the generated image. Choose between
111
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
112
+ return_dict (`bool`, *optional*, defaults to `True`):
113
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
114
+ joint_attention_kwargs (`dict`, *optional*):
115
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
116
+ `self.processor` in
117
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
118
+ callback_on_step_end (`Callable`, *optional*):
119
+ A function that is called at the end of each denoising step during inference. The function is called
120
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
121
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
122
+ `callback_on_step_end_tensor_inputs`.
123
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
124
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
125
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
126
+ `._callback_tensor_inputs` attribute of your pipeline class.
127
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
128
+
129
+ Examples:
130
+
131
+ Returns:
132
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
133
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
134
+ images.
135
+ """
136
+
137
+ height = height or self.default_sample_size * self.vae_scale_factor
138
+ width = width or self.default_sample_size * self.vae_scale_factor
139
+
140
+ # 1. Check inputs. Raise error if not correct
141
+ self.check_inputs(
142
+ prompt,
143
+ prompt_2,
144
+ height,
145
+ width,
146
+ prompt_embeds=prompt_embeds,
147
+ pooled_prompt_embeds=pooled_prompt_embeds,
148
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
149
+ max_sequence_length=max_sequence_length,
150
+ )
151
+
152
+ self._guidance_scale = guidance_scale
153
+ self._joint_attention_kwargs = joint_attention_kwargs
154
+ self._interrupt = False
155
+
156
+ # 2. Define call parameters
157
+ if prompt is not None and isinstance(prompt, str):
158
+ batch_size = 1
159
+ elif prompt is not None and isinstance(prompt, list):
160
+ batch_size = len(prompt)
161
+ else:
162
+ batch_size = prompt_embeds.shape[0]
163
+
164
+ device = self._execution_device
165
+
166
+ lora_scale = (
167
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
168
+ )
169
+ (
170
+ prompt_embeds,
171
+ pooled_prompt_embeds,
172
+ text_ids,
173
+ ) = self.encode_prompt(
174
+ prompt=prompt,
175
+ prompt_2=prompt_2,
176
+ prompt_embeds=prompt_embeds,
177
+ pooled_prompt_embeds=pooled_prompt_embeds,
178
+ device=device,
179
+ num_images_per_prompt=num_images_per_prompt,
180
+ max_sequence_length=max_sequence_length,
181
+ lora_scale=lora_scale,
182
+ )
183
+
184
+ # 4. Prepare latent variables
185
+ num_channels_latents = self.transformer.config.in_channels // 4
186
+ latents, latent_image_ids = self.prepare_latents(
187
+ batch_size * num_images_per_prompt,
188
+ num_channels_latents,
189
+ height,
190
+ width,
191
+ prompt_embeds.dtype,
192
+ device,
193
+ generator,
194
+ latents,
195
+ )
196
+
197
+ # 5. Prepare timesteps
198
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
199
+ image_seq_len = latents.shape[1]
200
+ mu = calculate_shift(
201
+ image_seq_len,
202
+ self.scheduler.config.base_image_seq_len,
203
+ self.scheduler.config.max_image_seq_len,
204
+ self.scheduler.config.base_shift,
205
+ self.scheduler.config.max_shift,
206
+ )
207
+ timesteps, num_inference_steps = retrieve_timesteps(
208
+ self.scheduler,
209
+ num_inference_steps,
210
+ device,
211
+ timesteps,
212
+ sigmas,
213
+ mu=mu,
214
+ )
215
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
216
+ self._num_timesteps = len(timesteps)
217
+
218
+ # handle guidance
219
+ if self.transformer.config.guidance_embeds:
220
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
221
+ guidance = guidance.expand(latents.shape[0])
222
+ else:
223
+ guidance = None
224
+
225
+ # 6. Denoising loop
226
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
227
+ for i, t in enumerate(timesteps):
228
+ if self.interrupt:
229
+ continue
230
+
231
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
232
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
233
+
234
+ noise_pred = self.transformer(
235
+ hidden_states=latents,
236
+ timestep=timestep / 1000,
237
+ guidance=guidance,
238
+ pooled_projections=pooled_prompt_embeds,
239
+ encoder_hidden_states=prompt_embeds,
240
+ txt_ids=text_ids,
241
+ img_ids=latent_image_ids,
242
+ joint_attention_kwargs=self.joint_attention_kwargs,
243
+ return_dict=False,
244
+ ##################################################
245
+ height=height,
246
+ ##################################################
247
+ )[0]
248
+
249
+ # compute the previous noisy sample x_t -> x_t-1
250
+ latents_dtype = latents.dtype
251
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
252
+
253
+ if latents.dtype != latents_dtype:
254
+ if torch.backends.mps.is_available():
255
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
256
+ latents = latents.to(latents_dtype)
257
+
258
+ if callback_on_step_end is not None:
259
+ callback_kwargs = {}
260
+ for k in callback_on_step_end_tensor_inputs:
261
+ callback_kwargs[k] = locals()[k]
262
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
263
+
264
+ latents = callback_outputs.pop("latents", latents)
265
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
266
+
267
+ # call the callback, if provided
268
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
269
+ progress_bar.update()
270
+
271
+ if XLA_AVAILABLE:
272
+ xm.mark_step()
273
+
274
+ if output_type == "latent":
275
+ image = latents
276
+
277
+ else:
278
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
279
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
280
+ image = self.vae.decode(latents, return_dict=False)[0]
281
+ image = self.image_processor.postprocess(image, output_type=output_type)
282
+
283
+ # Offload all models
284
+ self.maybe_free_model_hooks()
285
+
286
+ if not return_dict:
287
+ return (image,)
288
+
289
+ return FluxPipelineOutput(images=image)
290
+
291
+
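+ # Illustrative usage sketch (not part of the original commit): `FluxPipeline_call`
+ # mirrors `FluxPipeline.__call__` but additionally forwards `height` to the
+ # transformer so the patched blocks can reshape and store attention maps. One
+ # plausible way to wire it up is to bind it onto the pipeline class; the model id
+ # below is only an example.
+ #
+ #   import torch
+ #   from diffusers import FluxPipeline
+ #
+ #   FluxPipeline.__call__ = FluxPipeline_call
+ #   pipe = FluxPipeline.from_pretrained(
+ #       "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
+ #   ).to("cuda")
+ #   image = pipe("a cat holding a sign", num_inference_steps=28).images[0]
+ #   # `attn_maps` should now hold whatever the patched attention processors recorded.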
292
+ def UNet2DConditionModelForward(
293
+ self,
294
+ sample: torch.Tensor,
295
+ timestep: Union[torch.Tensor, float, int],
296
+ encoder_hidden_states: torch.Tensor,
297
+ class_labels: Optional[torch.Tensor] = None,
298
+ timestep_cond: Optional[torch.Tensor] = None,
299
+ attention_mask: Optional[torch.Tensor] = None,
300
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
301
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
302
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
303
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
304
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
305
+ encoder_attention_mask: Optional[torch.Tensor] = None,
306
+ return_dict: bool = True,
307
+ ) -> Union[UNet2DConditionOutput, Tuple]:
308
+ r"""
309
+ The [`UNet2DConditionModel`] forward method.
310
+
311
+ Args:
312
+ sample (`torch.Tensor`):
313
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
314
+ timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
315
+ encoder_hidden_states (`torch.Tensor`):
316
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
317
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
318
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
319
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
320
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
321
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
322
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
323
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
324
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
325
+ negative values to the attention scores corresponding to "discard" tokens.
326
+ cross_attention_kwargs (`dict`, *optional*):
327
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
328
+ `self.processor` in
329
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
330
+ added_cond_kwargs: (`dict`, *optional*):
331
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
332
+ are passed along to the UNet blocks.
333
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
334
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
335
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
336
+ A tensor that if specified is added to the residual of the middle unet block.
337
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
338
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
339
+ encoder_attention_mask (`torch.Tensor`):
340
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
341
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
342
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
343
+ return_dict (`bool`, *optional*, defaults to `True`):
344
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
345
+ tuple.
346
+
347
+ Returns:
348
+ [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
349
+ If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned,
350
+ otherwise a `tuple` is returned where the first element is the sample tensor.
351
+ """
352
+ # By default samples have to be at least a multiple of the overall upsampling factor.
353
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
354
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
355
+ # on the fly if necessary.
356
+ default_overall_up_factor = 2**self.num_upsamplers
357
+
358
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
359
+ forward_upsample_size = False
360
+ upsample_size = None
361
+
362
+ for dim in sample.shape[-2:]:
363
+ if dim % default_overall_up_factor != 0:
364
+ # Forward upsample size to force interpolation output size.
365
+ forward_upsample_size = True
366
+ break
367
+
368
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
369
+ # expects mask of shape:
370
+ # [batch, key_tokens]
371
+ # adds singleton query_tokens dimension:
372
+ # [batch, 1, key_tokens]
373
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
374
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
375
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
376
+ if attention_mask is not None:
377
+ # assume that mask is expressed as:
378
+ # (1 = keep, 0 = discard)
379
+ # convert mask into a bias that can be added to attention scores:
380
+ # (keep = +0, discard = -10000.0)
381
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
382
+ attention_mask = attention_mask.unsqueeze(1)
383
+
384
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
385
+ if encoder_attention_mask is not None:
386
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
387
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
388
+
389
+ # 0. center input if necessary
390
+ if self.config.center_input_sample:
391
+ sample = 2 * sample - 1.0
392
+
393
+ # 1. time
394
+ t_emb = self.get_time_embed(sample=sample, timestep=timestep)
395
+ emb = self.time_embedding(t_emb, timestep_cond)
396
+ aug_emb = None
397
+
398
+ class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
399
+ if class_emb is not None:
400
+ if self.config.class_embeddings_concat:
401
+ emb = torch.cat([emb, class_emb], dim=-1)
402
+ else:
403
+ emb = emb + class_emb
404
+
405
+ aug_emb = self.get_aug_embed(
406
+ emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
407
+ )
408
+ if self.config.addition_embed_type == "image_hint":
409
+ aug_emb, hint = aug_emb
410
+ sample = torch.cat([sample, hint], dim=1)
411
+
412
+ emb = emb + aug_emb if aug_emb is not None else emb
413
+
414
+ if self.time_embed_act is not None:
415
+ emb = self.time_embed_act(emb)
416
+
417
+ encoder_hidden_states = self.process_encoder_hidden_states(
418
+ encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
419
+ )
420
+
421
+ # 2. pre-process
422
+ sample = self.conv_in(sample)
423
+
424
+ # 2.5 GLIGEN position net
425
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
426
+ cross_attention_kwargs = cross_attention_kwargs.copy()
427
+ gligen_args = cross_attention_kwargs.pop("gligen")
428
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
429
+
430
+ # 3. down
431
+ # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
432
+ # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
433
+ ################################################################################
434
+ if cross_attention_kwargs is None:
435
+ cross_attention_kwargs = {'timestep' : timestep}
436
+ else:
437
+ cross_attention_kwargs['timestep'] = timestep
438
+ ################################################################################
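+ # `timestep` rides along in `cross_attention_kwargs` so that every attention layer
+ # (and the attention processors patched elsewhere in this repo) can associate the
+ # maps it records with the current denoising step.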
439
+
440
+
441
+ if cross_attention_kwargs is not None:
442
+ cross_attention_kwargs = cross_attention_kwargs.copy()
443
+ lora_scale = cross_attention_kwargs.pop("scale", 1.0)
444
+ else:
445
+ lora_scale = 1.0
446
+
447
+ if USE_PEFT_BACKEND:
448
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
449
+ scale_lora_layers(self, lora_scale)
450
+
451
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
452
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
453
+ is_adapter = down_intrablock_additional_residuals is not None
454
+ # maintain backward compatibility for legacy usage, where
455
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
456
+ # but can only use one or the other
457
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
458
+ deprecate(
459
+ "T2I should not use down_block_additional_residuals",
460
+ "1.3.0",
461
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
462
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
463
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
464
+ standard_warn=False,
465
+ )
466
+ down_intrablock_additional_residuals = down_block_additional_residuals
467
+ is_adapter = True
468
+
469
+ down_block_res_samples = (sample,)
470
+ for downsample_block in self.down_blocks:
471
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
472
+ # For t2i-adapter CrossAttnDownBlock2D
473
+ additional_residuals = {}
474
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
475
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
476
+
477
+ sample, res_samples = downsample_block(
478
+ hidden_states=sample,
479
+ temb=emb,
480
+ encoder_hidden_states=encoder_hidden_states,
481
+ attention_mask=attention_mask,
482
+ cross_attention_kwargs=cross_attention_kwargs,
483
+ encoder_attention_mask=encoder_attention_mask,
484
+ **additional_residuals,
485
+ )
486
+ else:
487
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
488
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
489
+ sample += down_intrablock_additional_residuals.pop(0)
490
+
491
+ down_block_res_samples += res_samples
492
+
493
+ if is_controlnet:
494
+ new_down_block_res_samples = ()
495
+
496
+ for down_block_res_sample, down_block_additional_residual in zip(
497
+ down_block_res_samples, down_block_additional_residuals
498
+ ):
499
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
500
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
501
+
502
+ down_block_res_samples = new_down_block_res_samples
503
+
504
+ # 4. mid
505
+ if self.mid_block is not None:
506
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
507
+ sample = self.mid_block(
508
+ sample,
509
+ emb,
510
+ encoder_hidden_states=encoder_hidden_states,
511
+ attention_mask=attention_mask,
512
+ cross_attention_kwargs=cross_attention_kwargs,
513
+ encoder_attention_mask=encoder_attention_mask,
514
+ )
515
+ else:
516
+ sample = self.mid_block(sample, emb)
517
+
518
+ # To support T2I-Adapter-XL
519
+ if (
520
+ is_adapter
521
+ and len(down_intrablock_additional_residuals) > 0
522
+ and sample.shape == down_intrablock_additional_residuals[0].shape
523
+ ):
524
+ sample += down_intrablock_additional_residuals.pop(0)
525
+
526
+ if is_controlnet:
527
+ sample = sample + mid_block_additional_residual
528
+
529
+ # 5. up
530
+ for i, upsample_block in enumerate(self.up_blocks):
531
+ is_final_block = i == len(self.up_blocks) - 1
532
+
533
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
534
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
535
+
536
+ # if we have not reached the final block and need to forward the
537
+ # upsample size, we do it here
538
+ if not is_final_block and forward_upsample_size:
539
+ upsample_size = down_block_res_samples[-1].shape[2:]
540
+
541
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
542
+ sample = upsample_block(
543
+ hidden_states=sample,
544
+ temb=emb,
545
+ res_hidden_states_tuple=res_samples,
546
+ encoder_hidden_states=encoder_hidden_states,
547
+ cross_attention_kwargs=cross_attention_kwargs,
548
+ upsample_size=upsample_size,
549
+ attention_mask=attention_mask,
550
+ encoder_attention_mask=encoder_attention_mask,
551
+ )
552
+ else:
553
+ sample = upsample_block(
554
+ hidden_states=sample,
555
+ temb=emb,
556
+ res_hidden_states_tuple=res_samples,
557
+ upsample_size=upsample_size,
558
+ )
559
+
560
+ # 6. post-process
561
+ if self.conv_norm_out:
562
+ sample = self.conv_norm_out(sample)
563
+ sample = self.conv_act(sample)
564
+ sample = self.conv_out(sample)
565
+
566
+ if USE_PEFT_BACKEND:
567
+ # remove `lora_scale` from each PEFT layer
568
+ unscale_lora_layers(self, lora_scale)
569
+
570
+ if not return_dict:
571
+ return (sample,)
572
+
573
+ return UNet2DConditionOutput(sample=sample)
574
+
575
+
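+ # Like UNet2DConditionModelForward above, the forwards below mirror their upstream
+ # diffusers counterparts; the sections fenced with '####' are the only additions,
+ # plumbing `timestep` and `height` through to the attention layers for
+ # attention-map extraction.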
576
+ def SD3Transformer2DModelForward(
577
+ self,
578
+ hidden_states: torch.FloatTensor,
579
+ encoder_hidden_states: torch.FloatTensor = None,
580
+ pooled_projections: torch.FloatTensor = None,
581
+ timestep: torch.LongTensor = None,
582
+ block_controlnet_hidden_states: List = None,
583
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
584
+ return_dict: bool = True,
585
+ ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
586
+ """
587
+ The [`SD3Transformer2DModel`] forward method.
588
+
589
+ Args:
590
+ hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
591
+ Input `hidden_states`.
592
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
593
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
594
+ pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
595
+ from the embeddings of input conditions.
596
+ timestep ( `torch.LongTensor`):
597
+ Used to indicate denoising step.
598
+ block_controlnet_hidden_states: (`list` of `torch.Tensor`):
599
+ A list of tensors that if specified are added to the residuals of transformer blocks.
600
+ joint_attention_kwargs (`dict`, *optional*):
601
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
602
+ `self.processor` in
603
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
604
+ return_dict (`bool`, *optional*, defaults to `True`):
605
+ Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
606
+ tuple.
607
+
608
+ Returns:
609
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
610
+ `tuple` where the first element is the sample tensor.
611
+ """
612
+ if joint_attention_kwargs is not None:
613
+ joint_attention_kwargs = joint_attention_kwargs.copy()
614
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
615
+ else:
616
+ lora_scale = 1.0
617
+
618
+ if USE_PEFT_BACKEND:
619
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
620
+ scale_lora_layers(self, lora_scale)
621
+ else:
622
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
623
+ logger.warning(
624
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
625
+ )
626
+
627
+ height, width = hidden_states.shape[-2:]
628
+
629
+ hidden_states = self.pos_embed(hidden_states) # takes care of adding positional embeddings too.
630
+ temb = self.time_text_embed(timestep, pooled_projections)
631
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
632
+
633
+ for index_block, block in enumerate(self.transformer_blocks):
634
+ if self.training and self.gradient_checkpointing:
635
+
636
+ def create_custom_forward(module, return_dict=None):
637
+ def custom_forward(*inputs):
638
+ if return_dict is not None:
639
+ return module(*inputs, return_dict=return_dict)
640
+ else:
641
+ return module(*inputs)
642
+
643
+ return custom_forward
644
+
645
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
646
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
647
+ create_custom_forward(block),
648
+ hidden_states,
649
+ encoder_hidden_states,
650
+ temb,
651
+ **ckpt_kwargs,
652
+ )
653
+
654
+ else:
655
+ encoder_hidden_states, hidden_states = block(
656
+ hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb,
657
+ ##########################################################################################
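+ # extra kwargs for the patched JointTransformerBlockForward; `height` is given in
+ # patch units so flattened attention maps can later be reshaped to a 2-D grid.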
658
+ timestep=timestep, height=height // self.config.patch_size,
659
+ ##########################################################################################
660
+ )
661
+
662
+ # controlnet residual
663
+ if block_controlnet_hidden_states is not None and block.context_pre_only is False:
664
+ interval_control = len(self.transformer_blocks) // len(block_controlnet_hidden_states)
665
+ hidden_states = hidden_states + block_controlnet_hidden_states[index_block // interval_control]
666
+
667
+ hidden_states = self.norm_out(hidden_states, temb)
668
+ hidden_states = self.proj_out(hidden_states)
669
+
670
+ # unpatchify
671
+ patch_size = self.config.patch_size
672
+ height = height // patch_size
673
+ width = width // patch_size
674
+
675
+ hidden_states = hidden_states.reshape(
676
+ shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
677
+ )
678
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
679
+ output = hidden_states.reshape(
680
+ shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
681
+ )
682
+
683
+ if USE_PEFT_BACKEND:
684
+ # remove `lora_scale` from each PEFT layer
685
+ unscale_lora_layers(self, lora_scale)
686
+
687
+ if not return_dict:
688
+ return (output,)
689
+
690
+ return Transformer2DModelOutput(sample=output)
691
+
692
+
693
+ def FluxTransformer2DModelForward(
694
+ self,
695
+ hidden_states: torch.Tensor,
696
+ encoder_hidden_states: torch.Tensor = None,
697
+ pooled_projections: torch.Tensor = None,
698
+ timestep: torch.LongTensor = None,
699
+ img_ids: torch.Tensor = None,
700
+ txt_ids: torch.Tensor = None,
701
+ guidance: torch.Tensor = None,
702
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
703
+ controlnet_block_samples=None,
704
+ controlnet_single_block_samples=None,
705
+ return_dict: bool = True,
706
+ controlnet_blocks_repeat: bool = False,
707
+ ##################################################
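+ # extra argument: image height from the pipeline, forwarded (divided by the patch
+ # size) to each transformer block so attention maps can be unflattened later.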
708
+ height: int = None,
709
+ ##################################################
710
+ ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
711
+ """
712
+ The [`FluxTransformer2DModel`] forward method.
713
+
714
+ Args:
715
+ hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
716
+ Input `hidden_states`.
717
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
718
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
719
+ pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
720
+ from the embeddings of input conditions.
721
+ timestep ( `torch.LongTensor`):
722
+ Used to indicate denoising step.
723
+ controlnet_block_samples (`list` of `torch.Tensor`, *optional*):
724
+ A list of tensors that if specified are added to the residuals of transformer blocks.
725
+ joint_attention_kwargs (`dict`, *optional*):
726
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
727
+ `self.processor` in
728
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
729
+ return_dict (`bool`, *optional*, defaults to `True`):
730
+ Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
731
+ tuple.
732
+
733
+ Returns:
734
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
735
+ `tuple` where the first element is the sample tensor.
736
+ """
737
+ if joint_attention_kwargs is not None:
738
+ joint_attention_kwargs = joint_attention_kwargs.copy()
739
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
740
+ else:
741
+ lora_scale = 1.0
742
+
743
+ if USE_PEFT_BACKEND:
744
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
745
+ scale_lora_layers(self, lora_scale)
746
+ else:
747
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
748
+ logger.warning(
749
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
750
+ )
751
+ hidden_states = self.x_embedder(hidden_states)
752
+
753
+ timestep = timestep.to(hidden_states.dtype) * 1000
754
+ if guidance is not None:
755
+ guidance = guidance.to(hidden_states.dtype) * 1000
756
+ else:
757
+ guidance = None
758
+ temb = (
759
+ self.time_text_embed(timestep, pooled_projections)
760
+ if guidance is None
761
+ else self.time_text_embed(timestep, guidance, pooled_projections)
762
+ )
763
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
764
+
765
+ if txt_ids.ndim == 3:
766
+ logger.warning(
767
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
768
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
769
+ )
770
+ txt_ids = txt_ids[0]
771
+ if img_ids.ndim == 3:
772
+ logger.warning(
773
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
774
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
775
+ )
776
+ img_ids = img_ids[0]
777
+
778
+ ids = torch.cat((txt_ids, img_ids), dim=0)
779
+ image_rotary_emb = self.pos_embed(ids)
780
+
781
+ for index_block, block in enumerate(self.transformer_blocks):
782
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
783
+
784
+ def create_custom_forward(module, return_dict=None):
785
+ def custom_forward(*inputs):
786
+ if return_dict is not None:
787
+ return module(*inputs, return_dict=return_dict)
788
+ else:
789
+ return module(*inputs)
790
+
791
+ return custom_forward
792
+
793
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
794
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
795
+ create_custom_forward(block),
796
+ hidden_states,
797
+ encoder_hidden_states,
798
+ temb,
799
+ image_rotary_emb,
800
+ **ckpt_kwargs,
801
+ )
802
+
803
+ else:
804
+ encoder_hidden_states, hidden_states = block(
805
+ hidden_states=hidden_states,
806
+ encoder_hidden_states=encoder_hidden_states,
807
+ temb=temb,
808
+ image_rotary_emb=image_rotary_emb,
809
+ joint_attention_kwargs=joint_attention_kwargs,
810
+ ##########################################################################################
811
+ timestep=timestep, height=height // self.config.patch_size,
812
+ ##########################################################################################
813
+ )
814
+
815
+ # controlnet residual
816
+ if controlnet_block_samples is not None:
817
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
818
+ interval_control = int(np.ceil(interval_control))
819
+ # For Xlabs ControlNet.
820
+ if controlnet_blocks_repeat:
821
+ hidden_states = (
822
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
823
+ )
824
+ else:
825
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
826
+
827
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
828
+
829
+ for index_block, block in enumerate(self.single_transformer_blocks):
830
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
831
+
832
+ def create_custom_forward(module, return_dict=None):
833
+ def custom_forward(*inputs):
834
+ if return_dict is not None:
835
+ return module(*inputs, return_dict=return_dict)
836
+ else:
837
+ return module(*inputs)
838
+
839
+ return custom_forward
840
+
841
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
842
+ hidden_states = torch.utils.checkpoint.checkpoint(
843
+ create_custom_forward(block),
844
+ hidden_states,
845
+ temb,
846
+ image_rotary_emb,
847
+ **ckpt_kwargs,
848
+ )
849
+
850
+ else:
851
+ hidden_states = block(
852
+ hidden_states=hidden_states,
853
+ temb=temb,
854
+ image_rotary_emb=image_rotary_emb,
855
+ joint_attention_kwargs=joint_attention_kwargs,
856
+ )
857
+
858
+ # controlnet residual
859
+ if controlnet_single_block_samples is not None:
860
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
861
+ interval_control = int(np.ceil(interval_control))
862
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
863
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
864
+ + controlnet_single_block_samples[index_block // interval_control]
865
+ )
866
+
867
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
868
+
869
+ hidden_states = self.norm_out(hidden_states, temb)
870
+ output = self.proj_out(hidden_states)
871
+
872
+ if USE_PEFT_BACKEND:
873
+ # remove `lora_scale` from each PEFT layer
874
+ unscale_lora_layers(self, lora_scale)
875
+
876
+ if not return_dict:
877
+ return (output,)
878
+
879
+ return Transformer2DModelOutput(sample=output)
880
+
881
+
882
+ def Transformer2DModelForward(
883
+ self,
884
+ hidden_states: torch.Tensor,
885
+ encoder_hidden_states: Optional[torch.Tensor] = None,
886
+ timestep: Optional[torch.LongTensor] = None,
887
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
888
+ class_labels: Optional[torch.LongTensor] = None,
889
+ cross_attention_kwargs: Dict[str, Any] = None,
890
+ attention_mask: Optional[torch.Tensor] = None,
891
+ encoder_attention_mask: Optional[torch.Tensor] = None,
892
+ return_dict: bool = True,
893
+ ):
894
+ """
895
+ The [`Transformer2DModel`] forward method.
896
+
897
+ Args:
898
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
899
+ Input `hidden_states`.
900
+ encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
901
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
902
+ self-attention.
903
+ timestep ( `torch.LongTensor`, *optional*):
904
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
905
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
906
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
907
+ `AdaLayerZeroNorm`.
908
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
909
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
910
+ `self.processor` in
911
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
912
+ attention_mask ( `torch.Tensor`, *optional*):
913
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
914
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
915
+ negative values to the attention scores corresponding to "discard" tokens.
916
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
917
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
918
+
919
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
920
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
921
+
922
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
923
+ above. This bias will be added to the cross-attention scores.
924
+ return_dict (`bool`, *optional*, defaults to `True`):
925
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
926
+ tuple.
927
+
928
+ Returns:
929
+ If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
930
+ otherwise a `tuple` where the first element is the sample tensor.
931
+ """
932
+ if cross_attention_kwargs is not None:
933
+ if cross_attention_kwargs.get("scale", None) is not None:
934
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
935
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
936
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
937
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
938
+ # expects mask of shape:
939
+ # [batch, key_tokens]
940
+ # adds singleton query_tokens dimension:
941
+ # [batch, 1, key_tokens]
942
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
943
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
944
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
945
+ if attention_mask is not None and attention_mask.ndim == 2:
946
+ # assume that mask is expressed as:
947
+ # (1 = keep, 0 = discard)
948
+ # convert mask into a bias that can be added to attention scores:
949
+ # (keep = +0, discard = -10000.0)
950
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
951
+ attention_mask = attention_mask.unsqueeze(1)
952
+
953
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
954
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
955
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
956
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
957
+
958
+ # 1. Input
959
+ if self.is_input_continuous:
960
+ batch_size, _, height, width = hidden_states.shape
961
+ residual = hidden_states
962
+ hidden_states, inner_dim = self._operate_on_continuous_inputs(hidden_states)
963
+ elif self.is_input_vectorized:
964
+ hidden_states = self.latent_image_embedding(hidden_states)
965
+ elif self.is_input_patches:
966
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
967
+ hidden_states, encoder_hidden_states, timestep, embedded_timestep = self._operate_on_patched_inputs(
968
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs
969
+ )
970
+
971
+ ####################################################################################################
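+ # NOTE: assumes `cross_attention_kwargs` is not None here; the patched
+ # UNet2DConditionModelForward above always injects `timestep` into it before the
+ # transformer blocks run, so the dict exists by the time this is reached.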
972
+ cross_attention_kwargs['height'] = height
973
+ cross_attention_kwargs['width'] = width
974
+ ####################################################################################################
975
+
976
+ # 2. Blocks
977
+ for block in self.transformer_blocks:
978
+ if self.training and self.gradient_checkpointing:
979
+
980
+ def create_custom_forward(module, return_dict=None):
981
+ def custom_forward(*inputs):
982
+ if return_dict is not None:
983
+ return module(*inputs, return_dict=return_dict)
984
+ else:
985
+ return module(*inputs)
986
+
987
+ return custom_forward
988
+
989
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
990
+ hidden_states = torch.utils.checkpoint.checkpoint(
991
+ create_custom_forward(block),
992
+ hidden_states,
993
+ attention_mask,
994
+ encoder_hidden_states,
995
+ encoder_attention_mask,
996
+ timestep,
997
+ cross_attention_kwargs,
998
+ class_labels,
999
+ **ckpt_kwargs,
1000
+ )
1001
+ else:
1002
+ hidden_states = block(
1003
+ hidden_states,
1004
+ attention_mask=attention_mask,
1005
+ encoder_hidden_states=encoder_hidden_states,
1006
+ encoder_attention_mask=encoder_attention_mask,
1007
+ timestep=timestep,
1008
+ cross_attention_kwargs=cross_attention_kwargs,
1009
+ class_labels=class_labels,
1010
+ )
1011
+
1012
+ # 3. Output
1013
+ if self.is_input_continuous:
1014
+ output = self._get_output_for_continuous_inputs(
1015
+ hidden_states=hidden_states,
1016
+ residual=residual,
1017
+ batch_size=batch_size,
1018
+ height=height,
1019
+ width=width,
1020
+ inner_dim=inner_dim,
1021
+ )
1022
+ elif self.is_input_vectorized:
1023
+ output = self._get_output_for_vectorized_inputs(hidden_states)
1024
+ elif self.is_input_patches:
1025
+ output = self._get_output_for_patched_inputs(
1026
+ hidden_states=hidden_states,
1027
+ timestep=timestep,
1028
+ class_labels=class_labels,
1029
+ embedded_timestep=embedded_timestep,
1030
+ height=height,
1031
+ width=width,
1032
+ )
1033
+
1034
+ if not return_dict:
1035
+ return (output,)
1036
+
1037
+ return Transformer2DModelOutput(sample=output)
1038
+
1039
+
1040
+ def BasicTransformerBlockForward(
1041
+ self,
1042
+ hidden_states: torch.Tensor,
1043
+ attention_mask: Optional[torch.Tensor] = None,
1044
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1045
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1046
+ timestep: Optional[torch.LongTensor] = None,
1047
+ cross_attention_kwargs: Dict[str, Any] = None,
1048
+ class_labels: Optional[torch.LongTensor] = None,
1049
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
1050
+ ) -> torch.Tensor:
1051
+ if cross_attention_kwargs is not None:
1052
+ if cross_attention_kwargs.get("scale", None) is not None:
1053
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
1054
+
1055
+ # Notice that normalization is always applied before the real computation in the following blocks.
1056
+ # 0. Self-Attention
1057
+ batch_size = hidden_states.shape[0]
1058
+
1059
+ if self.norm_type == "ada_norm":
1060
+ norm_hidden_states = self.norm1(hidden_states, timestep)
1061
+ elif self.norm_type == "ada_norm_zero":
1062
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
1063
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
1064
+ )
1065
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
1066
+ norm_hidden_states = self.norm1(hidden_states)
1067
+ elif self.norm_type == "ada_norm_continuous":
1068
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
1069
+ elif self.norm_type == "ada_norm_single":
1070
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
1071
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
1072
+ ).chunk(6, dim=1)
1073
+ norm_hidden_states = self.norm1(hidden_states)
1074
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
1075
+ norm_hidden_states = norm_hidden_states.squeeze(1)
1076
+ else:
1077
+ raise ValueError("Incorrect norm used")
1078
+
1079
+ if self.pos_embed is not None:
1080
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
1081
+
1082
+ # 1. Prepare GLIGEN inputs
1083
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
1084
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
1085
+
1086
+ ################################################################################
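+ # Only forward the kwargs that this attention processor actually accepts, so stock
+ # processors keep working while patched ones also receive `timestep`/`height`.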
1087
+ attn_parameters = set(inspect.signature(self.attn1.processor.__call__).parameters.keys())
1088
+ ################################################################################
1089
+
1090
+ attn_output = self.attn1(
1091
+ norm_hidden_states,
1092
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
1093
+ attention_mask=attention_mask,
1094
+ ################################################################################
1095
+ **{k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters},
1096
+ ################################################################################
1097
+ )
1098
+ if self.norm_type == "ada_norm_zero":
1099
+ attn_output = gate_msa.unsqueeze(1) * attn_output
1100
+ elif self.norm_type == "ada_norm_single":
1101
+ attn_output = gate_msa * attn_output
1102
+
1103
+ hidden_states = attn_output + hidden_states
1104
+ if hidden_states.ndim == 4:
1105
+ hidden_states = hidden_states.squeeze(1)
1106
+
1107
+ # 1.2 GLIGEN Control
1108
+ if gligen_kwargs is not None:
1109
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
1110
+
1111
+ # 3. Cross-Attention
1112
+ if self.attn2 is not None:
1113
+ if self.norm_type == "ada_norm":
1114
+ norm_hidden_states = self.norm2(hidden_states, timestep)
1115
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
1116
+ norm_hidden_states = self.norm2(hidden_states)
1117
+ elif self.norm_type == "ada_norm_single":
1118
+ # For PixArt norm2 isn't applied here:
1119
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
1120
+ norm_hidden_states = hidden_states
1121
+ elif self.norm_type == "ada_norm_continuous":
1122
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
1123
+ else:
1124
+ raise ValueError("Incorrect norm")
1125
+
1126
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
1127
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
1128
+
1129
+ attn_output = self.attn2(
1130
+ norm_hidden_states,
1131
+ encoder_hidden_states=encoder_hidden_states,
1132
+ attention_mask=encoder_attention_mask,
1133
+ **cross_attention_kwargs,
1134
+ )
1135
+ hidden_states = attn_output + hidden_states
1136
+
1137
+ # 4. Feed-forward
1138
+ # i2vgen doesn't have this norm 🤷‍♂️
1139
+ if self.norm_type == "ada_norm_continuous":
1140
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
1141
+ elif not self.norm_type == "ada_norm_single":
1142
+ norm_hidden_states = self.norm3(hidden_states)
1143
+
1144
+ if self.norm_type == "ada_norm_zero":
1145
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
1146
+
1147
+ if self.norm_type == "ada_norm_single":
1148
+ norm_hidden_states = self.norm2(hidden_states)
1149
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
1150
+
1151
+ if self._chunk_size is not None:
1152
+ # "feed_forward_chunk_size" can be used to save memory
1153
+ ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
1154
+ else:
1155
+ ff_output = self.ff(norm_hidden_states)
1156
+
1157
+ if self.norm_type == "ada_norm_zero":
1158
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
1159
+ elif self.norm_type == "ada_norm_single":
1160
+ ff_output = gate_mlp * ff_output
1161
+
1162
+ hidden_states = ff_output + hidden_states
1163
+ if hidden_states.ndim == 4:
1164
+ hidden_states = hidden_states.squeeze(1)
1165
+
1166
+ return hidden_states
1167
+
1168
+
1169
+ def JointTransformerBlockForward(
+     self,
+     hidden_states: torch.FloatTensor,
+     encoder_hidden_states: torch.FloatTensor,
+     temb: torch.FloatTensor,
+     ############################################################
+     height: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     ############################################################
+ ):
+     if self.use_dual_attention:
+         norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
+             hidden_states, emb=temb
+         )
+     else:
+         norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+     if self.context_pre_only:
+         norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
+     else:
+         norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+             encoder_hidden_states, emb=temb
+         )
+
+     # Attention.
+     attn_output, context_attn_output = self.attn(
+         hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states,
+         ############################################################
+         timestep=timestep, height=height,
+         ############################################################
+     )
+
+     # Process attention outputs for the `hidden_states`.
+     attn_output = gate_msa.unsqueeze(1) * attn_output
+     hidden_states = hidden_states + attn_output
+
+     if self.use_dual_attention:
+         attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
+         attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
+         hidden_states = hidden_states + attn_output2
+
+     norm_hidden_states = self.norm2(hidden_states)
+     norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+     if self._chunk_size is not None:
+         # "feed_forward_chunk_size" can be used to save memory
+         ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+     else:
+         ff_output = self.ff(norm_hidden_states)
+     ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+     hidden_states = hidden_states + ff_output
+
+     # Process attention outputs for the `encoder_hidden_states`.
+     if self.context_pre_only:
+         encoder_hidden_states = None
+     else:
+         context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+         encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+         norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+         norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+         if self._chunk_size is not None:
+             # "feed_forward_chunk_size" can be used to save memory
+             context_ff_output = _chunked_feed_forward(
+                 self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
+             )
+         else:
+             context_ff_output = self.ff_context(norm_encoder_hidden_states)
+         encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+
+     return encoder_hidden_states, hidden_states
+
+
+ def FluxTransformerBlockForward(
+     self,
+     hidden_states: torch.FloatTensor,
+     encoder_hidden_states: torch.FloatTensor,
+     temb: torch.FloatTensor,
+     image_rotary_emb=None,
+     joint_attention_kwargs=None,
+     ############################################################
+     height: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     ############################################################
+ ):
+     norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
+
+     norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
+         encoder_hidden_states, emb=temb
+     )
+     joint_attention_kwargs = joint_attention_kwargs or {}
+     # Attention.
+     attn_output, context_attn_output = self.attn(
+         hidden_states=norm_hidden_states,
+         encoder_hidden_states=norm_encoder_hidden_states,
+         image_rotary_emb=image_rotary_emb,
+         ############################################################
+         timestep=timestep, height=height,
+         ############################################################
+         **joint_attention_kwargs,
+     )
+
+     # Process attention outputs for the `hidden_states`.
+     attn_output = gate_msa.unsqueeze(1) * attn_output
+     hidden_states = hidden_states + attn_output
+
+     norm_hidden_states = self.norm2(hidden_states)
+     norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+
+     ff_output = self.ff(norm_hidden_states)
+     ff_output = gate_mlp.unsqueeze(1) * ff_output
+
+     hidden_states = hidden_states + ff_output
+
+     # Process attention outputs for the `encoder_hidden_states`.
+     context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
+     encoder_hidden_states = encoder_hidden_states + context_attn_output
+
+     norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
+     norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
+
+     context_ff_output = self.ff_context(norm_encoder_hidden_states)
+     encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
+     if encoder_hidden_states.dtype == torch.float16:
+         encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+
+     return encoder_hidden_states, hidden_states
+
+
+ def attn_call(
+     self,
+     attn: Attention,
+     hidden_states: torch.Tensor,
+     encoder_hidden_states: Optional[torch.Tensor] = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     temb: Optional[torch.Tensor] = None,
+     height: int = None,
+     width: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     *args,
+     **kwargs,
+ ) -> torch.Tensor:
+     if len(args) > 0 or kwargs.get("scale", None) is not None:
+         deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+         deprecate("scale", "1.0.0", deprecation_message)
+
+     residual = hidden_states
+
+     if attn.spatial_norm is not None:
+         hidden_states = attn.spatial_norm(hidden_states, temb)
+
+     input_ndim = hidden_states.ndim
+
+     if input_ndim == 4:
+         batch_size, channel, height, width = hidden_states.shape
+         hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+     batch_size, sequence_length, _ = (
+         hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+     )
+     attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+     if attn.group_norm is not None:
+         hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+     query = attn.to_q(hidden_states)
+
+     if encoder_hidden_states is None:
+         encoder_hidden_states = hidden_states
+     elif attn.norm_cross:
+         encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+     key = attn.to_k(encoder_hidden_states)
+     value = attn.to_v(encoder_hidden_states)
+
+     query = attn.head_to_batch_dim(query)
+     key = attn.head_to_batch_dim(key)
+     value = attn.head_to_batch_dim(value)
+
+     attention_probs = attn.get_attention_scores(query, key, attention_mask)
+     ####################################################################################################
+     if hasattr(self, "store_attn_map"):
+         self.attn_map = rearrange(attention_probs, 'b (h w) d -> b d h w', h=height)
+         self.timestep = int(timestep.item())
+     ####################################################################################################
+     hidden_states = torch.bmm(attention_probs, value)
+     hidden_states = attn.batch_to_head_dim(hidden_states)
+
+     # linear proj
+     hidden_states = attn.to_out[0](hidden_states)
+     # dropout
+     hidden_states = attn.to_out[1](hidden_states)
+
+     if input_ndim == 4:
+         hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+     if attn.residual_connection:
+         hidden_states = hidden_states + residual
+
+     hidden_states = hidden_states / attn.rescale_output_factor
+
+     return hidden_states
+
+
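+ # Minimal usage sketch (hypothetical helper, not part of the original module): attn_call stores
+ # the map before batch_to_head_dim is undone, so the heads are still folded into the batch
+ # dimension and self.attn_map has shape (batch * heads, text_len, height, width). Splitting the
+ # heads back out needs the head count, passed explicitly here.
+ def _example_unfold_attn_call_heads(processor, num_heads):
+     attn_map = processor.attn_map  # (batch * heads, text_len, height, width)
+     bh, text_len, h, w = attn_map.shape
+     return attn_map.reshape(bh // num_heads, num_heads, text_len, h, w)
+
+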
+ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
+     # Reference implementation of torch.nn.functional.scaled_dot_product_attention
+     # that additionally returns the softmaxed attention weights.
+     L, S = query.size(-2), key.size(-2)
+     scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
+     attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
+     if is_causal:
+         assert attn_mask is None
+         temp_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
+         attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
+         attn_bias = attn_bias.to(query.dtype)
+
+     if attn_mask is not None:
+         if attn_mask.dtype == torch.bool:
+             attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
+         else:
+             attn_bias += attn_mask
+     attn_weight = query @ key.transpose(-2, -1) * scale_factor
+     attn_weight += attn_bias
+     attn_weight = torch.softmax(attn_weight, dim=-1)
+
+     return torch.dropout(attn_weight, dropout_p, train=True) @ value, attn_weight
+
+
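+ # Small sanity-check sketch (hypothetical helper): the reference implementation above mirrors
+ # torch.nn.functional.scaled_dot_product_attention but also returns the attention weights that
+ # the store_attn_map branches below record. With dropout_p=0.0 both paths are deterministic and
+ # should agree up to numerical tolerance.
+ def _example_check_reference_sdpa():
+     query = torch.randn(1, 8, 16, 64)
+     key = torch.randn(1, 8, 16, 64)
+     value = torch.randn(1, 8, 16, 64)
+     out_ref, attn_weight = scaled_dot_product_attention(query, key, value, dropout_p=0.0)
+     out_torch = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0)
+     assert torch.allclose(out_ref, out_torch, atol=1e-5)
+     assert attn_weight.shape == (1, 8, 16, 16)
+
+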
+ def attn_call2_0(
+     self,
+     attn: Attention,
+     hidden_states: torch.Tensor,
+     encoder_hidden_states: Optional[torch.Tensor] = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     temb: Optional[torch.Tensor] = None,
+     height: int = None,
+     width: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     *args,
+     **kwargs,
+ ) -> torch.Tensor:
+     if len(args) > 0 or kwargs.get("scale", None) is not None:
+         deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+         deprecate("scale", "1.0.0", deprecation_message)
+
+     residual = hidden_states
+     if attn.spatial_norm is not None:
+         hidden_states = attn.spatial_norm(hidden_states, temb)
+
+     input_ndim = hidden_states.ndim
+
+     if input_ndim == 4:
+         batch_size, channel, height, width = hidden_states.shape
+         hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+     batch_size, sequence_length, _ = (
+         hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+     )
+
+     if attention_mask is not None:
+         attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+         # scaled_dot_product_attention expects attention_mask shape to be
+         # (batch, heads, source_length, target_length)
+         attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+     if attn.group_norm is not None:
+         hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+     query = attn.to_q(hidden_states)
+
+     if encoder_hidden_states is None:
+         encoder_hidden_states = hidden_states
+     elif attn.norm_cross:
+         encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+     key = attn.to_k(encoder_hidden_states)
+     value = attn.to_v(encoder_hidden_states)
+
+     inner_dim = key.shape[-1]
+     head_dim = inner_dim // attn.heads
+
+     query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+     key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+     value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+     # the output of sdp = (batch, num_heads, seq_len, head_dim)
+     # TODO: add support for attn.scale when we move to Torch 2.1
+     ####################################################################################################
+     if hasattr(self, "store_attn_map"):
+         hidden_states, attention_probs = scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+         # unflatten the query axis: (batch, heads, h*w, text_len) -> (batch, heads, h, w, text_len)
+         self.attn_map = rearrange(attention_probs, 'batch attn_head (h w) attn_dim -> batch attn_head h w attn_dim', h=height)
+         self.timestep = int(timestep.item())
+     else:
+         hidden_states = F.scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+     ####################################################################################################
+
+     hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)  # (b, heads, h*w, head_dim) -> (b, h*w, heads*head_dim)
+     hidden_states = hidden_states.to(query.dtype)
+
+     # linear proj
+     hidden_states = attn.to_out[0](hidden_states)
+     # dropout
+     hidden_states = attn.to_out[1](hidden_states)
+
+     if input_ndim == 4:
+         hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+     if attn.residual_connection:
+         hidden_states = hidden_states + residual
+
+     hidden_states = hidden_states / attn.rescale_output_factor
+
+     return hidden_states
+
+
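+ # Minimal usage sketch (hypothetical helper, not part of the original module): once a processor
+ # that routes through attn_call2_0 has `store_attn_map` set, its per-head cross-attention map can
+ # be read back and reduced over heads.
+ def _example_read_attn_map(processor):
+     attn_map = processor.attn_map          # (batch, heads, height, width, text_len)
+     timestep = processor.timestep          # python int
+     per_token_map = attn_map.mean(dim=1)   # average over heads -> (batch, height, width, text_len)
+     return per_token_map, timestep
+
+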
+ def lora_attn_call(self, attn: Attention, hidden_states, height, width, *args, **kwargs):
+     self_cls_name = self.__class__.__name__
+     deprecate(
+         self_cls_name,
+         "0.26.0",
+         (
+             f"Make sure to use {self_cls_name[4:]} instead by setting "
+             "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. This will be done automatically when using"
+             " `LoraLoaderMixin.load_lora_weights`"
+         ),
+     )
+     attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device)
+     attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device)
+     attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device)
+     attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device)
+
+     attn._modules.pop("processor")
+     attn.processor = AttnProcessor()
+     ####################################################################################################
+     attn.processor.__call__ = attn_call.__get__(attn.processor, AttnProcessor)
+     ####################################################################################################
+
+     if hasattr(self, "store_attn_map"):
+         attn.processor.store_attn_map = True
+
+     return attn.processor(attn, hidden_states, height, width, *args, **kwargs)
+
+
+ def lora_attn_call2_0(self, attn: Attention, hidden_states, height, width, *args, **kwargs):
+     self_cls_name = self.__class__.__name__
+     deprecate(
+         self_cls_name,
+         "0.26.0",
+         (
+             f"Make sure to use {self_cls_name[4:]} instead by setting "
+             "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. This will be done automatically when using"
+             " `LoraLoaderMixin.load_lora_weights`"
+         ),
+     )
+     attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device)
+     attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device)
+     attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device)
+     attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device)
+
+     attn._modules.pop("processor")
+     attn.processor = AttnProcessor2_0()
+     ####################################################################################################
+     attn.processor.__call__ = attn_call2_0.__get__(attn.processor, AttnProcessor2_0)
+     ####################################################################################################
+
+     if hasattr(self, "store_attn_map"):
+         attn.processor.store_attn_map = True
+
+     return attn.processor(attn, hidden_states, height, width, *args, **kwargs)
+
+
+ def joint_attn_call2_0(
+     self,
+     attn: Attention,
+     hidden_states: torch.FloatTensor,
+     encoder_hidden_states: torch.FloatTensor = None,
+     attention_mask: Optional[torch.FloatTensor] = None,
+     ############################################################
+     height: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     ############################################################
+     *args,
+     **kwargs,
+ ) -> torch.FloatTensor:
+     residual = hidden_states
+
+     batch_size = hidden_states.shape[0]
+
+     # `sample` projections.
+     query = attn.to_q(hidden_states)
+     key = attn.to_k(hidden_states)
+     value = attn.to_v(hidden_states)
+
+     inner_dim = key.shape[-1]
+     head_dim = inner_dim // attn.heads
+
+     query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+     key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+     value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+     if attn.norm_q is not None:
+         query = attn.norm_q(query)
+     if attn.norm_k is not None:
+         key = attn.norm_k(key)
+
+     # `context` projections.
+     if encoder_hidden_states is not None:
+         encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+         encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+         encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+         encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+         encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+         encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+
+         if attn.norm_added_q is not None:
+             encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+         if attn.norm_added_k is not None:
+             encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+         query = torch.cat([query, encoder_hidden_states_query_proj], dim=2)
+         key = torch.cat([key, encoder_hidden_states_key_proj], dim=2)
+         value = torch.cat([value, encoder_hidden_states_value_proj], dim=2)
+
+     ####################################################################################################
+     if hasattr(self, "store_attn_map"):
+         hidden_states, attention_probs = scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+
+         image_length = query.shape[2] - encoder_hidden_states_query_proj.shape[2]
+
+         # keep only the image-query -> text-key block, e.g. (4, 24, 4429, 4429) -> (4, 24, 4096, 333)
+         attention_probs = attention_probs[:, :, :image_length, image_length:].cpu()
+
+         self.attn_map = rearrange(
+             attention_probs,
+             'batch attn_head (height width) attn_dim -> batch attn_head height width attn_dim',
+             height=height
+         )  # (4, 24, 4096, 333) -> (4, 24, height, width, 333)
+         self.timestep = timestep[0].cpu().item()  # TODO: int -> list
+     else:
+         hidden_states = F.scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+     ####################################################################################################
+
+     hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+     hidden_states = hidden_states.to(query.dtype)
+
+     if encoder_hidden_states is not None:
+         # Split the attention outputs.
+         hidden_states, encoder_hidden_states = (
+             hidden_states[:, : residual.shape[1]],
+             hidden_states[:, residual.shape[1] :],
+         )
+         if not attn.context_pre_only:
+             encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+     # linear proj
+     hidden_states = attn.to_out[0](hidden_states)
+     # dropout
+     hidden_states = attn.to_out[1](hidden_states)
+
+     if encoder_hidden_states is not None:
+         return hidden_states, encoder_hidden_states
+     else:
+         return hidden_states
+
+
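+ # Minimal post-processing sketch (hypothetical helper): joint_attn_call2_0 stores only the
+ # image-query -> text-key block of the attention matrix, already moved to CPU, with shape
+ # (batch, heads, height, width, text_len). One way to turn it into per-token heatmaps:
+ def _example_token_heatmaps(processor):
+     attn_map = processor.attn_map.float()                 # (batch, heads, height, width, text_len)
+     heatmaps = attn_map.mean(dim=1)                       # average heads -> (batch, height, width, text_len)
+     heatmaps = heatmaps.permute(0, 3, 1, 2)               # -> (batch, text_len, height, width)
+     heatmaps = heatmaps / heatmaps.amax(dim=(-1, -2), keepdim=True).clamp(min=1e-8)
+     return heatmaps
+
+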
+ # FluxAttnProcessor2_0
+ def flux_attn_call2_0(
+     self,
+     attn: Attention,
+     hidden_states: torch.FloatTensor,
+     encoder_hidden_states: torch.FloatTensor = None,
+     attention_mask: Optional[torch.FloatTensor] = None,
+     image_rotary_emb: Optional[torch.Tensor] = None,
+     ############################################################
+     height: int = None,
+     timestep: Optional[torch.Tensor] = None,
+     ############################################################
+ ) -> torch.FloatTensor:
+     batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+
+     # `sample` projections.
+     query = attn.to_q(hidden_states)
+     key = attn.to_k(hidden_states)
+     value = attn.to_v(hidden_states)
+
+     inner_dim = key.shape[-1]
+     head_dim = inner_dim // attn.heads
+
+     query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+     key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+     value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+     if attn.norm_q is not None:
+         query = attn.norm_q(query)
+     if attn.norm_k is not None:
+         key = attn.norm_k(key)
+
+     # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
+     if encoder_hidden_states is not None:
+         # `context` projections.
+         encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+         encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+         encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+         encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+         encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+         encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
+             batch_size, -1, attn.heads, head_dim
+         ).transpose(1, 2)
+
+         if attn.norm_added_q is not None:
+             encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
+         if attn.norm_added_k is not None:
+             encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
+
+         # attention
+         query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
+         key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
+         value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
+
+     if image_rotary_emb is not None:
+         from diffusers.models.embeddings import apply_rotary_emb
+
+         query = apply_rotary_emb(query, image_rotary_emb)
+         key = apply_rotary_emb(key, image_rotary_emb)
+
+     ####################################################################################################
+     if hasattr(self, "store_attn_map"):
+         hidden_states, attention_probs = scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+
+         image_length = query.shape[2] - encoder_hidden_states_query_proj.shape[2]
+
+         # text tokens come first in the Flux sequence, so keep the image-query -> text-key block,
+         # e.g. (1, 24, 4608, 4608) -> (1, 24, 4096, 512)
+         attention_probs = attention_probs[:, :, -image_length:, :-image_length].cpu()
+
+         self.attn_map = rearrange(
+             attention_probs,
+             'batch attn_head (height width) attn_dim -> batch attn_head height width attn_dim',
+             height=height
+         )  # -> (batch, attn_head, height, width, text_len)
+         self.timestep = timestep[0].cpu().item()  # TODO: int -> list
+     else:
+         hidden_states = F.scaled_dot_product_attention(
+             query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+         )
+     ####################################################################################################
+     hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+     hidden_states = hidden_states.to(query.dtype)
+
+     if encoder_hidden_states is not None:
+         encoder_hidden_states, hidden_states = (
+             hidden_states[:, : encoder_hidden_states.shape[1]],
+             hidden_states[:, encoder_hidden_states.shape[1] :],
+         )
+
+         # linear proj
+         hidden_states = attn.to_out[0](hidden_states)
+         # dropout
+         hidden_states = attn.to_out[1](hidden_states)
+
+         encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+         return hidden_states, encoder_hidden_states
+     else:
+         return hidden_states
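+
+
+ # Minimal hook-up sketch (hypothetical helper; real pipelines may need different module or
+ # attribute names): the processors above only record maps when a `store_attn_map` attribute is
+ # present, so collection can be enabled by flagging every attention processor of a loaded
+ # transformer, e.g. `pipe.transformer`.
+ def _example_enable_attn_map_storage(transformer):
+     for _, module in transformer.named_modules():
+         if hasattr(module, "processor"):
+             module.processor.store_attn_map = True
+     return transformer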