AlanB committed
Commit 64561ea
1 Parent(s): 1758d32

New fixes from ARROW

Files changed (1)
  1. pipeline.py +61 -20
pipeline.py CHANGED
@@ -11,9 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Note:
+# This pipeline relies on a "hack" discovered by the community that allows
+# the generation of videos given an input image with AnimateDiff. It works
+# by creating a copy of the image `num_frames` times and progressively adding
+# more noise to the image based on the strength and latent interpolation method.
 
 import inspect
-from dataclasses import dataclass
 from types import FunctionType
 from typing import Any, Callable, Dict, List, Optional, Union
 
@@ -25,8 +30,8 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
-#from diffusers.models.unet_motion_model import MotionAdapter
 from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
@@ -36,7 +41,7 @@ from diffusers.schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from diffusers.utils.torch_utils import randn_tensor
 
 
@@ -49,9 +54,10 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif, load_image
 
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
-        >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace")
+        >>> pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
 
         >>> image = load_image("snail.png")
         >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp")
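
The note added in the first hunk and the `latent_interpolation_method="slerp"` argument in the example above describe the core trick: the input image's latent is duplicated `num_frames` times and blended with noise, with later frames receiving more noise according to `strength`. The following is a rough, hypothetical illustration of that idea only; the helper names `slerp` and `make_frame_latents` are illustrative and are not part of this pipeline.

import torch


def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, eps: float = 1e-7) -> torch.Tensor:
    # Spherical interpolation between two latents; falls back to lerp when nearly parallel.
    v0_flat, v1_flat = v0.flatten(), v1.flatten()
    dot = torch.clamp(torch.dot(v0_flat / v0_flat.norm(), v1_flat / v1_flat.norm()), -1.0, 1.0)
    theta = torch.acos(dot)
    if theta.abs() < eps:
        return (1.0 - t) * v0 + t * v1
    return (torch.sin((1.0 - t) * theta) * v0 + torch.sin(t * theta) * v1) / torch.sin(theta)


def make_frame_latents(image_latent: torch.Tensor, num_frames: int, strength: float) -> torch.Tensor:
    # image_latent: [C, H, W] VAE latent of the input image. Later frames drift further
    # from the image by blending in progressively more noise, up to `strength`.
    noise = torch.randn_like(image_latent)
    frames = [
        slerp(image_latent, noise, strength * i / max(num_frames - 1, 1))
        for i in range(num_frames)
    ]
    return torch.stack(frames, dim=0)  # [num_frames, C, H, W]

The pipeline implements this inside its latent preparation step; the sketch only conveys the shape of the idea.
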
@@ -226,14 +232,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class AnimateDiffImgToVideoPipelineOutput(BaseOutput):
-    frames: Union[torch.Tensor, np.ndarray]
-
-
 class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
-    Pipeline for text-to-video generation.
+    Pipeline for image-to-video generation.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
@@ -504,6 +505,41 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
 
         return image_embeds, uncond_image_embeds
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
     # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
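
The `prepare_ip_adapter_image_embeds` helper added above encodes one reference image per loaded IP-Adapter and stacks negative and positive embeddings for classifier-free guidance. As a hedged usage sketch only: the IP-Adapter checkpoint, weight file, and `reference.png` below follow the common `h94/IP-Adapter` example and are assumptions, and `pipe` and `image` are assumed to be set up as in the docstring example.

from diffusers.utils import load_image

# Load an IP-Adapter into the pipeline (provided by IPAdapterMixin), then pass a
# reference image at call time; the helper above turns it into image embeddings.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
ip_image = load_image("reference.png")

output = pipe(
    image=image,  # the input frame from the docstring example
    prompt="A snail moving on the ground",
    ip_adapter_image=ip_image,
    strength=0.8,
)
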
@@ -766,6 +802,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -819,6 +856,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                 `np.array`.
@@ -843,8 +883,8 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         Examples:
 
         Returns:
-            [`AnimateDiffImgToVideoPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`AnimateDiffImgToVideoPipelineOutput`] is
+            [`AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is
                 returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
         """
         # 0. Default height and width to unet
@@ -903,12 +943,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
         if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
             )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
 
         # 4. Preprocess image
         image = self.image_processor.preprocess(image, height=height, width=width)
@@ -937,7 +974,11 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 8. Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
 
         # 9. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -971,7 +1012,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                    callback(i, t, latents)
 
        if output_type == "latent":
-           return AnimateDiffImgToVideoPipelineOutput(frames=latents)
+           return AnimateDiffPipelineOutput(frames=latents)
 
        # 10. Post-processing
        video_tensor = self.decode_latents(latents)
@@ -987,4 +1028,4 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         if not return_dict:
             return (video,)
 
-        return AnimateDiffImgToVideoPipelineOutput(frames=video)
+        return AnimateDiffPipelineOutput(frames=video)
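
With the pipeline now returning the shared `AnimateDiffPipelineOutput`, the generated frames can be exported the same way as with the stock AnimateDiff pipeline. A small, hedged sketch completing the docstring example above, assuming `output_type` is left at its default `"pil"`:

from diffusers.utils import export_to_gif

# `output.frames` holds the generated video; with the default "pil" output type the
# first element is the list of PIL frames for the first (and only) prompt.
frames = output.frames[0]
export_to_gif(frames, "animation.gif")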
 