AlanB committed
Commit 64561ea
1 Parent(s): 1758d32

New fixes from ARROW

Files changed (1)
  1. pipeline.py +61 -20
pipeline.py CHANGED
@@ -11,9 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Note:
+# This pipeline relies on a "hack" discovered by the community that allows
+# the generation of videos given an input image with AnimateDiff. It works
+# by creating a copy of the image `num_frames` times and progressively adding
+# more noise to the image based on the strength and latent interpolation method.
 
 import inspect
-from dataclasses import dataclass
 from types import FunctionType
 from typing import Any, Callable, Dict, List, Optional, Union
 
@@ -25,8 +30,8 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
-#from diffusers.models.unet_motion_model import MotionAdapter
 from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
@@ -36,7 +41,7 @@ from diffusers.schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from diffusers.utils.torch_utils import randn_tensor
 
 
@@ -49,9 +54,10 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif, load_image
 
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
-        >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace")
+        >>> pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
 
         >>> image = load_image("snail.png")
         >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp")
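
The note added in the first hunk and the `latent_interpolation_method="slerp"` argument in the example above describe the core trick: the input image's latent is duplicated `num_frames` times and blended with noise, with later frames receiving more noise according to `strength`. The following is a rough, hypothetical illustration of that idea only; the helper names `slerp` and `make_frame_latents` are illustrative and are not part of this pipeline.

import torch


def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, eps: float = 1e-7) -> torch.Tensor:
    # Spherical interpolation between two latents; falls back to lerp when nearly parallel.
    v0_flat, v1_flat = v0.flatten(), v1.flatten()
    dot = torch.clamp(torch.dot(v0_flat / v0_flat.norm(), v1_flat / v1_flat.norm()), -1.0, 1.0)
    theta = torch.acos(dot)
    if theta.abs() < eps:
        return (1.0 - t) * v0 + t * v1
    return (torch.sin((1.0 - t) * theta) * v0 + torch.sin(t * theta) * v1) / torch.sin(theta)


def make_frame_latents(image_latent: torch.Tensor, num_frames: int, strength: float) -> torch.Tensor:
    # image_latent: [C, H, W] VAE latent of the input image. Later frames drift further
    # from the image by blending in progressively more noise, up to `strength`.
    noise = torch.randn_like(image_latent)
    frames = [
        slerp(image_latent, noise, strength * i / max(num_frames - 1, 1))
        for i in range(num_frames)
    ]
    return torch.stack(frames, dim=0)  # [num_frames, C, H, W]

The pipeline implements this inside its latent preparation step; the sketch only conveys the shape of the idea.
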
@@ -226,14 +232,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class AnimateDiffImgToVideoPipelineOutput(BaseOutput):
-    frames: Union[torch.Tensor, np.ndarray]
-
-
 class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
-    Pipeline for text-to-video generation.
+    Pipeline for image-to-video generation.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
@@ -504,6 +505,41 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
 
         return image_embeds, uncond_image_embeds
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
     # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
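
The `prepare_ip_adapter_image_embeds` helper added above encodes one reference image per loaded IP-Adapter and stacks negative and positive embeddings for classifier-free guidance. As a hedged usage sketch only: the IP-Adapter checkpoint, weight file, and `reference.png` below follow the common `h94/IP-Adapter` example and are assumptions, and `pipe` and `image` are assumed to be set up as in the docstring example.

from diffusers.utils import load_image

# Load an IP-Adapter into the pipeline (provided by IPAdapterMixin), then pass a
# reference image at call time; the helper above turns it into image embeddings.
pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
ip_image = load_image("reference.png")

output = pipe(
    image=image,  # the input frame from the docstring example
    prompt="A snail moving on the ground",
    ip_adapter_image=ip_image,
    strength=0.8,
)
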
@@ -766,6 +802,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -819,6 +856,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                 `np.array`.
@@ -843,8 +883,8 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         Examples:
 
         Returns:
-            [`AnimateDiffImgToVideoPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`AnimateDiffImgToVideoPipelineOutput`] is
+            [`AnimateDiffPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is
                 returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
         """
         # 0. Default height and width to unet
@@ -903,12 +943,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
         if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
             )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
 
         # 4. Preprocess image
         image = self.image_processor.preprocess(image, height=height, width=width)
@@ -937,7 +974,11 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 8. Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
 
         # 9. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -971,7 +1012,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                    callback(i, t, latents)
 
        if output_type == "latent":
-           return AnimateDiffImgToVideoPipelineOutput(frames=latents)
+           return AnimateDiffPipelineOutput(frames=latents)
 
        # 10. Post-processing
        video_tensor = self.decode_latents(latents)
@@ -987,4 +1028,4 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         if not return_dict:
             return (video,)
 
-        return AnimateDiffImgToVideoPipelineOutput(frames=video)
+        return AnimateDiffPipelineOutput(frames=video)
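
With the pipeline now returning the shared `AnimateDiffPipelineOutput`, the generated frames can be exported the same way as with the stock AnimateDiff pipeline. A small, hedged sketch completing the docstring example above, assuming `output_type` is left at its default `"pil"`:

from diffusers.utils import export_to_gif

# `output.frames` holds the generated video; with the default "pil" output type the
# first element is the list of PIL frames for the first (and only) prompt.
frames = output.frames[0]
export_to_gif(frames, "animation.gif")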
 