New fixes from ARROW
pipeline.py (+61 -20)
CHANGED
@@ -11,9 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# Note:
+# This pipeline relies on a "hack" discovered by the community that allows
+# the generation of videos given an input image with AnimateDiff. It works
+# by creating a copy of the image `num_frames` times and progressively adding
+# more noise to the image based on the strength and latent interpolation method.
 
 import inspect
-from dataclasses import dataclass
 from types import FunctionType
 from typing import Any, Callable, Dict, List, Optional, Union
 
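Note for reviewers: the comment block added above describes the img2video trick only in prose. For orientation, here is a hedged sketch of the latent preparation it alludes to; this is not code from this file, and the helper names (`slerp`, `make_frame_latents`) are illustrative only:

import torch


def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, dot_threshold: float = 0.9995) -> torch.Tensor:
    # Spherical linear interpolation between two latents; falls back to a plain
    # lerp when the inputs are nearly parallel to avoid numerical issues.
    v0_unit = v0 / torch.norm(v0)
    v1_unit = v1 / torch.norm(v1)
    dot = (v0_unit * v1_unit).sum()
    if dot.abs() > dot_threshold:
        return (1 - t) * v0 + t * v1
    theta_0 = torch.acos(dot)
    sin_theta_0 = torch.sin(theta_0)
    theta_t = theta_0 * t
    return (torch.sin(theta_0 - theta_t) / sin_theta_0) * v0 + (torch.sin(theta_t) / sin_theta_0) * v1


def make_frame_latents(image_latent: torch.Tensor, num_frames: int, method: str = "lerp") -> torch.Tensor:
    # Sketch only: repeat the encoded image latent `num_frames` times and push each
    # copy progressively further toward random noise, so later frames are freer to move.
    noise = torch.randn_like(image_latent)
    frames = []
    for i in range(num_frames):
        t = i / max(num_frames - 1, 1)  # 0.0 for the first frame, 1.0 for the last
        if method == "slerp":
            frames.append(slerp(image_latent, noise, t))
        else:
            frames.append((1 - t) * image_latent + t * noise)
    return torch.stack(frames, dim=2)  # (batch, channels, num_frames, height, width)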
@@ -25,8 +30,8 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
-#from diffusers.models.unet_motion_model import MotionAdapter
 from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (
     DDIMScheduler,
@@ -36,7 +41,7 @@ from diffusers.schedulers import (
     LMSDiscreteScheduler,
     PNDMScheduler,
 )
-from diffusers.utils import USE_PEFT_BACKEND,
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from diffusers.utils.torch_utils import randn_tensor
 
 
@@ -49,9 +54,10 @@ EXAMPLE_DOC_STRING = """
         >>> from diffusers import MotionAdapter, DiffusionPipeline, DDIMScheduler
         >>> from diffusers.utils import export_to_gif, load_image
 
+        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
         >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2")
         >>> pipe = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter, custom_pipeline="pipeline_animatediff_img2video").to("cuda")
-        >>> pipe.scheduler = DDIMScheduler(
+        >>> pipe.scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler", clip_sample=False, timestep_spacing="linspace", beta_schedule="linear", steps_offset=1)
 
         >>> image = load_image("snail.png")
         >>> output = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, latent_interpolation_method="slerp")
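A natural continuation of the docstring example above, saving the result as a GIF. Hedged: `output.frames[0]` assumes the default `"pil"` output type, where `frames` holds one list of PIL images per prompt:

>>> frames = output.frames[0]
>>> export_to_gif(frames, "animation.gif")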
@@ -226,14 +232,9 @@ def retrieve_timesteps(
     return timesteps, num_inference_steps
 
 
-@dataclass
-class AnimateDiffImgToVideoPipelineOutput(BaseOutput):
-    frames: Union[torch.Tensor, np.ndarray]
-
-
 class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
     r"""
-    Pipeline for
+    Pipeline for image-to-video generation.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
     implemented for all pipelines (downloading, saving, running on a particular device, etc.).
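For context, the local `AnimateDiffImgToVideoPipelineOutput` dataclass removed here is superseded by the shared `AnimateDiffPipelineOutput` imported above. Its definition is approximately the following; this is a sketch for orientation only, and `diffusers/pipelines/animatediff/pipeline_output.py` is the authoritative source:

from dataclasses import dataclass
from typing import Union

import numpy as np
import torch

from diffusers.utils import BaseOutput


@dataclass
class AnimateDiffPipelineOutput(BaseOutput):
    # Generated video frames, either a tensor or a numpy array depending on `output_type`.
    frames: Union[torch.Tensor, np.ndarray]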
@@ -504,6 +505,41 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
 
         return image_embeds, uncond_image_embeds
 
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
+    def prepare_ip_adapter_image_embeds(
+        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+    ):
+        if ip_adapter_image_embeds is None:
+            if not isinstance(ip_adapter_image, list):
+                ip_adapter_image = [ip_adapter_image]
+
+            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
+                raise ValueError(
+                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
+                )
+
+            image_embeds = []
+            for single_ip_adapter_image, image_proj_layer in zip(
+                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
+            ):
+                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
+                single_image_embeds, single_negative_image_embeds = self.encode_image(
+                    single_ip_adapter_image, device, 1, output_hidden_state
+                )
+                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
+                single_negative_image_embeds = torch.stack(
+                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
+                )
+
+                if self.do_classifier_free_guidance:
+                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
+                    single_image_embeds = single_image_embeds.to(device)
+
+                image_embeds.append(single_image_embeds)
+        else:
+            image_embeds = ip_adapter_image_embeds
+        return image_embeds
+
     # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
     def decode_latents(self, latents):
         latents = 1 / self.vae.config.scaling_factor * latents
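Hedged usage sketch for the helper added above. It assumes an IP-Adapter has already been loaded on the pipeline via `load_ip_adapter`, and the image file name is illustrative, not part of this diff:

>>> from diffusers.utils import load_image
>>> reference = load_image("ip_adapter_reference.png")
>>> embeds = pipe.prepare_ip_adapter_image_embeds(
...     ip_adapter_image=[reference],
...     ip_adapter_image_embeds=None,
...     device=pipe.device,
...     num_images_per_prompt=1,
... )
>>> # `embeds` can later be passed back as `ip_adapter_image_embeds` to skip re-encoding.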
@@ -766,6 +802,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
         ip_adapter_image: Optional[PipelineImageInput] = None,
+        ip_adapter_image_embeds: Optional[PipelineImageInput] = None,
         output_type: Optional[str] = "pil",
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -819,6 +856,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*):
                 Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                 `np.array`.
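A hedged end-to-end example of the new call-time inputs. The IP-Adapter checkpoint coordinates below follow the usual diffusers documentation and are an assumption, not part of this diff; the style image path is illustrative:

>>> pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
>>> style = load_image("style_reference.png")
>>> output = pipe(
...     image=image,
...     prompt="A snail moving on the ground",
...     ip_adapter_image=style,
...     strength=0.8,
...     latent_interpolation_method="slerp",
... )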
@@ -843,8 +883,8 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         Examples:
 
         Returns:
-            [`
-            If `return_dict` is `True`, [`
+            [`AnimateDiffPipelineOutput`] or `tuple`:
+            If `return_dict` is `True`, [`AnimateDiffPipelineOutput`] is
             returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
         """
         # 0. Default height and width to unet
@@ -903,12 +943,9 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
             prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
 
         if ip_adapter_image is not None:
-
-
-                ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
-            )
-            if do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
+            image_embeds = self.prepare_ip_adapter_image_embeds(
+                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
+            )
 
         # 4. Preprocess image
         image = self.image_processor.preprocess(image, height=height, width=width)
@@ -937,7 +974,11 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
         # 8. Add image embeds for IP-Adapter
-        added_cond_kwargs =
+        added_cond_kwargs = (
+            {"image_embeds": image_embeds}
+            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
+            else None
+        )
 
         # 9. Denoising loop
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
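For reviewers unfamiliar with the plumbing: `added_cond_kwargs` is forwarded to the UNet inside the denoising loop (not shown in this diff), roughly as in the standard diffusers pattern sketched below. This is a hedged sketch, not lines from this file, and the surrounding variable names (`latent_model_input`, `cross_attention_kwargs`) are assumed from that pattern:

# Inside the denoising loop (sketch): the IP-Adapter image embeddings ride along
# with every UNet forward pass via `added_cond_kwargs`.
noise_pred = self.unet(
    latent_model_input,
    t,
    encoder_hidden_states=prompt_embeds,
    cross_attention_kwargs=cross_attention_kwargs,
    added_cond_kwargs=added_cond_kwargs,
).sample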
@@ -971,7 +1012,7 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
                     callback(i, t, latents)
 
         if output_type == "latent":
-            return
+            return AnimateDiffPipelineOutput(frames=latents)
 
         # 10. Post-processing
         video_tensor = self.decode_latents(latents)
@@ -987,4 +1028,4 @@ class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMix
         if not return_dict:
             return (video,)
 
-        return
+        return AnimateDiffPipelineOutput(frames=video)
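With these changes the pipeline returns the shared output type on both the latent early-exit and the normal path. A hedged usage note on consuming it, based only on the return statements in this diff:

>>> result = pipe(image=image, prompt="A snail moving on the ground", strength=0.8)
>>> video = result.frames  # attribute access on AnimateDiffPipelineOutput
>>> (video,) = pipe(image=image, prompt="A snail moving on the ground", strength=0.8, return_dict=False)  # plain tuple instead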