diff --git a/diffusers/tests/__init__.py b/diffusers/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/conftest.py b/diffusers/tests/conftest.py deleted file mode 100644 index 6a02a38163ab01b1c2d0d12d5578e06d91b77cc8..0000000000000000000000000000000000000000 --- a/diffusers/tests/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# tests directory-specific settings - this file is run automatically -# by pytest before any tests are run - -import sys -import warnings -from os.path import abspath, dirname, join - - -# allow having multiple repository checkouts and not needing to remember to rerun -# 'pip install -e .[dev]' when switching between checkouts and running tests. -git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) -sys.path.insert(1, git_repo_path) - -# silence FutureWarning warnings in tests since often we can't act on them until -# they become normal warnings - i.e. the tests still need to test the current functionality -warnings.simplefilter(action="ignore", category=FutureWarning) - - -def pytest_addoption(parser): - from diffusers.utils.testing_utils import pytest_addoption_shared - - pytest_addoption_shared(parser) - - -def pytest_terminal_summary(terminalreporter): - from diffusers.utils.testing_utils import pytest_terminal_summary_main - - make_reports = terminalreporter.config.getoption("--make-reports") - if make_reports: - pytest_terminal_summary_main(terminalreporter, id=make_reports) diff --git a/diffusers/tests/fixtures/custom_pipeline/pipeline.py b/diffusers/tests/fixtures/custom_pipeline/pipeline.py deleted file mode 100644 index 9119ae30f42f58aab8a52f303c1879e4b3803468..0000000000000000000000000000000000000000 --- a/diffusers/tests/fixtures/custom_pipeline/pipeline.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import torch - -from diffusers import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[torch.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`torch.Generator`, *optional*): - A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation - deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), - generator=generator, - ) - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/diffusers/tests/fixtures/custom_pipeline/what_ever.py b/diffusers/tests/fixtures/custom_pipeline/what_ever.py deleted file mode 100644 index a8af08d3980a6e9dbd5af240792edf013cef7313..0000000000000000000000000000000000000000 --- a/diffusers/tests/fixtures/custom_pipeline/what_ever.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import torch - -from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[torch.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`torch.Generator`, *optional*): - A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation - deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), - generator=generator, - ) - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2.
predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/diffusers/tests/fixtures/elise_format0.mid b/diffusers/tests/fixtures/elise_format0.mid deleted file mode 100644 index 33dbabe7ab1d4d28e43d9911255a510a8a672d77..0000000000000000000000000000000000000000 Binary files a/diffusers/tests/fixtures/elise_format0.mid and /dev/null differ diff --git a/diffusers/tests/models/__init__.py b/diffusers/tests/models/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/models/test_models_unet_1d.py b/diffusers/tests/models/test_models_unet_1d.py deleted file mode 100644 index b814f5f88a302c7c0bdc869ab7674c5657eee775..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_1d.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import torch - -from diffusers import UNet1DModel -from diffusers.utils import floats_tensor, slow, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class UNet1DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - - noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device) - time_step = torch.tensor([10] * batch_size).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 14, 16) - - @property - def output_shape(self): - return (4, 14, 16) - - def test_ema_training(self): - pass - - def test_training(self): - pass - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_determinism(self): - super().test_determinism() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained_variant(self): - super().test_from_save_pretrained_variant() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output(self): - super().test_output() - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64, 128, 256), - "in_channels": 14, - "out_channels": 14, - "time_embedding_type": "positional", - "use_timestep_embedding": True, - "flip_sin_to_cos": False, - "freq_shift": 1.0, - "out_block_type": "OutConv1DBlock", - "mid_block_type": "MidResTemporalBlock1D", - "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"), - "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_pretrained_hub(self): - model, loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet" - ) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output_pretrained(self): - model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_features = model.in_channels - seq_len = 16 - noise = torch.randn((1, seq_len, num_features)).permute( - 0, 2, 1 - ) # match original, we can update values and remove - time_step = torch.full((num_features,), 0) - - with torch.no_grad(): - output = model(noise, time_step).sample.permute(0, 2, 1) - - output_slice = output[0, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-2.137172, 
1.1426016, 0.3688687, -0.766922, 0.7303146, 0.11038864, -0.4760633, 0.13270172, 0.02591348]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3)) - - def test_forward_with_norm_groups(self): - # Not implemented yet for this UNet - pass - - @slow - def test_unet_1d_maestro(self): - model_id = "harmonai/maestro-150k" - model = UNet1DModel.from_pretrained(model_id, subfolder="unet") - model.to(torch_device) - - sample_size = 65536 - noise = torch.sin(torch.arange(sample_size)[None, None, :].repeat(1, 2, 1)).to(torch_device) - timestep = torch.tensor([1]).to(torch_device) - - with torch.no_grad(): - output = model(noise, timestep).sample - - output_sum = output.abs().sum() - output_max = output.abs().max() - - assert (output_sum - 224.0896).abs() < 4e-2 - assert (output_max - 0.0607).abs() < 4e-4 - - -class UNetRLModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - - noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device) - time_step = torch.tensor([10] * batch_size).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 14, 16) - - @property - def output_shape(self): - return (4, 14, 1) - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_determinism(self): - super().test_determinism() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained_variant(self): - super().test_from_save_pretrained_variant() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output(self): - # UNetRL is a value function, so the output shape is different - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = torch.Size((inputs_dict["sample"].shape[0], 1)) - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_ema_training(self): - pass - - def test_training(self): - pass - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 14, - "out_channels": 14, - "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"], - "up_block_types": [], - "out_block_type": "ValueFunction", - "mid_block_type": "ValueFunctionMidBlock1D", - "block_out_channels": [32, 64, 128, 256], - "layers_per_block": 1, - "downsample_each_block": True, - "use_timestep_embedding": True, - "freq_shift": 1.0, - "flip_sin_to_cos": False, - "time_embedding_type": "positional", - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def
test_from_pretrained_hub(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - self.assertIsNotNone(value_function) - self.assertEqual(len(vf_loading_info["missing_keys"]), 0) - - value_function.to(torch_device) - image = value_function(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output_pretrained(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_features = value_function.in_channels - seq_len = 14 - noise = torch.randn((1, seq_len, num_features)).permute( - 0, 2, 1 - ) # match original, we can update values and remove - time_step = torch.full((num_features,), 0) - - with torch.no_grad(): - output = value_function(noise, time_step).sample - - # fmt: off - expected_output_slice = torch.tensor([165.25] * seq_len) - # fmt: on - self.assertTrue(torch.allclose(output, expected_output_slice, rtol=1e-3)) - - def test_forward_with_norm_groups(self): - # Not implemented yet for this UNet - pass diff --git a/diffusers/tests/models/test_models_unet_2d.py b/diffusers/tests/models/test_models_unet_2d.py deleted file mode 100644 index 8f831fcf7cbfb298c5e4deb489cc0edae1f76a51..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d.py +++ /dev/null @@ -1,297 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import math -import unittest - -import torch - -from diffusers import UNet2DModel -from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -class Unet2DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("DownBlock2D", "AttnDownBlock2D"), - "up_block_types": ("AttnUpBlock2D", "UpBlock2D"), - "attention_head_dim": None, - "out_channels": 3, - "in_channels": 3, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - -class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 32, 32) - - @property - def output_shape(self): - return (4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "sample_size": 32, - "in_channels": 4, - "out_channels": 4, - "layers_per_block": 2, - "block_out_channels": (32, 64), - "attention_head_dim": 32, - "down_block_types": ("DownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "UpBlock2D"), - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input).sample - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") - def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - model.to(torch_device) - image = model(**self.dummy_input).sample - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") - def test_from_pretrained_accelerate_wont_change_results(self): - # by default model loading will use accelerate as `low_cpu_mem_usage=True` - model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - model_accelerate.to(torch_device) - model_accelerate.eval() - - noise = torch.randn( - 1, - model_accelerate.config.in_channels, - model_accelerate.config.sample_size, - model_accelerate.config.sample_size, - generator=torch.manual_seed(0), - ) - noise = noise.to(torch_device) - time_step = torch.tensor([10] * noise.shape[0]).to(torch_device) - - arr_accelerate =
model_accelerate(noise, time_step)["sample"] - - # the two models don't need to stay on the device at the same time - del model_accelerate - torch.cuda.empty_cache() - gc.collect() - - model_normal_load, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False - ) - model_normal_load.to(torch_device) - model_normal_load.eval() - arr_normal_load = model_normal_load(noise, time_step)["sample"] - - assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3) - - def test_output_pretrained(self): - model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") - model.eval() - model.to(torch_device) - - noise = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - noise = noise.to(torch_device) - time_step = torch.tensor([10] * noise.shape[0]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3)) - - -class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [10]).to(dtype=torch.int32, device=torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64, 64, 64], - "in_channels": 3, - "layers_per_block": 1, - "out_channels": 3, - "time_embedding_type": "fourier", - "norm_eps": 1e-6, - "mid_block_scale_factor": math.sqrt(2.0), - "norm_num_groups": None, - "down_block_types": [ - "SkipDownBlock2D", - "AttnSkipDownBlock2D", - "SkipDownBlock2D", - "SkipDownBlock2D", - ], - "up_block_types": [ - "SkipUpBlock2D", - "SkipUpBlock2D", - "AttnSkipUpBlock2D", - "SkipUpBlock2D", - ], - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @slow - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - inputs = self.dummy_input - noise = floats_tensor((4, 3) + (256, 256)).to(torch_device) - inputs["sample"] = noise - image = model(**inputs) - - assert image is not None, "Make sure output is not None" - - @slow - def test_output_pretrained_ve_mid(self): - model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (256, 256) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253,
-10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - def test_forward_with_norm_groups(self): - # not required for this model - pass diff --git a/diffusers/tests/models/test_models_unet_2d_condition.py b/diffusers/tests/models/test_models_unet_2d_condition.py deleted file mode 100644 index c0cb9d3d8ebde3322c89dbec44f58f16985f6243..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d_condition.py +++ /dev/null @@ -1,944 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import os -import tempfile -import unittest - -import torch -from parameterized import parameterized - -from diffusers import UNet2DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor -from diffusers.utils import ( - floats_tensor, - load_hf_numpy, - logging, - require_torch_gpu, - slow, - torch_all_close, - torch_device, -) -from diffusers.utils.import_utils import is_xformers_available - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -def create_lora_layers(model): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - return lora_attn_procs - - -class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device) - - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return (4, 32, 32) - - @property - def output_shape(self): - return (4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - model.enable_xformers_memory_efficient_attention() - - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_gradient_checkpointing(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - 
model.to(torch_device) - - assert not model.is_gradient_checkpointing and model.training - - out = model(**inputs_dict).sample - # run the backwards pass on the model. For simplicity, instead of a real loss - # we backprop on the mean difference from random targets - model.zero_grad() - - labels = torch.randn_like(out) - loss = (out - labels).mean() - loss.backward() - - # re-instantiate the model now enabling gradient checkpointing - model_2 = self.model_class(**init_dict) - # clone model - model_2.load_state_dict(model.state_dict()) - model_2.to(torch_device) - model_2.enable_gradient_checkpointing() - - assert model_2.is_gradient_checkpointing and model_2.training - - out_2 = model_2(**inputs_dict).sample - # run the backwards pass on the model. For simplicity, instead of a real loss - # we backprop on the mean difference from random targets - model_2.zero_grad() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - - # compare the output and parameters gradients - self.assertTrue((loss - loss_2).abs() < 1e-5) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - for name, param in named_params.items(): - self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5)) - - def test_model_with_attention_head_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_use_linear_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["use_linear_projection"] = True - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_cross_attention_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["cross_attention_dim"] = (32, 32) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_simple_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output)
- expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_class_embeddings_concat(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - init_dict["class_embeddings_concat"] = True - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - model.set_attention_slice("auto") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice("max") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice(2) - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - def test_model_sliceable_head_dim(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - - def check_sliceable_dim_attr(module: torch.nn.Module): - if hasattr(module, "set_attention_slice"): - assert isinstance(module.sliceable_head_dim, int) - - for child in module.children(): - check_sliceable_dim_attr(child) - - # retrieve number of attention layers - for module in model.children(): - check_sliceable_dim_attr(module) - - def test_special_attn_proc(self): - class AttnEasyProc(torch.nn.Module): - def __init__(self, num): - super().__init__() - self.weight = torch.nn.Parameter(torch.tensor(num)) - self.is_run = False - self.number = 0 - self.counter = 0 - - def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, number=None): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - query = attn.to_q(hidden_states) - - encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - hidden_states += self.weight - - self.is_run = True - self.counter += 1 - self.number = number - - return hidden_states - - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 
(8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - - processor = AttnEasyProc(5.0) - - model.set_attn_processor(processor) - model(**inputs_dict, cross_attention_kwargs={"number": 123}).sample - - assert processor.counter == 12 - assert processor.is_run - assert processor.number == 123 - - def test_lora_processors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - sample1 = model(**inputs_dict).sample - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - # make sure we can set a list of attention processors - model.set_attn_processor(lora_attn_procs) - model.to(torch_device) - - # test that attn processors can be set to itself - model.set_attn_processor(model.attn_processors) - - with torch.no_grad(): - sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample1 - sample2).abs().max() < 1e-4 - assert (sample3 - sample4).abs().max() < 1e-4 - - # sample 2 and sample 3 should be different - assert (sample2 - sample3).abs().max() > 1e-4 - - def test_lora_save_load(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname) - - with torch.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_load_safetensors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, 
inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, safe_serialization=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname) - - with torch.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_safetensors_load_torch(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - model.set_attn_processor(lora_attn_procs) - # Saving in torch format; it reloads properly when given the filename directly - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname, weight_name="pytorch_lora_weights.bin") - - def
test_lora_save_torch_force_load_safetensors_error(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - model.set_attn_processor(lora_attn_procs) - # Saving in torch format; forcing a safetensors load must then raise an error - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - with self.assertRaises(IOError) as e: - new_model.load_attn_procs(tmpdirname, use_safetensors=True) - self.assertIn("Error no file named pytorch_lora_weights.safetensors", str(e.exception)) - - def test_lora_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - - model.set_default_attn_processor() - - with torch.no_grad(): - new_sample = model(**inputs_dict).sample - - assert (sample - new_sample).abs().max() < 1e-4 - assert (sample - old_sample).abs().max() < 1e-4 - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_lora_xformers_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - # default - with torch.no_grad(): - sample = model(**inputs_dict).sample - - model.enable_xformers_memory_efficient_attention() - on_sample = model(**inputs_dict).sample - - model.disable_xformers_memory_efficient_attention() - off_sample = model(**inputs_dict).sample - - assert (sample - on_sample).abs().max() < 1e-4 - assert (sample - off_sample).abs().max() < 1e-4 - - -@slow -class UNet2DConditionModelIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def
tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = UNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision - ) - model.to(torch_device).eval() - - return model - - def test_set_attention_slice_auto(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice("auto") - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_max(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice("max") - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_int(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice(2) - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_list(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - # there are 32 sliceable layers - slice_list = 16 * [2, 3] - unet = self.get_unet_model() - unet.set_attention_slice(slice_list) - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - hidden_states = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return hidden_states - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.4424, 0.1510, -0.1937, 0.2118, 0.3746, -0.3957, 0.0160, -0.0435]], - [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.2540, 0.3633, -0.0821, 0.1719, -0.0207]], - [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]], - [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]], - # fmt: on - ] 
- ) - @require_torch_gpu - def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]], - [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]], - [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]], - [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.4430, 0.1570, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]], - [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.3430, -0.0536, 0.2114, -0.0436]], - [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.1750]], - [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]], - [17, 0.55, [-0.1290, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]], - [8, 0.89, [-0.5283, 0.1198, 0.0870, -0.1141, 0.9189, -0.0150, 0.5474, 0.4319]], - [3, 1000, [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.0280, -1.0020]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True) - latents = self.get_latents(seed, fp16=True) - 
encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]], - [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]], - [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.4120, -0.5621, -0.0397, -1.0430]], - [9, 1000, [0.1600, 0.7303, -1.0556, -0.3515, -0.7440, -1.2037, -1.8149, -1.8931]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_inpaint(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting") - latents = self.get_latents(seed, shape=(4, 9, 64, 64)) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == (4, 4, 64, 64) - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]], - [17, 0.55, [0.0975, -0.2856, -0.3508, -0.4600, 0.3376, 0.2930, -0.2747, -0.7026]], - [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]], - [3, 1000, [0.4790, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True) - latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == (4, 4, 64, 64) - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]], - [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]], - [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]], - [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) - latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - - 
timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) diff --git a/diffusers/tests/models/test_models_unet_2d_flax.py b/diffusers/tests/models/test_models_unet_2d_flax.py deleted file mode 100644 index 69a0704dca9dae32a7d612b82cbedc0454a0a1b5..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d_flax.py +++ /dev/null @@ -1,104 +0,0 @@ -import gc -import unittest - -from parameterized import parameterized - -from diffusers import FlaxUNet2DConditionModel -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import load_hf_numpy, require_flax, slow - - -if is_flax_available(): - import jax - import jax.numpy as jnp - - -@slow -@require_flax -class FlaxUNet2DConditionModelIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - image = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype) - return image - - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - revision = "bf16" if fp16 else None - - model, params = FlaxUNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", dtype=dtype, revision=revision - ) - return model, params - - def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - hidden_states = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype) - return hidden_states - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]], - [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]], - [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]], - [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]], - # fmt: on - ] - ) - def test_compvis_sd_v1_4_flax_vs_torch_fp16(self, seed, timestep, expected_slice): - model, params = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - sample = model.apply( - {"params": params}, - latents, - jnp.array(timestep, dtype=jnp.int32), - encoder_hidden_states=encoder_hidden_states, - ).sample - - assert sample.shape == latents.shape - - output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32) - expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32) - - # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, in the same hardware - assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]], - [17, 0.55, [0.1164, 
-0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]], - [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]], - [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]], - # fmt: on - ] - ) - def test_stabilityai_sd_v2_flax_vs_torch_fp16(self, seed, timestep, expected_slice): - model, params = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) - latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - - sample = model.apply( - {"params": params}, - latents, - jnp.array(timestep, dtype=jnp.int32), - encoder_hidden_states=encoder_hidden_states, - ).sample - - assert sample.shape == latents.shape - - output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32) - expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32) - - # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, on the same hardware - assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2) diff --git a/diffusers/tests/models/test_models_unet_3d_condition.py b/diffusers/tests/models/test_models_unet_3d_condition.py deleted file mode 100644 index 5a0d74a3ea5ad6956e791029d4a3be2528ca4d28..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_3d_condition.py +++ /dev/null @@ -1,241 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
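For reference, the Flax-vs-Torch equivalence checks above boil down to one call pattern. This is a minimal sketch that substitutes zero tensors for the `load_hf_numpy` fixtures; the checkpoint, revision, dtype, and shapes are taken directly from the tests:

    import jax.numpy as jnp
    from diffusers import FlaxUNet2DConditionModel

    # load the bf16 weights, as get_unet_model() does when fp16=True
    model, params = FlaxUNet2DConditionModel.from_pretrained(
        "CompVis/stable-diffusion-v1-4", subfolder="unet", dtype=jnp.bfloat16, revision="bf16"
    )

    # stand-ins for the gaussian-noise .npy fixtures the tests download
    latents = jnp.zeros((4, 4, 64, 64), dtype=jnp.bfloat16)
    encoder_hidden_states = jnp.zeros((4, 77, 768), dtype=jnp.bfloat16)

    # a Flax module is applied with explicit params rather than called directly
    sample = model.apply(
        {"params": params},
        latents,
        jnp.array(4, dtype=jnp.int32),  # timestep
        encoder_hidden_states=encoder_hidden_states,
    ).sample
    assert sample.shape == latents.shape
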
- -import unittest - -import numpy as np -import torch - -from diffusers.models import ModelMixin, UNet3DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor -from diffusers.utils import ( - floats_tensor, - logging, - skip_mps, - torch_device, -) -from diffusers.utils.import_utils import is_xformers_available - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -def create_lora_layers(model): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - return lora_attn_procs - - -@skip_mps -class UNet3DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet3DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - num_frames = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device) - - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return (4, 4, 32, 32) - - @property - def output_shape(self): - return (4, 4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ( - "CrossAttnDownBlock3D", - "DownBlock3D", - ), - "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 1, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - model.enable_xformers_memory_efficient_attention() - - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - # Overriding to set `norm_num_groups` needs to be different for this model. 
- def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 32 - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - # Overriding since the UNet3D outputs a different structure. - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - # Warmup pass when using mps (see #372) - if torch_device == "mps" and isinstance(model, ModelMixin): - model(**self.dummy_input) - - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - model.set_attention_slice("auto") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice("max") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice(2) - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - # (`attn_processors`) needs to be implemented in this model for this test. - # def test_lora_processors(self): - - # (`attn_processors`) needs to be implemented in this model for this test. - # def test_lora_save_load(self): - - # (`attn_processors`) needs to be implemented for this test in the model. - # def test_lora_save_load_safetensors(self): - - # (`attn_processors`) needs to be implemented for this test in the model. - # def test_lora_save_safetensors_load_torch(self): - - # (`attn_processors`) needs to be implemented for this test. - # def test_lora_save_torch_force_load_safetensors_error(self): - - # (`attn_processors`) needs to be added for this test. 
-    # def test_lora_on_off(self):
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_lora_xformers_on_off(self):
-        # fix the seed so the xformers on/off comparison below is deterministic
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 4
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        # default
-        with torch.no_grad():
-            sample = model(**inputs_dict).sample
-
-            model.enable_xformers_memory_efficient_attention()
-            on_sample = model(**inputs_dict).sample
-
-            model.disable_xformers_memory_efficient_attention()
-            off_sample = model(**inputs_dict).sample
-
-        assert (sample - on_sample).abs().max() < 1e-4
-        assert (sample - off_sample).abs().max() < 1e-4
-
-
-# (todo: sayakpaul) implement SLOW tests.
diff --git a/diffusers/tests/models/test_models_vae.py b/diffusers/tests/models/test_models_vae.py
deleted file mode 100644
index abd4a078e6922f8454bd6b3b7f8a35b53a834d80..0000000000000000000000000000000000000000
--- a/diffusers/tests/models/test_models_vae.py
+++ /dev/null
@@ -1,345 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
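One detail worth spelling out about `create_lora_layers` in the UNet3D tests above: diffusers zero-initializes each LoRA up-projection, so a freshly built processor leaves attention outputs unchanged and the on/off comparison would pass vacuously. The `+= 1` mock forces the LoRA branch to actually contribute. A small sketch of that property (the hidden sizes here are illustrative, not from the tests):

    import torch
    from diffusers.models.attention_processor import LoRAAttnProcessor

    proc = LoRAAttnProcessor(hidden_size=64, cross_attention_dim=32)
    assert torch.all(proc.to_q_lora.up.weight == 0)  # a fresh LoRA layer is a no-op

    with torch.no_grad():
        proc.to_q_lora.up.weight += 1  # now the LoRA path changes the attention output
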
- -import gc -import unittest - -import torch -from parameterized import parameterized - -from diffusers import AutoencoderKL -from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_gradient_checkpointing(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - - assert not model.is_gradient_checkpointing and model.training - - out = model(**inputs_dict).sample - # run the backwards pass on the model. For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model.zero_grad() - - labels = torch.randn_like(out) - loss = (out - labels).mean() - loss.backward() - - # re-instantiate the model now enabling gradient checkpointing - model_2 = self.model_class(**init_dict) - # clone model - model_2.load_state_dict(model.state_dict()) - model_2.to(torch_device) - model_2.enable_gradient_checkpointing() - - assert model_2.is_gradient_checkpointing and model_2.training - - out_2 = model_2(**inputs_dict).sample - # run the backwards pass on the model. 
For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model_2.zero_grad() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - - # compare the output and parameters gradients - self.assertTrue((loss - loss_2).abs() < 1e-5) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - for name, param in named_params.items(): - self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5)) - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") - model = model.to(torch_device) - model.eval() - - if torch_device == "mps": - generator = torch.manual_seed(0) - else: - generator = torch.Generator(device=torch_device).manual_seed(0) - - image = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image, sample_posterior=True, generator=generator).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - - # Since the VAE Gaussian prior's generator is seeded on the appropriate device, - # the expected output slices are not the same for CPU and GPU. - if torch_device == "mps": - expected_output_slice = torch.tensor( - [ - -4.0078e-01, - -3.8323e-04, - -1.2681e-01, - -1.1462e-01, - 2.0095e-01, - 1.0893e-01, - -8.8247e-02, - -3.0361e-01, - -9.8644e-03, - ] - ) - elif torch_device == "cpu": - expected_output_slice = torch.tensor( - [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026] - ) - else: - expected_output_slice = torch.tensor( - [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485] - ) - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - -@slow -class AutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device).eval() - - return model - - def get_generator(self, seed=0): - if torch_device == "mps": - return torch.manual_seed(seed) - return torch.Generator(device=torch_device).manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]], - [47, [-0.2376, 0.1168, 
0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], - [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - image = self.get_sd_image(seed, fp16=True) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]], - [47, [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], - [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], - [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_decode_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample 
- - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 1e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/diffusers/tests/models/test_models_vae_flax.py b/diffusers/tests/models/test_models_vae_flax.py deleted file mode 100644 index 8fedb85eccfc73e9a0900f7bb947887da3ffe4e9..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_vae_flax.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest - -from diffusers import FlaxAutoencoderKL -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - -from ..test_modeling_common_flax import FlaxModelTesterMixin - - -if is_flax_available(): - import jax - - -@require_flax -class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase): - model_class = FlaxAutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - prng_key = jax.random.PRNGKey(0) - image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes)) - - return {"sample": image, "prng_key": prng_key} - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict diff --git a/diffusers/tests/models/test_models_vq.py b/diffusers/tests/models/test_models_vq.py deleted file mode 100644 index 66c33e07371e066bad3f0465ab923d67b79b4f52..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_vq.py +++ /dev/null @@ -1,94 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
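The `test_stable_diffusion_encode_sample` assertion above encodes the Stable Diffusion VAE's shape contract: 4 latent channels at one-eighth the spatial resolution, i.e. `[batch, 4] + [i // 8 for i in image.shape[2:]]`. A minimal sketch of that contract, assuming the same `CompVis/stable-diffusion-v1-4` VAE the helper loads:

    import torch
    from diffusers import AutoencoderKL

    vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").eval()

    image = torch.randn(1, 3, 512, 512)
    with torch.no_grad():
        latents = vae.encode(image).latent_dist.sample()

    assert latents.shape == (1, 4, 64, 64)  # 4 channels, H/8 x W/8
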
- -import unittest - -import torch - -from diffusers import VQModel -from diffusers.utils import floats_tensor, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VQModelTests(ModelTesterMixin, unittest.TestCase): - model_class = VQModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 3, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = VQModel.from_pretrained("fusing/vqgan-dummy") - model.to(torch_device).eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) diff --git a/diffusers/tests/pipeline_params.py b/diffusers/tests/pipeline_params.py deleted file mode 100644 index a0ac6c641c0bafef0f770409e9b75ec0aee013c1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipeline_params.py +++ /dev/null @@ -1,121 +0,0 @@ -# These are canonical sets of parameters for different types of pipelines. -# They are set on subclasses of `PipelineTesterMixin` as `params` and -# `batch_params`. -# -# If a pipeline's set of arguments has minor changes from one of the common sets -# of arguments, do not make modifications to the existing common sets of arguments. -# I.e. a text to image pipeline with non-configurable height and width arguments -# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. 
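For instance, a hypothetical tester for a text-to-image pipeline with a fixed output resolution would subtract the unsupported keys rather than edit the common set. The class and pipeline names below are illustrative only; the imports are the relative ones the pipeline test modules actually use:

    import unittest

    from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
    from ...test_pipelines_common import PipelineTesterMixin


    class FixedSizePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        pipeline_class = FixedSizePipeline  # hypothetical pipeline without height/width args
        params = TEXT_TO_IMAGE_PARAMS - {"height", "width"}
        batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
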
- -TEXT_TO_IMAGE_PARAMS = frozenset( - [ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) - -IMAGE_VARIATION_PARAMS = frozenset( - [ - "image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"]) - -TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset( - [ - "prompt", - "image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"]) - -TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - # Text guided image variation with an image mask - "prompt", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"]) - -IMAGE_INPAINTING_PARAMS = frozenset( - [ - # image variation with an image mask - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"]) - -IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - "example_image", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"]) - -UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([]) - -UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) - -TEXT_TO_AUDIO_PARAMS = frozenset( - [ - "prompt", - "audio_length_in_s", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) - -TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"]) diff --git a/diffusers/tests/pipelines/__init__.py b/diffusers/tests/pipelines/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/altdiffusion/__init__.py b/diffusers/tests/pipelines/altdiffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py deleted file mode 100644 index faa56e18f74835a6f1fa2f63717fc9ba5c0a7e29..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ /dev/null @@ -1,244 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer - -from diffusers import AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AltDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - - # TODO: address the non-deterministic text encoder (fails for save-load tests) - # torch.manual_seed(0) - # text_encoder_config = RobertaSeriesConfig( - # hidden_size=32, - # project_dim=32, - # intermediate_size=37, - # layer_norm_eps=1e-05, - # num_attention_heads=4, - # num_hidden_layers=5, - # vocab_size=5002, - # ) - # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5002, - ) - text_encoder = CLIPTextModel(text_encoder_config) - - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def 
test_alt_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = "A photo of an astronaut" - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class AltDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_alt_diffusion(self): - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1010, 0.0800, 0.0794, 0.0885, 0.0843, 0.0762, 0.0769, 0.0729, 0.0586]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") - - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - 
alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - - output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4019, 0.4052, 0.3810, 0.4119, 0.3916, 0.3982, 0.4651, 0.4195, 0.5323]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py deleted file mode 100644 index 9396329434059db279d7b276af0301905fbc49cc..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ /dev/null @@ -1,299 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import XLMRobertaTokenizer - -from diffusers import ( - AltDiffusionImg2ImgPipeline, - AutoencoderKL, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5006, - ) - return RobertaSeriesModelWithTransformation(config) - - 
@property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_img2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(device) - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_fp16(self): - """Test that stable diffusion img2img works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(torch_device) - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = alt_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - ).images - - assert image.shape == (1, 32, 32, 3) - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - # resize to resolution that is divisible by 8 but not 16 or 32 - init_image = init_image.resize((760, 504)) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 760, 3) - expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_img2img_pipeline_default(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" - ) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 768, 3) - # img2img is flaky across GPUs even in fp32, so using MAE here - assert np.abs(expected_image - image).max() < 1e-3 diff --git a/diffusers/tests/pipelines/audio_diffusion/__init__.py b/diffusers/tests/pipelines/audio_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py deleted file mode 100644 index ba389d9c936df1d096a54b02d332cfa8ac520901..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import ( - AudioDiffusionPipeline, - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - DiffusionPipeline, - Mel, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - sample_size=(32, 64), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return model - - @property - def dummy_unet_condition(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, - ) - return model - - @property - def dummy_vqvae_and_unet(self): - torch.manual_seed(0) - vqvae = AutoencoderKL( - sample_size=(128, 64), - in_channels=1, - out_channels=1, - latent_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - ) - unet = UNet2DModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return vqvae, unet - - @slow - def test_audio_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - mel = Mel() - - scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4) - audio = output.audios[0] - image = output.images[0] - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4, return_dict=False) - image_from_tuple = output[0][0] - - assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length) - assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0 - - scheduler = DDIMScheduler() - dummy_vqvae_and_unet = self.dummy_vqvae_and_unet - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - np.random.seed(0) - raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,)) - generator = 
torch.Generator(device=device).manual_seed(42) - output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) - image = output.images[0] - - assert ( - image.height == self.dummy_vqvae_and_unet[0].sample_size[0] - and image.width == self.dummy_vqvae_and_unet[0].sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - dummy_unet_condition = self.dummy_unet_condition - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler - ) - - np.random.seed(0) - encoding = torch.rand((1, 1, 10)) - output = pipe(generator=generator, encoding=encoding) - image = output.images[0] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - -@slow -@require_torch_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_audio_diffusion(self): - device = torch_device - - pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator) - audio = output.audios[0] - image = output.images[0] - - assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length) - assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 diff --git a/diffusers/tests/pipelines/audioldm/__init__.py b/diffusers/tests/pipelines/audioldm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/audioldm/test_audioldm.py b/diffusers/tests/pipelines/audioldm/test_audioldm.py deleted file mode 100644 index 10de5440eb007ae4cfc57953ea943eeee3500340..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/audioldm/test_audioldm.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
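The AudioLDM tests that follow cover a text-to-audio pipeline: prompt embedding (with optional L2-normalized prompt_embeds), negative prompts, multiple waveforms per prompt, and output length control. A minimal usage sketch, assuming the cvssp/audioldm checkpoint from the slow tests; the audio_length_in_s value is an assumption chosen to match the 81920-sample outputs asserted below at the vocoder's 16 kHz sampling rate:

import torch
from diffusers import AudioLDMPipeline

pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to("cuda")
audio = pipe(
    prompt="A hammer hitting a wooden surface",
    negative_prompt="egg cracking",  # negative prompt borrowed from the fast tests
    num_inference_steps=25,
    guidance_scale=2.5,
    audio_length_in_s=5.12,  # assumption: 5.12 s * 16000 Hz = 81920 samples
).audios[0]  # 1-D numpy waveform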
- - -import gc -import unittest - -import numpy as np -import torch -import torch.nn.functional as F -from transformers import ( - ClapTextConfig, - ClapTextModelWithProjection, - RobertaTokenizer, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) - -from diffusers import ( - AudioLDMPipeline, - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device - -from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AudioLDMPipeline - params = TEXT_TO_AUDIO_PARAMS - batch_params = TEXT_TO_AUDIO_BATCH_PARAMS - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(32, 64), - class_embed_type="simple_projection", - projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=1, - out_channels=1, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = ClapTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - projection_dim=32, - ) - text_encoder = ClapTextModelWithProjection(text_encoder_config) - tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) - - vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, - ) - - vocoder = SpeechT5HifiGan(vocoder_config) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "vocoder": vocoder, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - } - return inputs - - def test_audioldm_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = 
audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = audioldm_pipe.tokenizer( - prompt, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - prompt_embeds = prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - prompt_embeds = F.normalize(prompt_embeds, dim=-1) - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = audioldm_pipe.tokenizer( - p, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - text_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - text_embeds = text_embeds.text_embeds - # additional L_2 normalization over each hidden-state - text_embeds = F.normalize(text_embeds, dim=-1) - - embeds.append(text_embeds) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt(self): - device = "cpu"  # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "egg cracking" - output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050,
-0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_num_waveforms_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = "A hammer hitting a wooden surface" - - # test num_waveforms_per_prompt=1 (default) - audios = audioldm_pipe(prompt, num_inference_steps=2).audios - - assert audios.shape == (1, 256) - - # test num_waveforms_per_prompt=1 (default) for batch of prompts - batch_size = 2 - audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - - assert audios.shape == (batch_size, 256) - - # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 - audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios - - assert audios.shape == (num_waveforms_per_prompt, 256) - - # test num_waveforms_per_prompt for batch of prompts - batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios - - assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - - def test_audioldm_audio_length_in_s(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(audio_length_in_s=0.016, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.016 - - output = audioldm_pipe(audio_length_in_s=0.032, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.032 - - def test_audioldm_vocoder_model_in_dim(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - - config = audioldm_pipe.vocoder.config - config.model_in_dim *= 2 - audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram - assert audio_shape == (1, 256) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(test_mean_pixel_difference=False) - - -@slow -# @require_torch_gpu -class AudioLDMPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = 
torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[77230:77240] - expected_slice = np.array( - [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315] - ) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 - - def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[27780:27790] - expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 diff --git a/diffusers/tests/pipelines/dance_diffusion/__init__.py b/diffusers/tests/pipelines/dance_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py deleted file mode 100644 index bbd4aa694b769a0903c505383d9634de8ebd4063..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
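The dance-diffusion tests below exercise unconditional audio generation with a 1-D UNet. A minimal sketch of the integration-test call, assuming the harmonai/maestro-150k checkpoint is available:

import torch
from diffusers import DanceDiffusionPipeline

pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k").to("cuda")
generator = torch.manual_seed(0)
output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios[0]  # stereo waveform of shape (2, pipe.unet.sample_size)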
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DanceDiffusionPipeline - params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS - test_attention_slicing = False - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet1DModel( - block_out_channels=(32, 32, 64), - extra_in_channels=16, - sample_size=512, - sample_rate=16_000, - in_channels=2, - out_channels=2, - flip_sin_to_cos=True, - use_timestep_embedding=False, - time_embedding_type="fourier", - mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), - ) - scheduler = IPNDMScheduler() - - components = { - "unet": unet, - "scheduler": scheduler, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 4, - } - return inputs - - def test_dance_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = DanceDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000]) - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_dance_diffusion(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, 
pipe.unet.sample_size) - expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - def test_dance_diffusion_fp16(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, pipe.unet.sample_size) - expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/ddim/__init__.py b/diffusers/tests/pipelines/ddim/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/ddim/test_ddim.py b/diffusers/tests/pipelines/ddim/test_ddim.py deleted file mode 100644 index 4d2c4e490d638861c4d06fb3c2ddff489a2773d3..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/ddim/test_ddim.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
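The DDIM tests below pair a pretrained DDPM UNet with a freshly constructed DDIMScheduler rather than loading a full pipeline. A minimal sketch, assuming the google/ddpm-cifar10-32 weights used by the integration test:

import torch
from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel

unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
pipe = DDIMPipeline(unet=unet, scheduler=DDIMScheduler()).to("cuda")
generator = torch.manual_seed(0)
# eta=0.0 selects fully deterministic DDIM sampling
image = pipe(generator=generator, eta=0.0, output_type="numpy").images[0]  # (32, 32, 3) float array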
- -import unittest - -import numpy as np -import torch - -from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device - -from ...pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DDIMPipeline - params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "latents", - "callback", - "callback_steps", - } - batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDIMScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array( - [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04] - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - -@slow -@require_torch_gpu -class DDIMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler() - - ddim = DDIMPipeline(unet=unet, scheduler=scheduler) - ddim.to(torch_device) - ddim.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddim(generator=generator, eta=0.0, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.1723, 0.1617, 0.1600, 0.1626, 0.1497, 0.1513, 0.1505, 0.1442, 0.1453]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_inference_ema_bedroom(self): - model_id = "google/ddpm-ema-bedroom-256" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler.from_pretrained(model_id) - - ddpm = DDIMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.0060, 0.0201, 0.0344, 0.0024, 0.0018, 0.0002, 0.0022, 0.0000, 0.0069]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff 
--git a/diffusers/tests/pipelines/ddpm/__init__.py b/diffusers/tests/pipelines/ddpm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/ddpm/test_ddpm.py b/diffusers/tests/pipelines/ddpm/test_ddpm.py deleted file mode 100644 index 5e3e47cb74fbe07bb9ddf73c40b200bcea945237..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/ddpm/test_ddpm.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch - -from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DDPMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_fast_inference(self): - device = "cpu" - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler() - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_inference_predict_sample(self): - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler(prediction_type="sample") - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.manual_seed(0) - image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0] - - image_slice = image[0, -3:, -3:, -1] - image_eps_slice = image_eps[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance - - -@slow -@require_torch_gpu -class 
DDPMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDPMScheduler.from_pretrained(model_id) - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4200, 0.3588, 0.1939, 0.3847, 0.3382, 0.2647, 0.4155, 0.3582, 0.3385]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/dit/__init__.py b/diffusers/tests/pipelines/dit/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/dit/test_dit.py b/diffusers/tests/pipelines/dit/test_dit.py deleted file mode 100644 index c514c3c7fa1d7b7a83307a04c37ca63dece289e5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/dit/test_dit.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
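The DiT tests below are class-conditional: human-readable ImageNet class names are first mapped to label ids, and those ids condition the transformer. A minimal sketch, assuming the facebook/DiT-XL-2-256 checkpoint from the integration test:

import torch
from diffusers import DiTPipeline

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256").to("cuda")
ids = pipe.get_label_ids(["vase", "umbrella"])  # class names -> ImageNet label ids
generator = torch.manual_seed(0)
images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images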
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel -from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import ( - CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, -) -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DiTPipeline - params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = Transformer2DModel( - sample_size=16, - num_layers=2, - patch_size=4, - attention_head_dim=8, - num_attention_heads=2, - in_channels=4, - out_channels=8, - attention_bias=True, - activation_fn="gelu-approximate", - num_embeds_ada_norm=1000, - norm_type="ada_norm_zero", - norm_elementwise_affine=False, - ) - vae = AutoencoderKL() - scheduler = DDIMScheduler() - components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "class_labels": [1], - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(image.shape, (1, 16, 16, 3)) - expected_slice = np.array([0.4380, 0.4141, 0.5159, 0.0000, 0.4282, 0.6680, 0.5485, 0.2545, 0.6719]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - -@require_torch_gpu -@slow -class DiTPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_dit_256(self): - generator = torch.manual_seed(0) - - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256") - pipe.to("cuda") - - words = ["vase", "umbrella", "white shark", "white wolf"] - ids = pipe.get_label_ids(words) - - images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images - - for word, image in zip(words, images): - expected_image = load_numpy( - 
f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" - ) - assert np.abs((expected_image - image).max()) < 1e-2 - - def test_dit_512(self): - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") - - words = ["vase", "umbrella"] - ids = pipe.get_label_ids(words) - - generator = torch.manual_seed(0) - images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images - - for word, image in zip(words, images): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - f"/dit/{word}_512.npy" - ) - - assert np.abs((expected_image - image).max()) < 1e-1 diff --git a/diffusers/tests/pipelines/karras_ve/__init__.py b/diffusers/tests/pipelines/karras_ve/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/karras_ve/test_karras_ve.py b/diffusers/tests/pipelines/karras_ve/test_karras_ve.py deleted file mode 100644 index 391e61a2b9c90c58049270a192884bd358621c52..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/karras_ve/test_karras_ve.py +++ /dev/null @@ -1,86 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch - -from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class KarrasVePipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = KarrasVeScheduler() - - pipe = KarrasVePipeline(unet=unet, scheduler=scheduler) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class KarrasVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-celebahq-256" - model = UNet2DModel.from_pretrained(model_id) - scheduler = KarrasVeScheduler() - - pipe = KarrasVePipeline(unet=model, scheduler=scheduler) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/latent_diffusion/__init__.py b/diffusers/tests/pipelines/latent_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py deleted file mode 100644 index 3f2dbe5cec2a324d80fe7bcca1efffe9bcd3ab02..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ /dev/null @@ -1,202 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
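The latent-diffusion text-to-image tests below assemble the pipeline from a UNet, a VQ-VAE (registered as "vqvae"), a BERT-style text encoder (registered as "bert"), and a tokenizer. A minimal end-to-end sketch, assuming the CompVis/ldm-text2im-large-256 checkpoint from the slow tests:

import torch
from diffusers import LDMTextToImagePipeline

pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to("cuda")
generator = torch.manual_seed(0)
image = pipe(
    prompt="A painting of a squirrel eating a burger",
    generator=generator,
    num_inference_steps=50,
    guidance_scale=6.0,
    output_type="numpy",
).images[0]  # (256, 256, 3)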
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel -from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = LDMTextToImagePipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "negative_prompt_embeds", - "cross_attention_kwargs", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=(32, 64), - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vqvae": vae, - "bert": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_inference_text2img(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = LDMTextToImagePipeline(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 16, 16, 3) - expected_slice = np.array([0.59450, 0.64078, 0.55509, 0.51229, 0.69640, 0.36960, 0.59296, 0.60801, 0.49332]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class LDMTextToImagePipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, dtype=torch.float32, seed=0): - generator = 
torch.manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878]) - max_diff = np.abs(expected_slice - image_slice).max() - assert max_diff < 1e-3 - - -@nightly -@require_torch_gpu -class LDMTextToImagePipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, dtype=torch.float32, seed=0): - generator = torch.manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py deleted file mode 100644 index f1aa2f08efbaac9d5d8ce55b2a01ebf9fc538bd1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ /dev/null @@ -1,131 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
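The super-resolution tests below feed a small input image through an unconditional latent diffusion model that upscales each spatial dimension 4x (a 64x64 input yields a 256x256 output, per the integration test). A minimal sketch, assuming the duongna/ldm-super-resolution checkpoint and the test image it references:

import torch
from diffusers import LDMSuperResolutionPipeline
from diffusers.utils import load_image

pipe = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution").to("cuda")
low_res = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/vq_diffusion/teddy_bear_pool.png"
).resize((64, 64))
generator = torch.manual_seed(0)
image = pipe(image=low_res, generator=generator, num_inference_steps=20, output_type="numpy").images[0]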
- -import random -import unittest - -import numpy as np -import torch - -from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel -from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMSuperResolutionPipelineFastTests(unittest.TestCase): - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=6, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - def test_inference_superresolution(self): - device = "cpu" - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.to(device) - ldm.set_progress_bar_config(disable=None) - - init_image = self.dummy_image.to(device) - - generator = torch.Generator(device=device).manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_inference_superresolution_fp16(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - - # put models in fp16 - unet = unet.half() - vqvae = vqvae.half() - - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - init_image = self.dummy_image.to(torch_device) - - image = ldm(init_image, num_inference_steps=2, output_type="numpy").images - - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_torch -class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase): - def test_inference_superresolution(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/vq_diffusion/teddy_bear_pool.png" - ) - init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"]) - - ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto") - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.7560, 0.7425, 0.7257, 0.6907]) - - assert np.abs(image_slice.flatten() - expected_slice).max() 
< 1e-2 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py deleted file mode 100644 index aa7b33730d1815d2b1de20b48c6106407cc41770..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel - -from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - def test_inference_uncond(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vae = self.dummy_vq_model - - ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance - - -@slow -@require_torch -class LDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_uncond(self): - ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) 
- - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/diffusers/tests/pipelines/paint_by_example/__init__.py b/diffusers/tests/pipelines/paint_by_example/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py deleted file mode 100644 index 81d1989200ac1ddbab305d5143ec98bcd654f46b..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPImageProcessor, CLIPVisionConfig - -from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder -from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = PaintByExamplePipeline - params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = PaintByExampleImageEncoder(config, proj_size=32) - feature_extractor 
= CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "safety_checker": None, - "feature_extractor": feature_extractor, - } - return components - - def convert_to_pt(self, image): - image = np.array(image.convert("RGB")) - image = image[None].transpose(0, 3, 1, 2) - image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 - return image - - def get_dummy_inputs(self, device="cpu", seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "example_image": example_image, - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_paint_by_example_inpaint(self): - components = self.get_dummy_components() - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**components) - pipe = pipe.to("cpu") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4701, 0.5555, 0.3994, 0.5107, 0.5691, 0.4517, 0.5125, 0.4769, 0.4539]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_paint_by_example_image_tensor(self): - device = "cpu" - inputs = self.get_dummy_inputs() - inputs.pop("mask_image") - image = self.convert_to_pt(inputs.pop("image")) - mask_image = image.clamp(0, 1) / 2 - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**self.get_dummy_components()) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(image=image, mask_image=mask_image[:, 0], **inputs) - out_1 = output.images - - image = image.cpu().permute(0, 2, 3, 1)[0] - mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0] - - image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB") - - output = pipe(**self.get_dummy_inputs()) - out_2 = output.images - - assert out_1.shape == (1, 64, 64, 3) - assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2 - - -@slow -@require_torch_gpu -class PaintByExamplePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_paint_by_example(self): - # make sure here that pndm scheduler skips prk - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/dog_in_bucket.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/mask.png" - ) - example_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/panda.jpg" - ) - - pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(321) - output = pipe( - image=init_image, - mask_image=mask_image, - example_image=example_image, - generator=generator, - guidance_scale=5.0, - num_inference_steps=50, - output_type="np", - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/pndm/__init__.py b/diffusers/tests/pipelines/pndm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/pndm/test_pndm.py b/diffusers/tests/pipelines/pndm/test_pndm.py deleted file mode 100644 index bed5fea561dc670220c1864c614b68718e96a7ae..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/pndm/test_pndm.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch - -from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PNDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = PNDMScheduler() - - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.to(torch_device) - pndm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class PNDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = PNDMScheduler() - - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.to(torch_device) - pndm.set_progress_bar_config(disable=None) - generator = torch.manual_seed(0) - image = pndm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/repaint/__init__.py b/diffusers/tests/pipelines/repaint/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/repaint/test_repaint.py b/diffusers/tests/pipelines/repaint/test_repaint.py deleted file mode 100644 index 060e6c9161baab099bc11b3d843dd4b48f7e2fb6..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/repaint/test_repaint.py +++ /dev/null @@ -1,162 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
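Condensed from the PNDM integration test above, the whole user-facing surface that file exercises fits in a few lines; this sketch assumes the google/ddpm-cifar10-32 checkpoint is reachable:

import torch
from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel

unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
pndm = PNDMPipeline(unet=unet, scheduler=PNDMScheduler())
# the seeded generator is what makes the hard-coded slice assertions reproducible
image = pndm(generator=torch.manual_seed(0), output_type="numpy").images
assert image.shape == (1, 32, 32, 3)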
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device - -from ...pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = RePaintPipeline - params = IMAGE_INPAINTING_PARAMS - {"width", "height", "guidance_scale"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = IMAGE_INPAINTING_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = RePaintScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - image = np.random.RandomState(seed).standard_normal((1, 3, 32, 32)) - image = torch.from_numpy(image).to(device=device, dtype=torch.float32) - mask = (image > 0).to(device=device, dtype=torch.float32) - inputs = { - "image": image, - "mask_image": mask, - "generator": generator, - "num_inference_steps": 5, - "eta": 0.0, - "jump_length": 2, - "jump_n_sample": 2, - "output_type": "numpy", - } - return inputs - - def test_repaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = RePaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([1.0000, 0.5426, 0.5497, 0.2200, 1.0000, 1.0000, 0.5623, 1.0000, 0.6274]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - # RePaint can hardly be made deterministic since the scheduler is currently always - # nondeterministic - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@nightly -@require_torch_gpu -class RepaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_celebahq(self): - original_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256.png"
- ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256_result.npy" - ) - - model_id = "google/ddpm-ema-celebahq-256" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = RePaintScheduler.from_pretrained(model_id) - - repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device) - repaint.set_progress_bar_config(disable=None) - repaint.enable_attention_slicing() - - generator = torch.manual_seed(0) - output = repaint( - original_image, - mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).mean() < 1e-2 diff --git a/diffusers/tests/pipelines/score_sde_ve/__init__.py b/diffusers/tests/pipelines/score_sde_ve/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py deleted file mode 100644 index 036ecc3f6bf3c3a61780933c0a404ca91abe5dc4..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
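The jump_length and jump_n_sample arguments threaded through both RePaint tests control the resampling schedule from the RePaint paper: the sampler repeatedly jumps jump_length steps back up the noise schedule and re-denoises, jump_n_sample times per window, so the effective number of denoising steps is far larger than num_inference_steps. A quick way to see this, using the nightly test's settings (a sketch, not part of the deleted file):

from diffusers import RePaintScheduler

scheduler = RePaintScheduler()
scheduler.set_timesteps(num_inference_steps=250, jump_length=10, jump_n_sample=10)
# the resampling jumps expand the schedule well past 250 entries
print(len(scheduler.timesteps))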
- -import unittest - -import numpy as np -import torch - -from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class ScoreSdeVePipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = ScoreSdeVeScheduler() - - sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images - - generator = torch.manual_seed(0) - image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[ - 0 - ] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class ScoreSdeVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-church-256" - model = UNet2DModel.from_pretrained(model_id) - - scheduler = ScoreSdeVeScheduler.from_pretrained(model_id) - - sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - - expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py b/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py deleted file mode 100644 index b312c8184390c0eb7df751cbbbf1e1b5146fb428..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ /dev/null @@ -1,601 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
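A recurring detail in the get_dummy_inputs helpers above and below: the generator is built differently on mps, because device-local torch.Generator objects were not supported there when these tests were written, so the global default generator returned by torch.manual_seed is used instead. Extracted as a sketch (the function name make_generator is hypothetical):

import torch

def make_generator(device, seed: int = 0) -> torch.Generator:
    # mps lacks device-local generators; fall back to the default (CPU)
    # generator, seeded globally via torch.manual_seed
    if str(device).startswith("mps"):
        return torch.manual_seed(seed)
    return torch.Generator(device=device).manual_seed(seed)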
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - # Tensor.to is not in-place; reassign so the move actually sticks - self.pixel_values = self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_semantic_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # assemble the pipeline around the DDIM scheduler - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice =
image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_semantic_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a 
squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_torch_gpu -class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_positive_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34673113, - 0.38492733, - 0.37597352, - 0.34086335, - 0.35650748, - 0.35579205, - 0.3384763, - 0.34340236, - 0.3573271, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.41887826, - 0.37728766, - 0.30138272, - 0.41416335, - 0.41664985, - 0.36283392, - 0.36191246, - 0.43364465, - 0.43001732, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_negative_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "an image of a crowded boulevard, realistic, 4k" - edit = { - "editing_prompt": "crowd, crowded, people", - "reverse_editing_direction": True, - "edit_warmup_steps": 10, - "edit_guidance_scale": 8.3, - "edit_threshold": 0.9, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 9 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.43497998, - 0.91814065, - 0.7540739, - 0.55580205, - 0.8467265, - 0.5389691, - 0.62574506, - 0.58897763, - 0.50926757, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 
0.3089719, - 0.30500144, - 0.29016042, - 0.30630964, - 0.325687, - 0.29419225, - 0.2908091, - 0.28723598, - 0.27696294, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_multi_cond_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a castle next to a river" - edit = { - "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], - "reverse_editing_direction": False, - "edit_warmup_steps": [15, 18], - "edit_guidance_scale": 6, - "edit_threshold": [0.9, 0.8], - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 48 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.75163555, - 0.76037145, - 0.61785, - 0.9189673, - 0.8627701, - 0.85189694, - 0.8512813, - 0.87012076, - 0.8312857, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.73553365, - 0.7537271, - 0.74341905, - 0.66480356, - 0.6472925, - 0.63039416, - 0.64812905, - 0.6749717, - 0.6517102, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_guidance_fp16(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34887695, - 0.3876953, - 0.375, - 0.34423828, - 0.3581543, - 0.35717773, - 0.3383789, - 0.34570312, - 0.359375, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.42285156, - 0.36914062, - 0.29077148, - 0.42041016, - 0.41918945, - 0.35498047, - 0.3618164, - 
0.4423828, - 0.43115234, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py b/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py deleted file mode 100644 index 594d7c598f7507d07973e9e2cd8f62a5f0a1b7fd..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ /dev/null @@ -1,235 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline -from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder -from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime - -from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -MIDI_FILE = "./tests/fixtures/elise_format0.mid" - - -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = SpectrogramDiffusionPipeline - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - test_attention_slicing = False - test_cpu_offload = False - batch_params = TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS - params = TOKENS_TO_AUDIO_GENERATION_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - notes_encoder = SpectrogramNotesEncoder( - max_length=2048, - vocab_size=1536, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - continuous_encoder = SpectrogramContEncoder( - input_dims=128, - targets_context_length=256, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - decoder = T5FilmDecoder( - input_dims=128, - targets_length=256, - max_decoder_noise_time=20000.0, - d_model=768, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - dropout_rate=0.1, - ) - - scheduler = DDPMScheduler() - - components = { - "notes_encoder": notes_encoder.eval(), - "continuous_encoder": continuous_encoder.eval(), - "decoder": decoder.eval(), - "scheduler": scheduler, - "melgan": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if
str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "input_tokens": [ - [1134, 90, 1135, 1133, 1080, 112, 1132, 1080, 1133, 1079, 133, 1132, 1079, 1133, 1] + [0] * 2033 - ], - "generator": generator, - "num_inference_steps": 4, - "output_type": "mel", - } - return inputs - - def test_spectrogram_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = SpectrogramDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - mel = output.audios - - mel_slice = mel[0, -3:, -3:] - - assert mel_slice.shape == (3, 3) - expected_slice = np.array( - [-11.512925, -4.788215, -0.46172905, -2.051715, -10.539147, -10.970963, -9.091634, 4.0, 4.0] - ) - assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - def test_inference_batch_single_identical(self): - pass - - def test_inference_batch_consistent(self): - pass - - @skip_mps - def test_progress_bar(self): - return super().test_progress_bar() - - -@slow -@require_torch_gpu -@require_onnxruntime -@require_note_seq -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_callback(self): - # TODO - test that pipeline can decode tokens in a callback - # so that music can be played live - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = pipe.melgan - pipe.melgan = None - - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - def callback(step, mel_output): - # decode mel to audio - audio = melgan(input_features=mel_output.astype(np.float32))[0] - assert len(audio[0]) == 81920 * (step + 1) - # simulate that audio is played - return audio - - processor = MidiProcessor() - input_tokens = processor(MIDI_FILE) - - input_tokens = input_tokens[:3] - generator = torch.manual_seed(0) - pipe(input_tokens, num_inference_steps=5, generator=generator, callback=callback, output_type="mel") - - def test_spectrogram_fast(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - # just run two denoising loops - input_tokens = input_tokens[:2] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=2, generator=generator) - - audio = output.audios[0] - - assert abs(np.abs(audio).sum() - 3612.841) < 1e-1 - - def test_spectrogram(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - processor = MidiProcessor() - - input_tokens = 
processor(MIDI_FILE) - - # just run 4 denoising loops - input_tokens = input_tokens[:4] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=100, generator=generator) - - audio = output.audios[0] - assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/__init__.py b/diffusers/tests/pipelines/stable_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py deleted file mode 100644 index 5282cfd8dd2472ca8bf1bb785c6ee69268d4be52..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = CycleDiffusionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - 
pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "An astronaut riding an elephant", - "source_prompt": "An astronaut riding a horse", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "eta": 0.1, - "strength": 0.8, - "guidance_scale": 3, - "source_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_cycle(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4459, 0.4943, 0.4544, 0.6643, 0.5474, 0.4327, 0.5701, 0.5959, 0.5179]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_cycle_fp16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.half() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.3506, 0.4543, 0.446, 0.4575, 0.5195, 0.4155, 0.5273, 0.518, 0.4116]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class CycleDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_cycle_diffusion_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car_fp16.npy" - ) - init_image = 
init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16" - ) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - # the values aren't exactly equal, but the images look the same visually - assert np.abs(image - expected_image).max() < 5e-1 - - def test_cycle_diffusion_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car.npy" - ) - init_image = init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - assert np.abs(image - expected_image).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py deleted file mode 100644 index 74783faae421cb0a10a89fda4f19454f4cf834a8..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py +++ /dev/null @@ -1,306 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
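For reference, the full-precision CycleDiffusion test above reduces to this usage pattern; the checkpoint, prompts, and sampler settings are copied from the test, so only the assertion scaffolding is stripped away:

import torch
from diffusers import CycleDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

model_id = "CompVis/stable-diffusion-v1-4"
init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/cycle-diffusion/black_colored_car.png"
).resize((512, 512))

pipe = CycleDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
    safety_checker=None,
)
# source_prompt describes the input image; prompt describes the desired edit
image = pipe(
    prompt="A blue colored car",
    source_prompt="A black colored car",
    image=init_image,
    num_inference_steps=100,
    eta=0.1,
    strength=0.85,
    guidance_scale=3,
    source_guidance_scale=1,
    generator=torch.manual_seed(0),
    output_type="np",
).images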
- -import tempfile -import unittest - -import numpy as np - -from diffusers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionPipeline, - PNDMScheduler, -) -from diffusers.utils.testing_utils import is_onnx_available, nightly, require_onnxruntime, require_torch_gpu - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - - def get_dummy_inputs(self, seed=0): - generator = np.random.RandomState(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddim(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65072, 0.58492, 0.48219, 0.55521, 0.53180, 0.55939, 0.50697, 0.39800, 0.46455]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65863, 0.59425, 0.49326, 0.56313, 0.53875, 0.56627, 0.51065, 0.39777, 0.46330]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_lms(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - 
pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53817, 0.60812, 0.47384, 0.49530, 0.51894, 0.49814, 0.47984, 0.38958, 0.44271]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53895, 0.60808, 0.47933, 0.49608, 0.51886, 0.49950, 0.48053, 0.38957, 0.44200]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - # using the PNDM scheduler by default - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - np.random.seed(0) - output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=10, output_type="np") - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0452, 0.0390, 0.0087, 0.0350, 0.0617, 0.0364, 0.0544, 0.0523, 0.0720]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_ddim(self): - ddim_scheduler = DDIMScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=ddim_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "open neural network exchange" - generator = np.random.RandomState(0) - output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2867, 0.1974, 0.1481, 0.7294, 0.7251, 0.6667, 0.4194, 0.5642, 0.6486]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_k_lms(self): - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - 
sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "open neural network exchange" - generator = np.random.RandomState(0) - output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2306, 0.1959, 0.1593, 0.6549, 0.6394, 0.5408, 0.5065, 0.6010, 0.6161]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_intermediate_state(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.6772, -0.3835, -1.2456, 0.1905, -1.0974, 0.6967, -1.9353, 0.0178, 1.0167] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 5: - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.3351, 0.2241, -0.1837, -0.2325, -0.6577, 0.3393, -0.0241, 0.5899, 1.3875] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - test_callback_fn.has_been_called = False - - pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "Andromeda galaxy in a bottle" - - generator = np.random.RandomState(0) - pipe( - prompt=prompt, - num_inference_steps=5, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 6 - - def test_stable_diffusion_no_safety_checker(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - assert isinstance(pipe, OnnxStableDiffusionPipeline) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = OnnxStableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py deleted file mode 100644 index e1aa2f6dc0a1641f217f0b20ef93d2f82cf15140..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ /dev/null @@ -1,245 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np - -from diffusers import ( - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionImg2ImgPipeline, - PNDMScheduler, -) -from diffusers.utils import floats_tensor -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionImg2ImgPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) - generator = np.random.RandomState(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddim(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.69643, 0.58484, 0.50314, 0.58760, 0.55368, 0.59643, 0.51529, 0.41217, 0.49087]) - assert np.abs(image_slice - expected_slice).max() < 1e-1 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.61737, 0.54642, 0.53183, 0.54465, 0.52742, 0.60525, 0.49969, 0.40655, 0.48154]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_lms(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - # warmup pass to apply optimizations - _ = pipe(**self.get_dummy_inputs()) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52761, 0.59977, 0.49033, 0.49619, 0.54282, 0.50311, 0.47600, 0.40918, 0.45203]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = 
EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65331, 0.58277, 0.48204, 0.56059, 0.53665, 0.56235, 0.50969, 0.40009, 0.46552]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 768, 3) - expected_slice = np.array([0.4909, 0.5059, 0.5372, 0.4623, 0.4876, 0.5049, 0.4820, 0.4956, 0.5019]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 
512)) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 768, 3) - expected_slice = np.array([0.8043, 0.926, 0.9581, 0.8119, 0.8954, 0.913, 0.7209, 0.7463, 0.7431]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py deleted file mode 100644 index 16287d64d154872f50b49b822daec79641f11f11..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py +++ /dev/null @@ -1,141 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
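Editor's aside — every ONNX integration test in this diff repeats the same provider/session boilerplate exercised above. A minimal, self-contained sketch of that shared pattern (not part of the original diff; the checkpoint, prompt, and step count are illustrative values taken from the tests themselves, and a CUDA-capable onnxruntime-gpu install is assumed):

# Shared ONNX Runtime session setup used by the integration tests in this diff.
import numpy as np
import onnxruntime as ort

from diffusers import OnnxStableDiffusionPipeline

# (name, options) tuple pinning the CUDA execution provider with a ~15 GB arena,
# mirroring the tests' gpu_provider property
provider = (
    "CUDAExecutionProvider",
    {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"},
)

# mirrors the tests' gpu_options property
options = ort.SessionOptions()
options.enable_mem_pattern = False

pipe = OnnxStableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="onnx",
    safety_checker=None,
    feature_extractor=None,
    provider=provider,
    sess_options=options,
)

# the ONNX pipelines take a NumPy RandomState where the torch pipelines take a torch.Generator
generator = np.random.RandomState(0)
image = pipe(
    ["A fantasy landscape, trending on artstation"],
    num_inference_steps=10,
    guidance_scale=7.5,
    generator=generator,
    output_type="np",
).images[0]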
- -import unittest - -import numpy as np - -from diffusers import LMSDiscreteScheduler, OnnxStableDiffusionInpaintPipeline -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - # FIXME: add fast tests - pass - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 255:258, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2514, 0.3007, 0.3517, 0.1790, 0.2382, 0.3167, 0.1944, 0.2273, 0.2464]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-inpainting", subfolder="scheduler", revision="onnx" - ) - pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 255:258, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0086, 0.0077, 0.0083, 0.0093, 0.0107, 0.0139, 0.0094, 0.0097, 0.0125]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 diff 
--git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 235aa32f7338579210520c675b3776b830cbe3da..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from diffusers import OnnxStableDiffusionInpaintPipelineLegacy -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - load_numpy, - nightly, - require_onnxruntime, - require_torch_gpu, -) - - -if is_onnx_available(): - import onnxruntime as ort - - -@nightly -@require_onnxruntime -@require_torch_gpu -class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/red_cat_sitting_on_a_park_bench_onnx.npy" - ) - - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=15, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py deleted file mode 100644 index d1527a42a1e56b3de663c596a8457fab5006bfb2..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np -import torch - -from diffusers import ( - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionUpscalePipeline, - PNDMScheduler, -) -from diffusers.utils import floats_tensor -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionUpscalePipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - # TODO: is there an appropriate internal test set? - hub_checkpoint = "ssube/stable-diffusion-x4-upscaler-onnx" - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddpm(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - # started as 128, should now be 512 - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] - ) - assert np.abs(image_slice - expected_slice).max() < 1e-1 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6898892, 0.59240556, 0.52499527, 0.58866215, 0.52258235, 0.52572715, 0.62414473, 0.6174387, 0.6214964] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.7659278, 0.76437664, 0.75579107, 0.7691116, 0.77666986, 0.7727672, 0.7758664, 0.7812226, 0.76942515] - ) - - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.77424496, 0.773601, 0.7645288, 0.7769598, 0.7772739, 0.7738688, 0.78187233, 0.77879584, 0.767043] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_ddpm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((128, 128)) - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4883, 0.4947, 0.4980, 0.4975, 0.4982, 0.4980, 0.5000, 0.5006, 0.4972]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((128, 128)) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", subfolder="scheduler" - ) - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", - scheduler=lms_scheduler, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, 
trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.50173753, 0.50223356, 0.502039, 0.50233036, 0.5023725, 0.5022601, 0.5018758, 0.50234085, 0.50241566] - ) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py deleted file mode 100644 index 857122782d354cd5fcd5b69daf2f601be799c5d1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ /dev/null @@ -1,1025 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import gc -import tempfile -import time -import unittest - -import numpy as np -import torch -from huggingface_hub import hf_hub_download -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu - -from ...models.test_models_unet_2d_condition import create_lora_layers -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - 
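# (editor's note, not part of the original sources: this CLIPTextConfig is deliberately tiny — 32-dim hidden states, 5 layers, a 1000-token vocabulary — so the fast tests can run on CPU in seconds)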
hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_lora(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - # forward 1 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - # set lora layers - lora_attn_procs = create_lora_layers(sd_pipe.unet) - sd_pipe.unet.set_attn_processor(lora_attn_procs) - sd_pipe = sd_pipe.to(torch_device) - - # forward 2 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) - image = output.images - image_slice_1 = image[0, -3:, -3:, -1] - - # forward 3 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) - image = output.images - image_slice_2 = image[0, -3:, -3:, -1] - - assert np.abs(image_slice - image_slice_1).max() < 1e-2 - assert np.abs(image_slice - image_slice_2).max() > 1e-2 - - def test_stable_diffusion_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = sd_pipe.tokenizer( - prompt, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = 
sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_negative_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( - p, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_ddim_factor_8(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, height=136, width=136) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", 
num_inference_steps=2).images[0] - assert image is not None - - def test_stable_diffusion_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082293033599854, - 0.5371589064598083, - 0.4562119245529175, - 0.5220914483070374, - 0.5733777284622192, - 0.4795039892196655, - 0.5465868711471558, - 0.5074326395988464, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.4707113206386566, - 0.5372191071510315, - 0.4563021957874298, - 0.5220003724098206, - 0.5734264850616455, - 0.4794946610927582, - 0.5463782548904419, - 0.5074145197868347, - 0.504422664642334, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082313895225525, - 0.5371587872505188, - 0.4562119245529175, - 0.5220913887023926, - 0.5733776688575745, - 0.47950395941734314, - 0.546586811542511, - 0.5074326992034912, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_vae_slicing(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - image_count = 4 - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_1 = sd_pipe(**inputs) - - # make sure sliced vae decode yields the same result - sd_pipe.enable_vae_slicing() - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_2 = sd_pipe(**inputs) - - # there is a small discrepancy at image borders vs. 
full batch decode - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 - - def test_stable_diffusion_vae_tiling(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - - # the safety checker is not needed for this test - components["safety_checker"] = None - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test that tiled decode yields the same result as the non-tiled decode - generator = torch.Generator(device=device).manual_seed(0) - output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - # make sure tiled vae decode yields the same result - sd_pipe.enable_vae_tiling() - generator = torch.Generator(device=device).manual_seed(0) - output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1 - - # test that tiled decode works with various shapes - shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)] - for shape in shapes: - zeros = torch.zeros(shape).to(device) - sd_pipe.vae.decode(zeros) - - def test_stable_diffusion_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.5108221173286438, - 0.5688379406929016, - 0.4685141146183014, - 0.5098261833190918, - 0.5657756328582764, - 0.4631010890007019, - 0.5226285457611084, - 0.49129390716552734, - 0.4899061322212219, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert 
text_embeddings.shape[1] == 77 - - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - def test_stable_diffusion_height_width_opt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "hey" - - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (64, 64) - - output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (96, 96) - - config = dict(sd_pipe.unet.config) - config["sample_size"] = 96 - sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (192, 192) - - -@slow -@require_torch_gpu -class StableDiffusionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) - assert 
np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # enable attention slicing - pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.75 GB is allocated - assert mem_bytes < 3.75 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - - # make sure that more than 3.75 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.75 * 10**9 - assert np.abs(image_sliced - image).max() < 1e-3 - - def test_stable_diffusion_vae_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # enable vae slicing - pipe.enable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4e9 - - # disable vae slicing - pipe.disable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image = pipe(**inputs).images - - # make sure that more than 4 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 4e9 - # There is a small discrepancy at the image borders vs. a fully batched version. 
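# (editor's note, not part of the original sources: enable_vae_slicing() decodes the latent batch one image at a time instead of all four at once, trading peak memory for tiny numerical drift at image borders — hence the relaxed 1e-2 tolerance below)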
- assert np.abs(image_sliced - image).max() < 1e-2 - - def test_stable_diffusion_vae_tiling(self): - torch.cuda.reset_peak_memory_stats() - model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.unet = pipe.unet.to(memory_format=torch.channels_last) - pipe.vae = pipe.vae.to(memory_format=torch.channels_last) - - prompt = "a photograph of an astronaut riding a horse" - - # enable vae tiling - pipe.enable_vae_tiling() - pipe.enable_model_cpu_offload() - generator = torch.Generator(device="cpu").manual_seed(0) - output_chunked = pipe( - [prompt], - width=1024, - height=1024, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ) - image_chunked = output_chunked.images - - mem_bytes = torch.cuda.max_memory_allocated() - - # disable vae tiling - pipe.disable_vae_tiling() - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe( - [prompt], - width=1024, - height=1024, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ) - image = output.images - - assert mem_bytes < 1e10 - assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2 - - def test_stable_diffusion_fp16_vs_autocast(self): - # this test makes sure that the original model with autocast - # and the new model with fp16 yield the same result - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_fp16 = pipe(**inputs).images - - with torch.autocast(torch_device): - inputs = self.get_inputs(torch_device) - image_autocast = pipe(**inputs).images - - # Make sure results are close enough - diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) - # They ARE different since ops are not always run at the same precision - # however, they should be extremely close. 
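# (editor's note, not part of the original sources: under torch.autocast some ops, e.g. softmax and normalization, still run in fp32, while a model loaded with torch_dtype=torch.float16 executes everything in half precision, so the test bounds the mean rather than the max elementwise difference)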
- assert diff.mean() < 2e-2 - - def test_stable_diffusion_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_low_cpu_mem_usage(self): - pipeline_id = "CompVis/stable-diffusion-v1-4" - - start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but don't move to cuda - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - 
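# (editor's note, not part of the original sources: the CUDA peak-memory counters are cleared here so that max_memory_allocated() after the offloaded run measures only the model-offloading configuration, not the earlier baseline run)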
torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 - assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3.5 * 10**9 - for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker: - assert module.device == torch.device("cpu") - - # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_attention_slicing() - _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() - - assert mem_bytes_slicing < mem_bytes_offloaded - assert mem_bytes_slicing < 3 * 10**9 - - def test_stable_diffusion_textual_inversion(self): - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") - - a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") - a111_file_neg = hf_hub_download( - "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" - ) - pipe.load_textual_inversion(a111_file) - pipe.load_textual_inversion(a111_file_neg) - pipe.to("cuda") - - generator = torch.Generator(device="cpu").manual_seed(1) - - prompt = "An logo of a turtle in strong Style-Winter with " - neg_prompt = "Style-Winter-neg" - - image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" - ) - - max_diff = np.abs(expected_image - image).max() - assert max_diff < 5e-2 - - -@nightly -@require_torch_gpu -class StableDiffusionPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - 
"/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py deleted file mode 100644 index d556e6318f430d2761bb2ef02556b5bf0d1fcb88..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ /dev/null @@ -1,594 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - StableDiffusionControlNetPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionControlNetPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - torch.manual_seed(0) - controlnet = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": image, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - torch_device != "cuda" or 
not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - -class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionControlNetPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - torch.manual_seed(0) - controlnet1 = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - controlnet2 = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - controlnet = MultiControlNetModel([controlnet1, controlnet2]) - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - - images = [ - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - ] - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": images, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - 
torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - def test_save_pretrained_raise_not_implemented_exception(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - with tempfile.TemporaryDirectory() as tmpdir: - try: - # save_pretrained is not implemented for Multi-ControlNet - pipe.save_pretrained(tmpdir) - except NotImplementedError: - pass - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_float16(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_local(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_optional_components(self): - ... - - -@slow -@require_torch_gpu -class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_canny(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_depth(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_hed(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - 
pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "oil painting of handsome old man, masterpiece" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "room" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_normal(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "cute toy" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_openpose(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Chef in the kitchen" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_scribble(self): - controlnet = 
ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(5) - prompt = "bag" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (640, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_seg(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(5) - prompt = "house" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - prompt = "house" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" - ) - - _ = pipe( - prompt, - image, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4 * 10**9 - - -@slow -@require_torch_gpu -class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_pose_and_canny(self): - controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - controlnet_pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=[controlnet_pose, controlnet_canny] - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird and Chef" - image_canny = load_image( -
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - image_pose = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - - output = pipe(prompt, [image_pose, image_canny], generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py deleted file mode 100644 index 268c013201775c8c78960960669ace207670fd51..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -from diffusers import FlaxControlNetModel, FlaxStableDiffusionControlNetPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusionControlNetPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_canny(self): - controlnet, controlnet_params = FlaxControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.bfloat16 - ) - pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16 - ) - params["controlnet"] = controlnet_params - - prompts = "bird" - num_samples = jax.device_count() - prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples) - - canny_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - processed_image = pipe.prepare_image_inputs([canny_image] * num_samples) - - rng = jax.random.PRNGKey(0) - rng = jax.random.split(rng, jax.device_count()) - - p_params = replicate(params) - prompt_ids = shard(prompt_ids) - processed_image = shard(processed_image) - - images = pipe( - prompt_ids=prompt_ids, - image=processed_image, - params=p_params, - prng_seed=rng, - num_inference_steps=50, - jit=True, - ).images - assert images.shape == (jax.device_count(), 1, 768, 512, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = 
jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [0.167969, 0.116699, 0.081543, 0.154297, 0.132812, 0.108887, 0.169922, 0.169922, 0.205078] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 - - def test_pose(self): - controlnet, controlnet_params = FlaxControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose", from_pt=True, dtype=jnp.bfloat16 - ) - pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16 - ) - params["controlnet"] = controlnet_params - - prompts = "Chef in the kitchen" - num_samples = jax.device_count() - prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples) - - pose_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - processed_image = pipe.prepare_image_inputs([pose_image] * num_samples) - - rng = jax.random.PRNGKey(0) - rng = jax.random.split(rng, jax.device_count()) - - p_params = replicate(params) - prompt_ids = shard(prompt_ids) - processed_image = shard(processed_image) - - images = pipe( - prompt_ids=prompt_ids, - image=processed_image, - params=p_params, - prng_seed=rng, - num_inference_steps=50, - jit=True, - ).images - assert images.shape == (jax.device_count(), 1, 768, 512, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [[0.271484, 0.261719, 0.275391, 0.277344, 0.279297, 0.291016, 0.294922, 0.302734, 0.302734]] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py deleted file mode 100644 index 01c2e22e48161b125e783273b55e395050646609..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ /dev/null @@ -1,306 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
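- # This file tests StableDiffusionImageVariationPipeline: the fast tests run a - # tiny randomly-initialised pipeline on CPU, while the slow/nightly tests load - # full checkpoints and compare outputs against reference arrays from the Hub. - # A minimal usage sketch (with the lambdalabs/sd-image-variations-diffusers - # checkpoint exercised by the slow tests): - # - # pipe = StableDiffusionImageVariationPipeline.from_pretrained( - # "lambdalabs/sd-image-variations-diffusers" - # ) - # images = pipe(image=init_image, guidance_scale=7.5).images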
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection - -from diffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - PNDMScheduler, - StableDiffusionImageVariationPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - image_encoder_config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "feature_extractor": feature_extractor, - "safety_checker": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5167, 0.5746, 0.4835, 0.4914, 0.5605, 0.4691, 0.5201, 0.4898, 0.4958]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img_variation_multiple_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - 
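- # the pipeline is assembled directly from the tiny dummy modules above, which - # keeps this batched fast test quick enough to run on CPU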
sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["image"] = 2 * [inputs["image"]] - output = sd_pipe(**inputs) - - image = output.images - - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([0.6568, 0.5470, 0.5684, 0.5444, 0.5945, 0.6221, 0.5508, 0.5531, 0.5263]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_pipeline_default(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "lambdalabs/sd-image-variations-diffusers", safety_checker=None - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_img_variation_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", - safety_checker=None, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - 
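- # the cache was just emptied; the next two calls reset the peak-memory - # statistics so the assertion at the end measures only this offloaded run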
torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - model_id = "fusing/sd-image-variations-diffusers" - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - model_id, safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.6 GB is allocated - assert mem_bytes < 2.6 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_img_variation_pndm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img_variation_dpm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py deleted file mode 100644 index e27f83fc04feb38edb85755dd9eaa48d528b95f8..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ /dev/null @@ -1,544 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - if input_image_type == "pt": - input_image = image - elif input_image_type == "np": - input_image = image.cpu().numpy().transpose(0, 2, 3, 1) - elif input_image_type == "pil": - input_image = image.cpu().numpy().transpose(0, 2, 3, 1) - input_image = VaeImageProcessor.numpy_to_pil(input_image) - else: - raise ValueError(f"unsupported input_image_type {input_image_type}.") - - if output_type not in ["pt", "np", "pil"]: - 
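- # fail fast on an unsupported output_type so that a typo in a parametrised - # test surfaces here rather than as an obscure downstream error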
raise ValueError(f"unsupported output_type {output_type}") - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": input_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": output_type, - } - return inputs - - def test_stable_diffusion_img2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_multiple_init_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = inputs["image"].repeat(2, 1, 1, 1) - image = sd_pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203]) - - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - @skip_mps - def test_pt_np_pil_outputs_equivalent(self): - device = "cpu" - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0] - - assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 - assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 - - @skip_mps - def test_image_types_consistent(self): - device = "cpu" - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0] - - assert np.abs(output_pt - output_np).max() <= 1e-4 - assert np.abs(output_pil - output_np).max() <= 1e-2 - - -@slow -@require_torch_gpu -class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/sketch-mountains-input.png" - ) - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_img2img_default(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.4300, 0.4662, 0.4930, 0.3990, 0.4307, 0.4525, 0.3719, 0.4064, 0.3923]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_k_lms(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, 
-3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0389, 0.0346, 0.0415, 0.0290, 0.0218, 0.0210, 0.0408, 0.0567, 0.0271]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_ddim(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.4958, 0.5107, 1.1045, 2.7539, 4.6680, 3.8320, 1.5049, 1.8633, 2.6523]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.4956, 0.5078, 1.0918, 2.7520, 4.6484, 3.8125, 1.5146, 1.8633, 2.6367]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is allocated - assert mem_bytes < 2.2 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - safety_checker=None, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but 
don't move to cuda - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - safety_checker=None, - torch_dtype=torch.float16, - ) - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - _ = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert mem_bytes_offloaded < mem_bytes - for module in pipe.text_encoder, pipe.unet, pipe.vae: - assert module.device == torch.device("cpu") - - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - # resize to resolution that is divisible by 8 but not 16 or 32 - init_image = init_image.resize((760, 504)) - - model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 760, 3) - expected_slice = np.array([0.9393, 0.9500, 0.9399, 0.9438, 0.9458, 0.9400, 0.9455, 0.9414, 0.9423]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 - - -@nightly -@require_torch_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/sketch-mountains-input.png" - ) - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_img2img_pndm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_ddim(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff 
< 1e-3 - - def test_img2img_lms(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_dpm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_dpm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py deleted file mode 100644 index 3553679e0ef6db5a088f36f34558b10bf0d638d4..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,538 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
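- # This file tests StableDiffusionInpaintPipeline. The dummy UNet below is - # built with in_channels=9: 4 latent channels for the noisy image, 4 for the - # masked-image latents and 1 for the downsampled mask, mirroring the channel - # layout of the full inpainting checkpoints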
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images 
- image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4723, 0.5731, 0.3939, 0.5441, 0.5922, 0.4392, 0.5059, 0.4651, 0.4474]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_image_tensor(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - out_pil = output.images - - inputs = self.get_dummy_inputs(device) - inputs["image"] = torch.tensor(np.array(inputs["image"]) / 127.5 - 1).permute(2, 0, 1).unsqueeze(0) - inputs["mask_image"] = torch.tensor(np.array(inputs["mask_image"]) / 255).permute(2, 0, 1)[:1].unsqueeze(0) - output = sd_pipe(**inputs) - out_tensor = output.images - - assert out_pil.shape == (1, 64, 64, 3) - assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 5e-2 - - -@slow -@require_torch_gpu -class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_ddim(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0427, 0.0460, 0.0483, 0.0460, 0.0584, 0.0521, 0.1549, 0.1695, 0.1794]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_fp16(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1350, 0.1123, 0.1350, 0.1641, 0.1328, 0.1230, 0.1289, 0.1531, 0.1687]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_inpaint_pndm(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = 
PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_k_lms(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.9314, 0.7575, 0.9432, 0.8885, 0.9028, 0.7298, 0.9811, 0.9667, 0.7633]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is allocated - assert mem_bytes < 2.2 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/stable_diffusion_inpaint_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config) - 
sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_pndm.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-    def test_inpaint_lms(self):
-        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
-        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_lms.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-    def test_inpaint_dpm(self):
-        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
-        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 30
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_dpm_multi.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
-    def test_pil_inputs(self):
-        im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
-        im = Image.fromarray(im)
-        mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
-        mask = Image.fromarray((mask * 255).astype(np.uint8))
-
-        t_mask, t_masked = prepare_mask_and_masked_image(im, mask)
-
-        self.assertTrue(isinstance(t_mask, torch.Tensor))
-        self.assertTrue(isinstance(t_masked, torch.Tensor))
-
-        self.assertEqual(t_mask.ndim, 4)
-        self.assertEqual(t_masked.ndim, 4)
-
-        self.assertEqual(t_mask.shape, (1, 1, 32, 32))
-        self.assertEqual(t_masked.shape, (1, 3, 32, 32))
-
-        self.assertTrue(t_mask.dtype == torch.float32)
-        self.assertTrue(t_masked.dtype == torch.float32)
-
-        self.assertTrue(t_mask.min() >= 0.0)
-        self.assertTrue(t_mask.max() <= 1.0)
-        self.assertTrue(t_masked.min() >= -1.0)
-        self.assertTrue(t_masked.max() <= 1.0)
-
-        self.assertTrue(t_mask.sum() > 0.0)
-
-    def test_np_inputs(self):
-        im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
-        im_pil = Image.fromarray(im_np)
-        mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
-        mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
-
-        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
-        t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
-
-        self.assertTrue((t_mask_np == t_mask_pil).all())
-        self.assertTrue((t_masked_np == t_masked_pil).all())
-
-    def test_torch_3D_2D_inputs(self):
-        im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
-        mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
-        im_np = im_tensor.numpy().transpose(1, 2, 0)
-        mask_np = mask_tensor.numpy()
-
-        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
-
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_3D_3D_inputs(self): - im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy().transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_2D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy() - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_3D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_4D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0][0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_batch_4D_3D(self): - im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (2, 32, 32), dtype=torch.uint8) > 127.5 - - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy() for mask in mask_tensor] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = torch.cat([n[0] for n in nps]) - t_masked_np = torch.cat([n[1] for n in nps]) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_batch_4D_4D(self): - im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (2, 1, 32, 32), dtype=torch.uint8) > 127.5 - - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy()[0] for mask in mask_tensor] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - nps = 
[prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = torch.cat([n[0] for n in nps]) - t_masked_np = torch.cat([n[1] for n in nps]) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_shape_mismatch(self): - # test height and width - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(3, 32, 32), torch.randn(64, 64)) - # test batch dim - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 64, 64)) - # test batch dim - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 1, 64, 64)) - - def test_type_mismatch(self): - # test tensors-only - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.rand(3, 32, 32).numpy()) - # test tensors-only - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32).numpy(), torch.rand(3, 32, 32)) - - def test_channels_first(self): - # test channels first for 3D tensors - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.rand(32, 32, 3), torch.rand(3, 32, 32)) - - def test_tensor_range(self): - # test im <= 1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.ones(3, 32, 32) * 2, torch.rand(32, 32)) - # test im >= -1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.ones(3, 32, 32) * (-2), torch.rand(32, 32)) - # test mask <= 1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2) - # test mask >= 0 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1) diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 15d94414ea2fa881a29d4876cb072da56492a0a0..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,538 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
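The prepare_mask_and_masked_image checks that close the previous file pin down a simple normalization contract. A sketch of that contract, restated from the assertions above (illustrative only, not part of the removed files):

import numpy as np
import torch
from PIL import Image
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
    prepare_mask_and_masked_image,
)

im = Image.fromarray(np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8))
mask = Image.fromarray(((np.random.rand(32, 32) > 0.5) * 255).astype(np.uint8))

mask_t, masked_t = prepare_mask_and_masked_image(im, mask)

# Both outputs are 4D float32 tensors: the mask is (1, 1, H, W) with values in [0, 1],
# the masked image is (1, 3, H, W) rescaled to [-1, 1], and PIL, numpy, and torch
# inputs of matching shapes all normalize to the same tensors.
assert mask_t.shape == (1, 1, 32, 32)
assert masked_t.shape == (1, 3, 32, 32)
assert 0.0 <= float(mask_t.min()) and float(mask_t.max()) <= 1.0
assert -1.0 <= float(masked_t.min()) and float(masked_t.max()) <= 1.0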
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, - UNet2DModel, - VQModel, -) -from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_cond_unet_inpaint(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_inpaint_legacy(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - negative_prompt = "french fries" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - prompt, - negative_prompt=negative_prompt, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): - device = "cpu" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = 
Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # test num_images_per_prompt=1 (default) - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (1, 32, 32, 3) - - # test num_images_per_prompt=1 (default) for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (batch_size, 32, 32, 3) - - # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (num_images_per_prompt, 32, 32, 3) - - # test num_images_per_prompt for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - -@slow -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_legacy_pndm(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5665, 0.6117, 0.6430, 0.4057, 0.4594, 0.5658, 0.1596, 0.3106, 0.4305]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_legacy_k_lms(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - 
pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4534, 0.4467, 0.4329, 0.4329, 0.4339, 0.4220, 0.4244, 0.4332, 0.4426]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_legacy_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5977, 1.5449, 1.0586, -0.3250, 0.7383, -0.0862, 0.4631, -0.2571, -1.1289]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5190, 1.1621, 0.6885, 0.2424, 0.3337, -0.1617, 0.6914, -0.1957, -0.5474]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - callback_fn.has_been_called = False - - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - -@nightly -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - 
image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py deleted file mode 100644 index 25b0c6ea1432972a6303423ea8517420a6ab9499..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ /dev/null @@ -1,350 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
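The removed suite below covers StableDiffusionInstructPix2PixPipeline, whose distinguishing knob is image_guidance_scale: classifier-free guidance on the source image in addition to the usual text guidance. A minimal sketch mirroring the suite's get_inputs(); a CUDA device is assumed:

import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
)

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
)
pipe.to("cuda")

# guidance_scale pulls the output toward the edit instruction, while
# image_guidance_scale anchors it to the input photo (the tests use 1.0).
edited = pipe(
    prompt="turn him into a cyborg",
    image=image,
    num_inference_steps=10,
    guidance_scale=7.5,
    image_guidance_scale=1.0,
    generator=torch.manual_seed(0),
).images[0]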
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInstructPix2PixPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInstructPix2PixPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=8, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "image_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.7318, 0.3723, 0.4662, 0.623, 0.5770, 0.5014, 0.4281, 0.5550, 0.4813]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "french fries"
-        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.7323, 0.3688, 0.4611, 0.6255, 0.5746, 0.5017, 0.433, 0.5553, 0.4827])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_multiple_init_images(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["prompt"] = [inputs["prompt"]] * 2
-
-        image = np.array(inputs["image"]).astype(np.float32) / 255.0
-        image = torch.from_numpy(image).unsqueeze(0).to(device)
-        image = image.permute(0, 3, 1, 2)
-        inputs["image"] = image.repeat(2, 1, 1, 1)
-
-        image = sd_pipe(**inputs).images
-        image_slice = image[-1, -3:, -3:, -1]
-
-        assert image.shape == (2, 32, 32, 3)
-        expected_slice = np.array([0.606, 0.5712, 0.5099, 0.598, 0.5805, 0.7205, 0.6793, 0.554, 0.5607])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_euler(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = EulerAncestralDiscreteScheduler(
-            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
-        )
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.726, 0.3902, 0.4868, 0.585, 0.5672, 0.511, 0.3906, 0.551, 0.4846])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-@slow
-@require_torch_gpu
-class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def get_inputs(self, seed=0):
-        generator = torch.manual_seed(seed)
-        image = load_image(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
-        )
-        inputs = {
-            "prompt": "turn him into a cyborg",
-            "image": image,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 7.5,
-            "image_guidance_scale": 1.0,
-            "output_type": "numpy",
-        }
-        return inputs
-
-    def test_stable_diffusion_pix2pix_default(self):
-        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-            "timbrooks/instruct-pix2pix", safety_checker=None
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        image = pipe(**inputs).images
-        image_slice =
image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5902, 0.6015, 0.6027, 0.5983, 0.6092, 0.6061, 0.5765, 0.5785, 0.5555]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_k_lms(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.6578, 0.6817, 0.6972, 0.6761, 0.6856, 0.6916, 0.6428, 0.6516, 0.6301]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_ddim(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3828, 0.3834, 0.3818, 0.3792, 0.3865, 0.3752, 0.3792, 0.3847, 0.3753]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.2463, -0.4644, -0.9756, 1.5176, 1.4414, 0.7866, 0.9897, 0.8521, 0.7983]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.2644, -0.4626, -0.9653, 1.5176, 1.4551, 0.7686, 0.9805, 0.8452, 0.8115]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is 
allocated - assert mem_bytes < 2.2 * 10**9 - - def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self): - inputs = self.get_inputs() - # resize to resolution that is divisible by 8 but not 16 or 32 - inputs["image"] = inputs["image"].resize((504, 504)) - - model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - output = pipe(**inputs) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 504, 3) - expected_slice = np.array([0.2726, 0.2529, 0.2664, 0.2655, 0.2641, 0.2642, 0.2591, 0.2649, 0.2590]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py deleted file mode 100644 index 546b1d21252c6ef84d9d181839b7976d3d376082..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
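The removed suite below covers StableDiffusionKDiffusionPipeline, which delegates sampling to the external k-diffusion library rather than a diffusers scheduler; samplers are selected by name with set_scheduler(). A minimal sketch built from the calls the tests make (assumes the k-diffusion package is installed and a CUDA device is available):

import torch
from diffusers import StableDiffusionKDiffusionPipeline

pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
pipe.to("cuda")

# Sampler names follow k-diffusion's conventions, e.g. "sample_euler" or "sample_dpmpp_2m".
pipe.set_scheduler("sample_dpmpp_2m")

image = pipe(
    ["A painting of a squirrel eating a burger"],
    generator=torch.manual_seed(0),
    guidance_scale=7.5,
    num_inference_steps=15,
    use_karras_sigmas=True,  # Karras noise schedule, as exercised by the sigmas test below
).images[0]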
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import StableDiffusionKDiffusionPipeline -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@slow -@require_torch_gpu -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_1(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_euler") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0447, 0.0492, 0.0468, 0.0408, 0.0383, 0.0408, 0.0354, 0.0380, 0.0339]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_2(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_euler") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1237, 0.1320, 0.1438, 0.1359, 0.1390, 0.1132, 0.1277, 0.1175, 0.1112]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-1 - - def test_stable_diffusion_karras_sigmas(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_dpmpp_2m") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=7.5, - num_inference_steps=15, - output_type="np", - use_karras_sigmas=True, - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.11381689, 0.12112921, 0.1389457, 0.12549606, 0.1244964, 0.10831517, 0.11562866, 0.10867816, 0.10499048] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py deleted file mode 100644 index 2d9b1e54ee6ebaddf8d6ca133ef322ed06853980..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - PNDMScheduler, - StableDiffusionModelEditingPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionModelEditingPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.5217179, 0.50658035, 0.5003239, 0.41109088, 0.3595158, 0.46607107, 0.5323504, 0.5335255, 0.49187922] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.546259, 0.5108156, 0.50897664, 0.41931948, 0.3748669, 0.4669299, 0.5427151, 0.54561913, 0.49353] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.47106352, 0.53579676, 0.45798016, 0.514294, 0.56856745, 0.4788605, 0.54380214, 0.5046455, 0.50404465] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - -@slow -@require_torch_gpu -class StableDiffusionModelEditingSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default(self): - model_ckpt = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - expected_slice = np.array( - [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 - - # make sure image changes after editing - pipe.edit_model("A pack of roses", "A pack of blue roses") - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(expected_slice - image_slice).max() > 1e-1 - - def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - model_ckpt = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionModelEditingPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 4.4 GB is allocated - assert mem_bytes < 4.4 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py deleted file mode 100644 index af26e19cca732ee3144bb38929949499d41f64b5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
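-
-# Background for the tests below: StableDiffusionPanoramaPipeline implements
-# MultiDiffusion-style panorama generation. Roughly, every denoising step runs
-# the UNet over overlapping windows of a wide latent and averages the
-# overlapping regions, which is why the slow tests further down expect
-# (1, 512, 2048, 3) outputs from a standard 512-resolution checkpoint.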
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPanoramaPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPanoramaPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.5101, 0.5006, 0.4962, 0.3995, 0.3501, 0.4632, 0.5339, 0.525, 0.4878]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.5326, 0.5009, 0.5074, 0.4133, 0.371, 0.464, 0.5432, 0.5429, 0.4896]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.48235387, 0.5423796, 0.46016198, 0.5377287, 0.5803722, 0.4876525, 0.5515428, 0.5045897, 0.50709957] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - -@slow -@require_torch_gpu -class StableDiffusionPanoramaSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default(self): - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 2048, 3) - - expected_slice = np.array( - [ - 0.36968392, - 0.27025372, - 0.32446766, - 0.28379387, - 0.36363274, - 0.30733347, - 0.27100027, - 0.27054125, - 0.25536096, - ] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_k_lms(self): - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 2048, 3) - - expected_slice = np.array( - [ - [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ] - ] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_panorama_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - - expected_slice = np.array( - [ - 0.18681869, - 0.33907816, - 0.5361276, - 0.14432865, - -0.02856611, - -0.73941123, - 0.23397987, - 0.47322682, - -0.37823164, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - - expected_slice = np.array( - [ - 0.18539645, - 0.33987248, - 0.5378559, - 0.14437142, - -0.02455261, - -0.7338317, - 0.23990755, - 0.47356272, - -0.3786505, - ] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def 
test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
-
-        model_ckpt = "stabilityai/stable-diffusion-2-base"
-        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
-        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
-
-        inputs = self.get_inputs()
-        _ = pipe(**inputs)
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 5.5 GB is allocated
-        assert mem_bytes < 5.5 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
deleted file mode 100644
index 46b93a0589ce1775e26921a6cc5dcdcf464c4b29..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ /dev/null
@@ -1,470 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
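-
-# Background for the tests below: StableDiffusionPix2PixZeroPipeline edits an
-# image without fine-tuning. The edit direction is derived from pre-computed
-# text embeddings for a source concept (e.g. "cat") and a target concept
-# (e.g. "dog"), while cross-attention guidance keeps the layout of the original
-# image intact. Real images are first mapped back to latents with
-# DDIMInverseScheduler via pipe.invert(), which the InversionPipelineSlowTests
-# further down exercise directly.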
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMInverseScheduler, - DDIMScheduler, - DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, - UNet2DConditionModel, -) -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPix2PixZeroPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - @classmethod - def setUpClass(cls): - cls.source_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - ) - - cls.target_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "inverse_scheduler": None, - "caption_generator": None, - "caption_processor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = 
np.array([0.5184, 0.503, 0.4917, 0.4022, 0.3455, 0.464, 0.5324, 0.5323, 0.4894]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5464, 0.5072, 0.5012, 0.4124, 0.3624, 0.466, 0.5413, 0.5468, 0.4927]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5114, 0.5051, 0.5222, 0.5279, 0.5037, 0.5156, 0.4604, 0.4966, 0.504]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_ddpm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = DDPMScheduler() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5185, 0.5027, 0.492, 0.401, 0.3445, 0.464, 0.5321, 0.5327, 0.4892]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - # Non-determinism caused by the scheduler optimizing the latent inputs during inference - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - -@slow -@require_torch_gpu -class StableDiffusionPix2PixZeroPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - cls.source_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt" - ) - - cls.target_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt" - ) - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - - inputs = { - "prompt": "turn him into a cyborg", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default(self): - pipe = 
StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5742, 0.5757, 0.5747, 0.5781, 0.5688, 0.5713, 0.5742, 0.5664, 0.5747]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_pix2pix_zero_k_lms(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.6367, 0.5459, 0.5146, 0.5479, 0.4905, 0.4753, 0.4961, 0.4629, 0.4624]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_pix2pix_zero_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.1345, 0.268, 0.1539, 0.0726, 0.0959, 0.2261, -0.2673, 0.0277, -0.2062]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.1393, 0.2637, 0.1617, 0.0724, 0.0987, 0.2271, -0.2666, 0.0299, -0.2104]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 8.2 GB is allocated - assert mem_bytes < 8.2 * 10**9 - - -@slow -@require_torch_gpu -class 
InversionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - raw_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png" - ) - - raw_image = raw_image.convert("RGB").resize((512, 512)) - - cls.raw_image = raw_image - - def test_stable_diffusion_pix2pix_inversion(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) - inv_latents = output[0] - - image_slice = inv_latents[0, -3:, -3:, -1].flatten() - - assert inv_latents.shape == (1, 4, 64, 64) - expected_slice = np.array([0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666]) - - assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2 - - def test_stable_diffusion_2_pix2pix_inversion(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) - inv_latents = output[0] - - image_slice = inv_latents[0, -3:, -3:, -1].flatten() - - assert inv_latents.shape == (1, 4, 64, 64) - expected_slice = np.array([0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050]) - - assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2 - - def test_stable_diffusion_pix2pix_full(self): - # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog.png - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.npy" - ) - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator) - inv_latents = output[0] - - source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] - target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] - - source_embeds = pipe.get_embeds(source_prompts) - target_embeds = pipe.get_embeds(target_prompts) - - image = pipe( - caption, - source_embeds=source_embeds, - target_embeds=target_embeds, - 
num_inference_steps=50,
-            cross_attention_guidance_amount=0.15,
-            generator=generator,
-            latents=inv_latents,
-            negative_prompt=caption,
-            output_type="np",
-        ).images
-
-        mean_diff = np.abs(expected_image - image).mean()
-        assert mean_diff < 0.05
-
-    def test_stable_diffusion_2_pix2pix_full(self):
-        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog_2.png
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog_2.npy"
-        )
-
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-
-        caption = "a photography of a cat with flowers"
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        output = pipe.invert(caption, image=self.raw_image, generator=generator)
-        inv_latents = output[0]
-
-        source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
-        target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
-
-        source_embeds = pipe.get_embeds(source_prompts)
-        target_embeds = pipe.get_embeds(target_prompts)
-
-        image = pipe(
-            caption,
-            source_embeds=source_embeds,
-            target_embeds=target_embeds,
-            num_inference_steps=125,
-            cross_attention_guidance_amount=0.015,
-            generator=generator,
-            latents=inv_latents,
-            negative_prompt=caption,
-            output_type="np",
-        ).images
-
-        mean_diff = np.abs(expected_image - image).mean()
-        assert mean_diff < 0.25
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
deleted file mode 100644
index abaefbcad0118cf494d10e6ba4c44638af9d285d..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
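-
-# Background for the tests below: StableDiffusionSAGPipeline augments
-# classifier-free guidance with self-attention guidance (SAG). In essence, the
-# model's own self-attention maps are used to blur salient regions and steer
-# denoising away from that degraded prediction; sag_scale controls the strength
-# of the correction, roughly analogous to how guidance_scale controls prompt
-# adherence.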
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - StableDiffusionSAGPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionSAGPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": ".", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 1.0, - "sag_scale": 1.0, - "output_type": "numpy", - } - return inputs - - -@slow -@require_torch_gpu -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." 
- generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2_non_square(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], - width=768, - height=512, - generator=generator, - guidance_scale=7.5, - sag_scale=1.0, - num_inference_steps=20, - output_type="np", - ) - - image = output.images - - assert image.shape == (1, 512, 768, 3) diff --git a/diffusers/tests/pipelines/stable_diffusion_2/__init__.py b/diffusers/tests/pipelines/stable_diffusion_2/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py deleted file mode 100644 index fa3c3d628e4f1ec74c6729db436e4f20c0e714c5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ /dev/null @@ -1,563 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
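-
-# Background for the tests below: these cover the plain text-to-image
-# StableDiffusionPipeline against the Stable Diffusion 2.x base checkpoints.
-# Scheduler variants are exercised by swapping the scheduler in place while
-# reusing its config, e.g.:
-#
-#     pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)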
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2PipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5649, 0.6022, 0.4804, 0.5270, 0.5585, 0.4643, 0.5159, 0.4963, 0.4793]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure 
determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5099, 0.5677, 0.4671, 0.5128, 0.5697, 0.4676, 0.5277, 0.4964, 0.4946]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4715, 0.5376, 0.4569, 0.5224, 0.5734, 0.4797, 0.5465, 0.5074, 0.5046]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with 
CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 - - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - -@slow -@require_torch_gpu -class StableDiffusion2PipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_default_ddim(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_pndm(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_k_lms(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10440, 0.13115, 0.11100, 0.10141, 0.11440, 0.07215, 0.11332, 0.09693, 0.10006]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # enable attention slicing - pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, 
dtype=torch.float16) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.3 GB is allocated - assert mem_bytes < 3.3 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - - # make sure that more than 3.3 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.3 * 10**9 - assert np.abs(image_sliced - image).max() < 1e-3 - - def test_stable_diffusion_text2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.3862, -0.4507, -1.1729, 0.0686, -1.1045, 0.7124, -1.8301, 0.1903, 1.2773] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [0.2720, -0.1863, -0.7383, -0.5029, -0.7534, 0.3970, -0.7646, 0.4468, 1.2686] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but don't move to cuda - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - 
torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device, dtype=torch.float16) - outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 - assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3 * 10**9 - for module in pipe.text_encoder, pipe.unet, pipe.vae: - assert module.device == torch.device("cpu") - - # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_attention_slicing() - _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() - assert mem_bytes_slicing < mem_bytes_offloaded - - -@nightly -@require_torch_gpu -class StableDiffusion2PipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_2_0_default_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_0_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_2_1_default_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - 
image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py deleted file mode 100644 index 780abf304a469ddefbe35d5f5132367fe3c8213d..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ /dev/null @@ -1,178 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
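-
-# Background for the tests below: StableDiffusionAttendAndExcitePipeline
-# optimizes the latents during the early denoising steps so that the
-# cross-attention maps of the tokens listed in token_indices stay active,
-# nudging every named subject to actually appear. max_iter_to_alter bounds how
-# many steps apply this optimization, and thresholds maps a step index to the
-# minimum attention mass required at that step.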
-
-import gc
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    StableDiffusionAttendAndExcitePipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils import load_numpy, skip_mps, slow
-from diffusers.utils.testing_utils import require_torch_gpu
-
-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
-
-
-@skip_mps
-class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = StableDiffusionAttendAndExcitePipeline
-    test_attention_slicing = False
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-            # SD2-specific config below
-            attention_head_dim=(2, 4),
-            use_linear_projection=True,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=128,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-            # SD2-specific config below
-            hidden_act="gelu",
-            projection_dim=512,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": None,
-            "feature_extractor": None,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "a cat and a frog",
-            "token_indices": [2, 5],
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "numpy",
-            "max_iter_to_alter": 2,
-            "thresholds": {0: 0.7},
-        }
-        return inputs
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        self.assertEqual(image.shape, (1, 64, 64, 3))
-        expected_slice = np.array(
-            [0.5644937, 0.60543084, 0.48239064, 0.5206757, 0.55623394, 0.46045133, 0.5100435, 0.48919064, 0.4759359]
-        )
-        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
-        self.assertLessEqual(max_diff, 1e-3)
-
-    def test_inference_batch_consistent(self):
-        # NOTE: Larger batch sizes cause this test to time out, so only test on smaller batches.
-        self._test_inference_batch_consistent(batch_sizes=[2, 4])
-
-
-@require_torch_gpu
-@slow
-class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_attend_and_excite_fp16(self):
-        generator = torch.manual_seed(51)
-
-        pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.to("cuda")
-
-        prompt = "a painting of an elephant with glasses"
-        token_indices = [5, 7]
-
-        image = pipe(
-            prompt=prompt,
-            token_indices=token_indices,
-            guidance_scale=7.5,
-            generator=generator,
-            num_inference_steps=5,
-            max_iter_to_alter=5,
-            output_type="numpy",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
-        )
-        assert np.abs(expected_image - image).max() < 5e-1
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
deleted file mode 100644
index c2ad239f6888027a0e39c844c826f9482770b754..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
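# In the Attend-and-Excite integration test above, `token_indices=[5, 7]` names the
# positions of "elephant" and "glasses" in the CLIP-tokenized prompt (position 0 is
# the BOS token); the pipeline strengthens cross-attention on those tokens during
# denoising. A hedged sketch of how such indices can be located; the tokenizer
# checkpoint is an assumption, chosen to match the SD v1-4 text encoder:
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
ids = tokenizer("a painting of an elephant with glasses").input_ids
tokens = [tokenizer.decode([i]) for i in ids]
token_indices = [i for i, tok in enumerate(tokens) if tok.strip() in ("elephant", "glasses")]
# expected: [5, 7] for this prompt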
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - DPTConfig, - DPTFeatureExtractor, - DPTForDepthEstimation, -) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionDepth2ImgPipeline, - UNet2DConditionModel, -) -from diffusers.utils import ( - floats_tensor, - is_accelerate_available, - is_accelerate_version, - load_image, - load_numpy, - nightly, - slow, - torch_device, -) -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionDepth2ImgPipeline - test_save_load_optional_components = False - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=5, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - backbone_config = { - "global_padding": "same", - "layer_type": "bottleneck", - "depths": [3, 4, 9], - "out_features": ["stage1", "stage2", "stage3"], - "embedding_dynamic_padding": True, - "hidden_sizes": [96, 192, 384, 768], - "num_groups": 2, - } - depth_estimator_config = DPTConfig( - image_size=32, - patch_size=16, - num_channels=3, - hidden_size=32, - num_hidden_layers=4, - backbone_out_indices=(0, 1, 2, 3), - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - is_decoder=False, - initializer_range=0.02, - is_hybrid=True, - backbone_config=backbone_config, - backbone_featmap_shape=[1, 384, 24, 24], - ) - depth_estimator = DPTForDepthEstimation(depth_estimator_config) - feature_extractor = DPTFeatureExtractor.from_pretrained( - "hf-internal-testing/tiny-random-DPTForDepthEstimation" - ) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "depth_estimator": depth_estimator, - "feature_extractor": feature_extractor, - } - return 
components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 1e-4) - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_save_load_float16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for name, component in pipe_loaded.components.items(): - if hasattr(component, "dtype"): - self.assertTrue( - component.dtype == torch.float16, - f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 2e-2, "The output of the fp16 pipeline changed after saving and loading.") - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_float16_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.half() - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(torch_device) - pipe_fp16.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] - - max_diff = np.abs(output - output_fp16).max() - self.assertLess(max_diff, 1.3e-2, "The outputs of the fp16 and fp32 pipelines are too different.") - - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", - ) - def test_cpu_offload_forward_pass(self): - components = self.get_dummy_components() 
- pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_sequential_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results") - - def test_dict_tuple_outputs_equivalent(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0] - - max_diff = np.abs(output - output_tuple).max() - self.assertLess(max_diff, 1e-4) - - def test_progress_bar(self): - super().test_progress_bar() - - def test_stable_diffusion_depth2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - if torch_device == "mps": - expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546]) - else: - expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_depth2img_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - if torch_device == "mps": - expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335]) - else: - expected_slice = np.array([0.6296, 0.5125, 0.3890, 0.4456, 0.5955, 0.4621, 0.3810, 0.5310, 0.4626]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_depth2img_multiple_init_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = 2 * [inputs["image"]] - image = pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 32, 32, 3) - - if torch_device == "mps": - expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551]) - else: - expected_slice = np.array([0.6267, 0.5232, 0.6001, 0.6738, 0.5029, 0.6429, 0.5364, 0.4159, 0.4674]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def 
test_stable_diffusion_depth2img_pil(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - if torch_device == "mps": - expected_slice = np.array([0.53232, 0.47015, 0.40868, 0.45651, 0.4891, 0.4668, 0.4287, 0.48822, 0.47439]) - else: - expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_depth2img_pipeline_default(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.8260, 0.7747, 0.7421]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_depth2img_pipeline_k_lms(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.6370, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_depth2img_pipeline_ddim(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.6420, 0.6522, 0.6555, 0.6436]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def 
test_stable_diffusion_depth2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 60, 80) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.7168, -1.5137, -0.1418, -2.9219, -2.7266, -2.4414, -2.1035, -3.0078, -1.7051] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 60, 80) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.7109, -1.5068, -0.1403, -2.9160, -2.7207, -2.4414, -2.1035, -3.0059, -1.7090] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.9 GB is allocated - assert mem_bytes < 2.9 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_depth2img_pndm(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_depth2img_ddim(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = 
self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_lms(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_dpm(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - inputs["num_inference_steps"] = 30 - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py deleted file mode 100644 index 8db8ec7810068aab4517fe2066e3fab10a52f6f7..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
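# The Flax integration tests in the file below use JAX's standard data-parallel
# recipe: replicate the parameters across devices, shard one prompt per device, and
# split the PRNG key so every device draws different noise. A minimal sketch of that
# setup (the prompt and step count are illustrative):
import jax
import jax.numpy as jnp
from flax.jax_utils import replicate
from flax.training.common_utils import shard

from diffusers import FlaxStableDiffusionPipeline

pipe, params = FlaxStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2", revision="bf16", dtype=jnp.bfloat16
)
prompt_ids = pipe.prepare_inputs(["a painting of a squirrel"] * jax.device_count())
params = replicate(params)  # one full copy of the weights per device
prompt_ids = shard(prompt_ids)  # leading batch axis split across devices
rng = jax.random.split(jax.random.PRNGKey(0), jax.device_count())
images = pipe(prompt_ids, params, rng, num_inference_steps=25, jit=True)[0]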
- -import gc -import unittest - -from diffusers import FlaxDPMSolverMultistepScheduler, FlaxStableDiffusionPipeline -from diffusers.utils import is_flax_available, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusion2PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_stable_diffusion_flax(self): - sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", - revision="bf16", - dtype=jnp.bfloat16, - ) - - prompt = "A painting of a squirrel eating a burger" - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = sd_pipe.prepare_inputs(prompt) - - params = replicate(params) - prompt_ids = shard(prompt_ids) - - prng_seed = jax.random.PRNGKey(0) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - - images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0] - assert images.shape == (jax.device_count(), 1, 768, 768, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array([0.4238, 0.4414, 0.4395, 0.4453, 0.4629, 0.4590, 0.4531, 0.45508, 0.4512]) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 - - def test_stable_diffusion_dpm_flax(self): - model_id = "stabilityai/stable-diffusion-2" - scheduler, scheduler_params = FlaxDPMSolverMultistepScheduler.from_pretrained(model_id, subfolder="scheduler") - sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained( - model_id, - scheduler=scheduler, - revision="bf16", - dtype=jnp.bfloat16, - ) - params["scheduler"] = scheduler_params - - prompt = "A painting of a squirrel eating a burger" - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = sd_pipe.prepare_inputs(prompt) - - params = replicate(params) - prompt_ids = shard(prompt_ids) - - prng_seed = jax.random.PRNGKey(0) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - - images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0] - assert images.shape == (jax.device_count(), 1, 768, 768, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array([0.4336, 0.42969, 0.4453, 0.4199, 0.4297, 0.4531, 0.4434, 0.4434, 0.4297]) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py deleted file mode 100644 index 432619a79ddd32d288893e3021a14ab6893b370a..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py +++ /dev/null @@ -1,82 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -from diffusers import FlaxStableDiffusionInpaintPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_stable_diffusion_inpaint_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - - model_id = "xvjiarui/stable-diffusion-2-inpainting" - pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 50 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - init_image = num_samples * [init_image] - mask_image = num_samples * [mask_image] - prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - prompt_ids = shard(prompt_ids) - processed_masked_images = shard(processed_masked_images) - processed_masks = shard(processed_masks) - - output = pipeline( - prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True - ) - - images = output.images.reshape(num_samples, 512, 512, 3) - - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [0.3611307, 0.37649736, 0.3757408, 0.38213953, 0.39295167, 0.3841631, 0.41554978, 0.4137475, 0.4217084] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py deleted file mode 100644 index ee059314904fc31e748e99db6674c76190530ef7..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,255 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, slow - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": 
"numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4727, 0.5735, 0.3941, 0.5446, 0.5926, 0.4394, 0.5062, 0.4654, 0.4476]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_inpaint_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint" - "/yellow_cat_sitting_on_a_park_bench.npy" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_inpaint_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint" - "/yellow_cat_sitting_on_a_park_bench_fp16.npy" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 5e-1 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = 
load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, - safety_checker=None, - scheduler=pndm, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - _ = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.65 GB is allocated - assert mem_bytes < 2.65 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py deleted file mode 100644 index 38f4b053714bb048f412883b10cb12bfbb010e93..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ /dev/null @@ -1,229 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
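# The sequential-offload test above bounds peak VRAM by combining
# `enable_sequential_cpu_offload()` (weights are moved onto the GPU one submodule at
# a time) with `enable_attention_slicing(1)` (attention computed in single-head
# slices), then reads the high-water mark from CUDA's allocator. A minimal sketch of
# that measurement, assuming the same inpainting checkpoint as the test; the dummy
# blank image and mask are illustrative stand-ins for real inputs:
import torch
from PIL import Image

from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()  # submodules are paged onto the GPU on demand

torch.cuda.reset_peak_memory_stats()
init_image = Image.new("RGB", (512, 512))
mask_image = Image.new("L", (512, 512), 255)  # inpaint everywhere
_ = pipe("Face of a yellow cat", image=init_image, mask_image=mask_image, num_inference_steps=2)
mem_bytes = torch.cuda.max_memory_allocated()  # peak allocation during the call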
- -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - EulerDiscreteScheduler, - StableDiffusionLatentUpscalePipeline, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionLatentUpscalePipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "height", - "width", - "cross_attention_kwargs", - "negative_prompt_embeds", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - test_cpu_offload = True - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 4 - sizes = (16, 16) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - def get_dummy_components(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - act_fn="gelu", - attention_head_dim=8, - norm_num_groups=None, - block_out_channels=[32, 32, 64, 64], - time_cond_proj_dim=160, - conv_in_kernel=1, - conv_out_kernel=1, - cross_attention_dim=32, - down_block_types=( - "KDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - ), - in_channels=8, - mid_block_type=None, - only_cross_attention=False, - out_channels=5, - resnet_time_scale_shift="scale_shift", - time_embedding_type="fourier", - timestep_post_act="gelu", - up_block_types=("KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KUpBlock2D"), - ) - vae = AutoencoderKL( - block_out_channels=[32, 32, 64, 64], - in_channels=3, - out_channels=3, - down_block_types=[ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - ], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - scheduler = EulerDiscreteScheduler(prediction_type="sample") - text_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="quick_gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": model.eval(), - "vae": vae.eval(), - "scheduler": scheduler, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": self.dummy_image.cpu(), - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = 
self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        self.assertEqual(image.shape, (1, 256, 256, 3))
-        expected_slice = np.array(
-            [0.47222412, 0.41921633, 0.44717434, 0.46874192, 0.42588258, 0.46150726, 0.4677534, 0.45583832, 0.48579055]
-        )
-        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
-        self.assertLessEqual(max_diff, 1e-3)
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(relax_max_difference=False)
-
-
-@require_torch_gpu
-@slow
-class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_latent_upscaler_fp16(self):
-        generator = torch.manual_seed(33)
-
-        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-        pipe.to("cuda")
-
-        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
-            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
-        )
-        upscaler.to("cuda")
-
-        prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
-
-        low_res_latents = pipe(prompt, generator=generator, output_type="latent").images
-
-        image = upscaler(
-            prompt=prompt,
-            image=low_res_latents,
-            num_inference_steps=20,
-            guidance_scale=0,
-            generator=generator,
-            output_type="np",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy"
-        )
-        assert np.abs(expected_image - image).mean() < 5e-2
-
-    def test_latent_upscaler_fp16_image(self):
-        generator = torch.manual_seed(33)
-
-        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
-            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
-        )
-        upscaler.to("cuda")
-
-        prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
-
-        low_res_img = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_512.png"
-        )
-
-        image = upscaler(
-            prompt=prompt,
-            image=low_res_img,
-            num_inference_steps=20,
-            guidance_scale=0,
-            generator=generator,
-            output_type="np",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy"
-        )
-        assert np.abs(expected_image - image).max() < 5e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
deleted file mode 100644
index b8e7b858130bfd7ce9d8189d30a71cdd86e00b7e..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet_upscale(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 32, 64), - layers_per_block=2, - sample_size=32, - in_channels=7, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=8, - use_linear_projection=True, - only_cross_attention=(True, True, False), - num_class_embeds=100, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - return CLIPTextModel(config) - - def test_stable_diffusion_upscale(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - - image = output.images - - generator = 
torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, 3) - expected_slice = np.array([0.2562, 0.3606, 0.4204, 0.4469, 0.4822, 0.4647, 0.5315, 0.5748, 0.5606]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_upscale_batch(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - output = sd_pipe( - 2 * [prompt], - image=2 * [low_res_image], - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - num_images_per_prompt=2, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_upscale_fp16(self): - """Test that stable diffusion upscale works with fp16""" - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # put models in fp16, except vae as it overflows in fp16 - unet = unet.half() - text_encoder = text_encoder.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ).images - - expected_height_width = low_res_image.size[0] * 4 - assert 
image.shape == (1, expected_height_width, expected_height_width, 3) - - -@slow -@require_torch_gpu -class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_upscale_pipeline(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale" - "/upsampled_cat.npy" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_upscale_pipeline_fp16(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale" - "/upsampled_cat_fp16.npy" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 5e-1 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - _ = pipe( - prompt=prompt, - image=image, - generator=generator, - num_inference_steps=5, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.9 GB is allocated - assert mem_bytes < 2.9 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py deleted file mode 100644 index 8aab5845741c638d2d93a28f1a23616086adbddb..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ /dev/null @@ -1,481 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import time -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerDiscreteScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=64, - ) - return CLIPTextModel(config) - - def test_stable_diffusion_v_pred_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = 
torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.6424, 0.6109, 0.494, 0.5088, 0.4984, 0.4525, 0.5059, 0.5068, 0.4474]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = EulerDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction" - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4616, 0.5184, 0.4887, 0.5111, 0.4839, 0.48, 0.5119, 0.5263, 0.4776]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_v_pred_fp16(self): - """Test that stable diffusion v-prediction works with fp16""" - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_torch_gpu -class 
StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_v_pred_default(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.1868, 0.1922, 0.1527, 0.1921, 0.1908, 0.1624, 0.1779, 0.1652, 0.1734]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_upcast_attention(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16 - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.4209, 0.4087, 0.4097, 0.4209, 0.3860, 0.4329, 0.4280, 0.4324, 0.4187]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_v_pred_euler(self): - scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - - output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy") - image = output.images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.1781, 0.1695, 0.1661, 0.1705, 0.1588, 0.1699, 0.2005, 0.1589, 0.1677]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_dpm(self): - """ - TODO: update this test after making DPM compatible with V-prediction! 
- """ - scheduler = DPMSolverMultistepScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler" - ) - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "a photograph of an astronaut riding a horse" - generator = torch.manual_seed(0) - image = sd_pipe( - [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.3303, 0.3184, 0.3291, 0.3300, 0.3256, 0.3113, 0.2965, 0.3134, 0.3192]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_attention_slicing_v_pred(self): - torch.cuda.reset_peak_memory_stats() - model_id = "stabilityai/stable-diffusion-2" - pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photograph of an astronaut riding a horse" - - # make attention efficient - pipe.enable_attention_slicing() - generator = torch.manual_seed(0) - output_chunked = pipe( - [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy" - ) - image_chunked = output_chunked.images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 5.5 GB is allocated - assert mem_bytes < 5.5 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - generator = torch.manual_seed(0) - output = pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy") - image = output.images - - # make sure that more than 5.5 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 5.5 * 10**9 - assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3 - - def test_stable_diffusion_text2img_pipeline_v_pred_default(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "sd2-text2img/astronaut_riding_a_horse_v_pred.npy" - ) - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - pipe.to(torch_device) - pipe.enable_attention_slicing() - pipe.set_progress_bar_config(disable=None) - - prompt = "astronaut riding a horse" - - generator = torch.manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (768, 768, 3) - assert np.abs(expected_image - image).max() < 7.5e-2 - - def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy" - ) - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "astronaut riding a horse" - - generator = torch.manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (768, 768, 3) - assert np.abs(expected_image - image).max() < 7.5e-1 - - def 
test_stable_diffusion_text2img_intermediate_state_v_pred(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.7749, 0.0325, 0.5088, 0.1619, 0.3372, 0.3667, -0.5186, 0.6860, 1.4326]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 19: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([1.3887, 1.0273, 1.7266, 0.0726, 0.6611, 0.1598, -1.0547, 0.1522, 0.0227]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - test_callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Andromeda galaxy in a bottle" - - generator = torch.manual_seed(0) - pipe( - prompt=prompt, - num_inference_steps=20, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 20 - - def test_stable_diffusion_low_cpu_mem_usage_v_pred(self): - pipeline_id = "stabilityai/stable-diffusion-2" - - start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipeline_id = "stabilityai/stable-diffusion-2" - prompt = "Andromeda galaxy in a bottle" - - pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.enable_attention_slicing(1) - pipeline.enable_sequential_cpu_offload() - - generator = torch.manual_seed(0) - _ = pipeline(prompt, generator=generator, num_inference_steps=5) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py b/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py deleted file mode 100644 index 2f393a66d166ef80328af8fbb077013e09b1408d..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ /dev/null @@ -1,439 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_safe_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = 
"A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_torch_gpu -class SafeDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_harm_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = ( - "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle" - " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with" - " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and" - " children from bahnhof zoo, detailed " - ) - seed = 4003660346 - guidance_scale = 7 - - # without safety guidance (sld_guidance_scale = 0) - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # without safety guidance (strong configuration) - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_nudity_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "padme amidala taking a bath artwork, safe for work, no nudity" - seed = 2734971755 - guidance_scale = 7 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, 
-1] - expected_slice = [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_nudity_safetychecker_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = ( - "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c." - " leyendecker" - ) - seed = 1044355234 - guidance_scale = 12 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]) - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_unclip/__init__.py b/diffusers/tests/pipelines/stable_unclip/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py deleted file mode 100644 index 368ab21f24a91df7ff17ae8bf69a1acdfa949fab..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py +++ /dev/null @@ -1,229 +0,0 @@ -import gc -import unittest - -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - PriorTransformer, - StableUnCLIPPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, 
assert_mean_pixel_difference - - -class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false - test_xformers_attention = False - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = embedder_hidden_size - - # prior components - - torch.manual_seed(0) - prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - prior_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=embedder_projection_dim, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - - torch.manual_seed(0) - prior = PriorTransformer( - num_attention_heads=2, - attention_head_dim=12, - embedding_dim=embedder_projection_dim, - num_layers=1, - ) - - torch.manual_seed(0) - prior_scheduler = DDPMScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample=True, - clip_sample_range=5.0, - beta_schedule="squaredcos_cap_v2", - ) - - # regular denoising components - - torch.manual_seed(0) - image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") - - torch.manual_seed(0) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - text_encoder = CLIPTextModel( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - - torch.manual_seed(0) - unet = UNet2DConditionModel( - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - block_out_channels=(32, 64), - attention_head_dim=(2, 4), - class_embed_type="projection", - # The class embeddings are the noise augmented image embeddings. - # I.e. 
the image embeddings concatenated with the noised embeddings of the same dimension
-            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
-            cross_attention_dim=embedder_hidden_size,
-            layers_per_block=1,
-            upcast_attention=True,
-            use_linear_projection=True,
-        )
-
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_schedule="scaled_linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            prediction_type="v_prediction",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKL()
-
-        components = {
-            # prior components
-            "prior_tokenizer": prior_tokenizer,
-            "prior_text_encoder": prior_text_encoder,
-            "prior": prior,
-            "prior_scheduler": prior_scheduler,
-            # image noising components
-            "image_normalizer": image_normalizer,
-            "image_noising_scheduler": image_noising_scheduler,
-            # regular denoising components
-            "tokenizer": tokenizer,
-            "text_encoder": text_encoder,
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "prior_num_inference_steps": 2,
-            "output_type": "numpy",
-        }
-        return inputs
-
-    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
-    # because UnCLIP GPU undeterminism requires a looser check.
-    def test_attention_slicing_forward_pass(self):
-        test_max_difference = torch_device == "cpu"
-
-        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
-
-    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
-    # because UnCLIP undeterminism requires a looser check.
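    # (A "looser check" in these overrides means dropping the strict element-wise
    # max-difference assertion in favor of an average-error comparison, in the
    # spirit of the assert_mean_pixel_difference helper imported above. As a
    # rough sketch, assuming two images as float arrays on a 0-255 scale and a
    # tolerance of 10, which may differ from the helper's actual default:
    #
    #     avg_diff = np.abs(image.astype(np.float32) - expected.astype(np.float32)).mean()
    #     assert avg_diff < 10, f"images deviate by {avg_diff} on average"
    #
    # Averaging over all pixels absorbs the scattered outlier values that GPU
    # non-determinism produces while still catching genuinely wrong outputs.)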
- def test_inference_batch_single_identical(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - -@slow -@require_torch_gpu -class StableUnCLIPPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_unclip(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe("anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - "anime turtle", - prior_num_inference_steps=2, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py deleted file mode 100644 index f93fa3a59014498238591dbf158c09d319d5ad60..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ /dev/null @@ -1,282 +0,0 @@ -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableUnCLIPImg2ImgPipeline, UNet2DConditionModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, load_image, load_numpy, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import ( - PipelineTesterMixin, - assert_mean_pixel_difference, -) - - -class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = 
embedder_hidden_size
-
-        # image encoding components
-
-        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
-
-        image_encoder = CLIPVisionModelWithProjection(
-            CLIPVisionConfig(
-                hidden_size=embedder_hidden_size,
-                projection_dim=embedder_projection_dim,
-                num_hidden_layers=5,
-                num_attention_heads=4,
-                image_size=32,
-                intermediate_size=37,
-                patch_size=1,
-            )
-        )
-
-        # regular denoising components
-
-        torch.manual_seed(0)
-        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
-        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
-
-        torch.manual_seed(0)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        torch.manual_seed(0)
-        text_encoder = CLIPTextModel(
-            CLIPTextConfig(
-                bos_token_id=0,
-                eos_token_id=2,
-                hidden_size=embedder_hidden_size,
-                projection_dim=32,
-                intermediate_size=37,
-                layer_norm_eps=1e-05,
-                num_attention_heads=4,
-                num_hidden_layers=5,
-                pad_token_id=1,
-                vocab_size=1000,
-            )
-        )
-
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
-            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
-            block_out_channels=(32, 64),
-            attention_head_dim=(2, 4),
-            class_embed_type="projection",
-            # The class embeddings are the noise augmented image embeddings.
-            # I.e. the image embeddings concatenated with the noised embeddings of the same dimension
-            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
-            cross_attention_dim=embedder_hidden_size,
-            layers_per_block=1,
-            upcast_attention=True,
-            use_linear_projection=True,
-        )
-
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_schedule="scaled_linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            prediction_type="v_prediction",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKL()
-
-        components = {
-            # image encoding components
-            "feature_extractor": feature_extractor,
-            "image_encoder": image_encoder,
-            # image noising components
-            "image_normalizer": image_normalizer,
-            "image_noising_scheduler": image_noising_scheduler,
-            # regular denoising components
-            "tokenizer": tokenizer,
-            "text_encoder": text_encoder,
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0, pil_image=True):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-
-        if pil_image:
-            input_image = input_image * 0.5 + 0.5
-            input_image = input_image.clamp(0, 1)
-            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
-            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
-
-        return {
-            "prompt": "An anime racoon running a marathon",
-            "image": input_image,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-
-    def test_image_embeds_none(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableUnCLIPImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs.update({"image_embeds": None})
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-
expected_slice = np.array( - [0.34588397, 0.7747054, 0.5453714, 0.5227859, 0.57656777, 0.6532228, 0.5177634, 0.49932978, 0.56626225] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because GPU undeterminism requires a looser check. - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because undeterminism requires a looser check. - def test_inference_batch_single_identical(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) - - -@slow -@require_torch_gpu -class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_unclip_l_img2img(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_img2img_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe(input_image, "anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_h_img2img(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_h_img2img_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe(input_image, "anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_img2img_pipeline_with_sequential_cpu_offloading(self): - input_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - input_image, - "anime turtle", - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/test_pipeline_utils.py b/diffusers/tests/pipelines/test_pipeline_utils.py deleted file mode 100644 index 51d987d8bb1151862f910822eb2c173ce4ff313c..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/test_pipeline_utils.py +++ /dev/null @@ -1,134 +0,0 @@ -import unittest - -from diffusers.pipelines.pipeline_utils import is_safetensors_compatible - - -class IsSafetensorsCompatibleTests(unittest.TestCase): - def test_all_is_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_compatible(self): - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - # Removed: 'unet/diffusion_pytorch_model.safetensors', - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_compatible(self): - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - # Removed: 'text_encoder/model.safetensors', - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_all_is_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant(self): - filenames = [ - 
"unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - # Removed: 'unet/diffusion_pytorch_model.fp16.safetensors', - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant(self): - filenames = [ - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - # 'text_encoder/model.fp16.safetensors', - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) diff --git a/diffusers/tests/pipelines/text_to_video/__init__.py b/diffusers/tests/pipelines/text_to_video/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/text_to_video/test_text_to_video.py b/diffusers/tests/pipelines/text_to_video/test_text_to_video.py deleted file mode 100644 index e4331fda02ff6511a4b0d5cb7a49c1212129bbe2..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/text_to_video/test_text_to_video.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - TextToVideoSDPipeline, - UNet3DConditionModel, -) -from diffusers.utils import load_numpy, skip_mps, slow - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = TextToVideoSDPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - # No `output_type`. - required_optional_params = frozenset( - [ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet3DConditionModel( - block_out_channels=(32, 64, 64, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=32, - attention_head_dim=4, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pt", - } - return inputs - - def test_text_to_video_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = TextToVideoSDPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["output_type"] = "np" - frames = sd_pipe(**inputs).frames - image_slice = frames[0][-3:, -3:, -1] - - assert frames[0].shape == (64, 64, 3) - expected_slice = np.array([166, 184, 167, 118, 102, 123, 108, 93, 114]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - # (todo): 
sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_num_images_per_prompt(self): - pass - - def test_progress_bar(self): - return super().test_progress_bar() - - -@slow -@skip_mps -class TextToVideoSDPipelineSlowTests(unittest.TestCase): - def test_full_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy" - ) - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to("cuda") - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pt").frames - video = video_frames.cpu().numpy() - - assert np.abs(expected_video - video).mean() < 5e-2 - - def test_two_step_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy" - ) - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe = pipe.to("cuda") - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames - video = video_frames.cpu().numpy() - - assert np.abs(expected_video - video).mean() < 5e-2 diff --git a/diffusers/tests/pipelines/unclip/__init__.py b/diffusers/tests/pipelines/unclip/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/unclip/test_unclip.py b/diffusers/tests/pipelines/unclip/test_unclip.py deleted file mode 100644 index c36fb02b190f271d57eca0c54a94a19acad0faf3..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/unclip/test_unclip.py +++ /dev/null @@ -1,498 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
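The two slow text-to-video tests above reduce a whole generated clip to a single scalar, the mean absolute difference against a recorded .npy reference, with a 5e-2 tolerance. Pulled out as a standalone helper, the comparison looks roughly like this (a sketch assuming float frames scaled to [0, 1]; the name and signature are illustrative, not part of the test utilities):

    import numpy as np

    def assert_video_close(video, expected_video, max_mean_abs_diff=5e-2):
        # both inputs are (frames, height, width, channels) arrays
        video = np.asarray(video, dtype=np.float32)
        expected_video = np.asarray(expected_video, dtype=np.float32)
        assert video.shape == expected_video.shape, f"shape mismatch: {video.shape} vs {expected_video.shape}"
        mean_diff = np.abs(expected_video - video).mean()
        assert mean_diff < max_mean_abs_diff, f"mean abs diff {mean_diff} exceeds {max_mean_abs_diff}"

Averaging over every pixel of every frame tolerates per-frame scheduler and GPU noise while still failing on a clip that actually diverges from the reference.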
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - "guidance_scale", - "prompt_embeds", - "cross_attention_kwargs", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = [ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_prior(self): - torch.manual_seed(0) - - model_kwargs = { - "num_attention_heads": 2, - "attention_head_dim": 12, - "embedding_dim": self.text_embedder_hidden_size, - "num_layers": 1, - } - - model = PriorTransformer(**model_kwargs) - return model - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": 
("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - prior = self.dummy_prior - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - prior_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample_range=5.0, - ) - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - components = { - "prior": prior, - "decoder": decoder, - "text_proj": text_proj, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "prior_scheduler": prior_scheduler, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - "generator": generator, - "prior_num_inference_steps": 2, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_unclip(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe( - **self.get_dummy_inputs(device), - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9988, - 0.0028, - 0.9997, - 0.9984, - 0.9965, - 0.0029, - 0.9986, - 0.0025, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_passed_text_embed(self): - device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - prior = components["prior"] - decoder = components["decoder"] - super_res_first = components["super_res_first"] - tokenizer = components["tokenizer"] - text_encoder = components["text_encoder"] - - generator = torch.Generator(device=device).manual_seed(0) - dtype = prior.dtype - batch_size = 1 - - shape = (batch_size, prior.config.embedding_dim) - prior_latents = 
pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = (batch_size, decoder.in_channels, decoder.sample_size, decoder.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - super_res_first.in_channels // 2, - super_res_first.sample_size, - super_res_first.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipe.set_progress_bar_config(disable=None) - - prompt = "this is a prompt example" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe( - [prompt], - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - output_type="np", - ) - image = output.images - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - return_tensors="pt", - ) - text_model_output = text_encoder(text_inputs.input_ids) - text_attention_mask = text_inputs.attention_mask - - generator = torch.Generator(device=device).manual_seed(0) - image_from_text = pipe( - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - text_model_output=text_model_output, - text_attention_mask=text_attention_mask, - output_type="np", - )[0] - - # make sure passing text embeddings manually is identical - assert np.abs(image - image_from_text).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU non-determinism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP non-determinism requires a looser check.
- @skip_mps - def test_inference_batch_single_identical(self): - test_max_difference = torch_device == "cpu" - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - -@nightly -class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_karlo_cpu_fp32(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_cpu.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") - pipeline.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipeline( - "horse", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 1e-1 - - -@slow -@require_torch_gpu -class UnCLIPPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_karlo(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_fp16.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - "horse", - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_unclip_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - "horse", - num_images_per_prompt=1, - 
prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py deleted file mode 100644 index ff32ac5f9aafb9140ec5b49dc79711d493b76788..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py +++ /dev/null @@ -1,508 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import ( - DiffusionPipeline, - UnCLIPImageVariationPipeline, - UnCLIPScheduler, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps - -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - required_optional_params = [ - "generator", - "return_dict", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_image_encoder(self): - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - 
intermediate_size=37, - patch_size=1, - ) - return CLIPVisionModelWithProjection(config) - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - image_encoder = self.dummy_image_encoder - - return { - "decoder": decoder, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_proj": text_proj, - "feature_extractor": feature_extractor, - "image_encoder": image_encoder, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - def get_dummy_inputs(self, device, seed=0, pil_image=True): - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - if pil_image: - input_image = input_image * 0.5 + 0.5 - input_image = input_image.clamp(0, 1) - input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy() - input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] - - return { - "image": input_image, - "generator": generator, - 
"decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "np", - } - - def test_unclip_image_variation_input_tensor(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.0002, - 0.9997, - 0.9997, - 0.9969, - 0.0023, - 0.9997, - 0.9969, - 0.9970, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_image(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_list_images(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - pipeline_inputs["image"] = [ - pipeline_inputs["image"], - pipeline_inputs["image"], - ] - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - tuple_pipeline_inputs["image"] = [ - tuple_pipeline_inputs["image"], - tuple_pipeline_inputs["image"], - ] - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (2, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9989, - 0.0008, - 0.0021, - 0.9960, - 0.0018, - 0.0014, - 0.0002, - 0.9933, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_passed_image_embed(self): - device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(0) - dtype = pipe.decoder.dtype - batch_size = 1 - - shape = (batch_size, 
pipe.decoder.in_channels, pipe.decoder.sample_size, pipe.decoder.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - pipe.super_res_first.in_channels // 2, - pipe.super_res_first.sample_size, - pipe.super_res_first.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - img_out_1 = pipe( - **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents - ).images - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - # Don't pass image, instead pass embedding - image = pipeline_inputs.pop("image") - image_embeddings = pipe.image_encoder(image).image_embeds - - img_out_2 = pipe( - **pipeline_inputs, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - image_embeddings=image_embeddings, - ).images - - # make sure passing image embeddings manually is identical - assert np.abs(img_out_1 - img_out_2).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU non-determinism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP non-determinism requires a looser check. - @skip_mps - def test_inference_batch_single_identical(self): - test_max_difference = torch_device == "cpu" - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - -@slow -@require_torch_gpu -class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_image_variation_karlo(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" -
"/unclip/karlo_v1_alpha_cat_variation_fp16.npy" - ) - - pipeline = UnCLIPImageVariationPipeline.from_pretrained( - "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16 - ) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - input_image, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image) diff --git a/diffusers/tests/pipelines/versatile_diffusion/__init__.py b/diffusers/tests/pipelines/versatile_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py deleted file mode 100644 index 4e2b89982a6aad0fb2f2b7c8735b0e645665359f..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionDualGuidedPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@nightly -@require_torch_gpu -class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - first_prompt = "cyberpunk 2077" - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - prompt=first_prompt, - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0787, 0.0849, 0.0826, 0.0812, 0.0807, 0.0795, 0.0818, 0.0798, 0.0779]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py deleted file mode 100644 index b4eabb9e3a0e18dd71a445bb8960b27d8699daac..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionImageVariationPipeline -from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): - pass - - -@slow -@require_torch_gpu -class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): - def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - image_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - image=image_prompt, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0441, 0.0469, 0.0507, 0.0575, 0.0632, 0.0650, 0.0865, 0.0909, 0.0945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py deleted file mode 100644 index b77c1baf41d5abe4adb17aebb600b80eedda6c39..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
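Unlike the single-task pipelines above, the mega pipeline tested below bundles all three Versatile Diffusion tasks behind named entry points on one object: dual_guided, text_to_image, and image_variation. A condensed usage sketch distilled from the tests that follow (model id, image URL, and argument names are taken from those tests; step counts are shortened for illustration):

import torch
from diffusers import VersatileDiffusionPipeline
from diffusers.utils.testing_utils import load_image

pipe = VersatileDiffusionPipeline.from_pretrained(
    "shi-labs/versatile-diffusion", torch_dtype=torch.float16
).to("cuda")
init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg"
)

# each task is a separate method on the same pipeline object
dual = pipe.dual_guided(
    prompt="cyberpunk 2077", image=init_image, text_to_image_strength=0.75,
    generator=torch.manual_seed(0), num_inference_steps=2, output_type="numpy",
).images
txt = pipe.text_to_image(
    prompt="A painting of a squirrel eating a burger ",
    generator=torch.manual_seed(0), num_inference_steps=2, output_type="numpy",
).images
var = pipe.image_variation(init_image, generator=torch.manual_seed(0), output_type="numpy").images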
- -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "cyberpunk 2077" - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt=prompt, - image=init_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe.text_to_image( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.3770, 0.3894, 0.4297, 0.4331, 0.4456]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-1 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py deleted file mode 100644 index 194f660f7055308b41c47c14a35c41f3b2b1014b..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionTextToImagePipeline -from diffusers.utils.testing_utils import nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionTextToImagePipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/vq_diffusion/__init__.py b/diffusers/tests/pipelines/vq_diffusion/__init__.py deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py deleted file mode 100644 index 6769240db905abc75e2d04af89a1852911868751..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ /dev/null @@ -1,228 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel -from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VQDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def num_embed(self): - return 12 - - @property - def num_embeds_ada_norm(self): - return 12 - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def dummy_vqvae(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - num_vq_embeddings=self.num_embed, - vq_embed_dim=3, - ) - return model - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_transformer(self): - torch.manual_seed(0) - - height = 12 - width = 12 - - model_kwargs = { - "attention_bias": True, - "cross_attention_dim": 32, - "attention_head_dim": height * width, - "num_attention_heads": 1, - "num_vector_embeds": self.num_embed, - "num_embeds_ada_norm": self.num_embeds_ada_norm, - "norm_num_groups": 32, - "sample_size": width, - "activation_fn": "geglu-approximate", - } - - model = Transformer2DModel(**model_kwargs) - return model - - def test_vq_diffusion(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = 
LearnedClassifierFreeSamplingEmbeddings(learnable=False) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6583, 0.6410, 0.5325, 0.5635, 0.5563, 0.4234, 0.6008, 0.5491, 0.4880]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_vq_diffusion_classifier_free_sampling(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( - learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length - ) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6647, 0.6531, 0.5303, 0.5891, 0.5726, 0.4439, 0.6304, 0.5564, 0.4912]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class VQDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_vq_diffusion_classifier_free_sampling(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" - ) - - pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - # requires GPU generator for gumbel softmax - # don't use GPU generator in tests though - generator = torch.Generator(device=torch_device).manual_seed(0) - output = 
pipeline( - "teddy bear playing in the pool", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 1e-2 diff --git a/diffusers/tests/repo_utils/test_check_copies.py b/diffusers/tests/repo_utils/test_check_copies.py deleted file mode 100644 index bd0a22da2c3af2bed6f3029e84face108e3cbda3..0000000000000000000000000000000000000000 --- a/diffusers/tests/repo_utils/test_check_copies.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import shutil -import sys -import tempfile -import unittest - -import black - - -git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.join(git_repo_path, "utils")) - -import check_copies # noqa: E402 - - -# This is the reference code that will be used in the tests. -# If DDPMSchedulerOutput is changed in scheduling_ddpm.py, this code needs to be manually updated. -REFERENCE_CODE = """ \""" - Output class for the scheduler's step function output. - - Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. 
- \""" - - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None -""" - - -class CopyCheckTester(unittest.TestCase): - def setUp(self): - self.diffusers_dir = tempfile.mkdtemp() - os.makedirs(os.path.join(self.diffusers_dir, "schedulers/")) - check_copies.DIFFUSERS_PATH = self.diffusers_dir - shutil.copy( - os.path.join(git_repo_path, "src/diffusers/schedulers/scheduling_ddpm.py"), - os.path.join(self.diffusers_dir, "schedulers/scheduling_ddpm.py"), - ) - - def tearDown(self): - check_copies.DIFFUSERS_PATH = "src/diffusers" - shutil.rmtree(self.diffusers_dir) - - def check_copy_consistency(self, comment, class_name, class_code, overwrite_result=None): - code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code - if overwrite_result is not None: - expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result - mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119) - code = black.format_str(code, mode=mode) - fname = os.path.join(self.diffusers_dir, "new_code.py") - with open(fname, "w", newline="\n") as f: - f.write(code) - if overwrite_result is None: - self.assertTrue(len(check_copies.is_copy_consistent(fname)) == 0) - else: - check_copies.is_copy_consistent(f.name, overwrite=True) - with open(fname, "r") as f: - self.assertEqual(f.read(), expected) - - def test_find_code_in_diffusers(self): - code = check_copies.find_code_in_diffusers("schedulers.scheduling_ddpm.DDPMSchedulerOutput") - self.assertEqual(code, REFERENCE_CODE) - - def test_is_copy_consistent(self): - # Base copy consistency - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput", - "DDPMSchedulerOutput", - REFERENCE_CODE + "\n", - ) - - # With no empty line at the end - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput", - "DDPMSchedulerOutput", - REFERENCE_CODE, - ) - - # Copy consistency with rename - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test", - "TestSchedulerOutput", - re.sub("DDPM", "Test", REFERENCE_CODE), - ) - - # Copy consistency with a really long name - long_class_name = "TestClassWithAReallyLongNameBecauseSomePeopleLikeThatForSomeReason" - self.check_copy_consistency( - f"# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->{long_class_name}", - f"{long_class_name}SchedulerOutput", - re.sub("DDPM", long_class_name, REFERENCE_CODE), - ) - - # Copy consistency with overwrite - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test", - "TestSchedulerOutput", - REFERENCE_CODE, - overwrite_result=re.sub("DDPM", "Test", REFERENCE_CODE), - ) diff --git a/diffusers/tests/repo_utils/test_check_dummies.py b/diffusers/tests/repo_utils/test_check_dummies.py deleted file mode 100644 index 52a75d7b02e85f70cb347afb1429ca8beb942d21..0000000000000000000000000000000000000000 --- a/diffusers/tests/repo_utils/test_check_dummies.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import unittest - - -git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.join(git_repo_path, "utils")) - -import check_dummies # noqa: E402 -from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init # noqa: E402 - - -# Align TRANSFORMERS_PATH in check_dummies with the current path -check_dummies.PATH_TO_DIFFUSERS = os.path.join(git_repo_path, "src", "diffusers") - - -class CheckDummiesTester(unittest.TestCase): - def test_find_backend(self): - simple_backend = find_backend(" if not is_torch_available():") - self.assertEqual(simple_backend, "torch") - - # backend_with_underscore = find_backend(" if not is_tensorflow_text_available():") - # self.assertEqual(backend_with_underscore, "tensorflow_text") - - double_backend = find_backend(" if not (is_torch_available() and is_transformers_available()):") - self.assertEqual(double_backend, "torch_and_transformers") - - # double_backend_with_underscore = find_backend( - # " if not (is_sentencepiece_available() and is_tensorflow_text_available()):" - # ) - # self.assertEqual(double_backend_with_underscore, "sentencepiece_and_tensorflow_text") - - triple_backend = find_backend( - " if not (is_torch_available() and is_transformers_available() and is_onnx_available()):" - ) - self.assertEqual(triple_backend, "torch_and_transformers_and_onnx") - - def test_read_init(self): - objects = read_init() - # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects - self.assertIn("torch", objects) - self.assertIn("torch_and_transformers", objects) - self.assertIn("flax_and_transformers", objects) - self.assertIn("torch_and_transformers_and_onnx", objects) - - # Likewise, we can't assert on the exact content of a key - self.assertIn("UNet2DModel", objects["torch"]) - self.assertIn("FlaxUNet2DConditionModel", objects["flax"]) - self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"]) - self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"]) - self.assertIn("LMSDiscreteScheduler", objects["torch_and_scipy"]) - self.assertIn("OnnxStableDiffusionPipeline", objects["torch_and_transformers_and_onnx"]) - - def test_create_dummy_object(self): - dummy_constant = create_dummy_object("CONSTANT", "'torch'") - self.assertEqual(dummy_constant, "\nCONSTANT = None\n") - - dummy_function = create_dummy_object("function", "'torch'") - self.assertEqual( - dummy_function, "\ndef function(*args, **kwargs):\n requires_backends(function, 'torch')\n" - ) - - expected_dummy_class = """ -class FakeClass(metaclass=DummyObject): - _backends = 'torch' - - def __init__(self, *args, **kwargs): - requires_backends(self, 'torch') - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, 'torch') - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, 'torch') -""" - dummy_class = create_dummy_object("FakeClass", "'torch'") - self.assertEqual(dummy_class, expected_dummy_class) - - def test_create_dummy_files(self): - 
expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends - - -CONSTANT = None - - -def function(*args, **kwargs): - requires_backends(function, ["torch"]) - - -class FakeClass(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) -""" - dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]}) - self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file) diff --git a/diffusers/tests/schedulers/__init__.py b/diffusers/tests/schedulers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/schedulers/test_scheduler_ddim.py b/diffusers/tests/schedulers/test_scheduler_ddim.py deleted file mode 100644 index e9c85314d558af74b2ed325df5ed7722e1acd691..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ddim.py +++ /dev/null @@ -1,140 +0,0 @@ -import torch - -from diffusers import DDIMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDIMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDIMScheduler,) - forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps, eta = 10, 0.0 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_timesteps(num_inference_steps) - - for t in scheduler.timesteps: - residual = model(sample, t) - sample = scheduler.step(residual, t, sample, eta).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [100, 500, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(5) - assert torch.equal(scheduler.timesteps, torch.LongTensor([801, 601, 401, 201, 1])) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "v_prediction"]: - 
self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_time_indices(self): - for t in [1, 10, 49]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) - - def test_eta(self): - for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): - self.check_over_forward(time_step=t, eta=eta) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - sample = self.full_loop() - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 172.0067) < 1e-2 - assert abs(result_mean.item() - 0.223967) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 52.5302) < 1e-2 - assert abs(result_mean.item() - 0.0684) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 149.8295) < 1e-2 - assert abs(result_mean.item() - 0.1951) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 149.0784) < 1e-2 - assert abs(result_mean.item() - 0.1941) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_ddpm.py b/diffusers/tests/schedulers/test_scheduler_ddpm.py deleted file mode 100644 index b55a39ee2e79274691f5136b989cbaabb3f00932..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ddpm.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch - -from diffusers import DDPMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDPMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDPMScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "variance_type": "fixed_small", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, 
beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_variance_type(self): - for variance in ["fixed_small", "fixed_large", "other"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 999]: - self.check_over_forward(time_step=t) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 258.9606) < 1e-2 - assert abs(result_mean.item() - 0.3372) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2. 
predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 202.0296) < 1e-2 - assert abs(result_mean.item() - 0.2631) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_deis.py b/diffusers/tests/schedulers/test_scheduler_deis.py deleted file mode 100644 index 8b14601bc98240cca5ea75ae06343be20bc3ca79..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_deis.py +++ /dev/null @@ -1,237 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DEISMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DEISMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) 
- new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_switch(self): - # make sure that iterating over schedulers with the same config names gives the same results - # for the defaults - scheduler = DEISMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["logrho"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon",
"sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="deis", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["deis"]: - for solver_type in ["logrho"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.091) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_multi.py b/diffusers/tests/schedulers/test_scheduler_dpm_multi.py deleted file mode 100644 index 295bbe882746793b09b196f054e392e22415d455..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_dpm_multi.py +++ /dev/null @@ -1,245 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - "lower_order_final": False, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, 
residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - 
# copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - def test_full_loop_no_noise_thres(self): - sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.6405) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2251) < 1e-3 - - def test_switch(self): - # make sure that iterating over schedulers with same config names gives same results - # for defaults - scheduler = DPMSolverMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 
0.3301) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_single.py b/diffusers/tests/schedulers/test_scheduler_dpm_single.py deleted file mode 100644 index 9dff04e7c99841f83d9cbbd34dde7ee4525541fe..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_dpm_single.py +++ /dev/null @@ -1,212 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverSinglestepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - 
# copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_switch(self): - # make sure that iterating over schedulers with the same config names gives the same results - # for the defaults - scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" -
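# A minimal, self-contained sketch of the config round-trip that the check_over_* helpers
# above verify: persist a scheduler with save_config, reload it with from_pretrained, and
# confirm both instances take identical steps. The concrete config values and tensor shapes
# below are illustrative assumptions, not taken from the diff.
import tempfile

import torch

from diffusers import DPMSolverSinglestepScheduler


scheduler = DPMSolverSinglestepScheduler(num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02)
with tempfile.TemporaryDirectory() as tmpdirname:
    scheduler.save_config(tmpdirname)  # persists the scheduler config to disk
    reloaded = DPMSolverSinglestepScheduler.from_pretrained(tmpdirname)

scheduler.set_timesteps(10)
reloaded.set_timesteps(10)

sample = torch.ones(4, 3, 8, 8)
residual = 0.1 * sample
t = scheduler.timesteps[0]

# Both instances should take an identical denoising step from identical inputs.
out = scheduler.step(residual, t, sample).prev_sample
new_out = reloaded.step(residual, t, sample).prev_sample
assert torch.sum(torch.abs(out - new_out)) < 1e-5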
- def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.1453) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_euler.py b/diffusers/tests/schedulers/test_scheduler_euler.py deleted file mode 100644 index 4d521b0075e18710b88ed3efe1f2652bb4718733..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_euler.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch - -from diffusers import EulerDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class EulerDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = 
self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 0.0002) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py deleted file mode 100644 index 5fa36be6bc64e5fc6aac72e11e50e455089469cb..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch - -from diffusers import EulerAncestralDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerAncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in 
enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 152.3192) < 1e-2 - assert abs(result_mean.item() - 0.1983) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 108.4439) < 1e-2 - assert abs(result_mean.item() - 0.1412) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 152.3192) < 1e-2 - assert abs(result_mean.item() - 0.1983) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_flax.py b/diffusers/tests/schedulers/test_scheduler_flax.py deleted file mode 100644 index 8f7ad59d285eb50a42ab5809ce60dd0bf26e026c..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_flax.py +++ /dev/null @@ -1,919 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
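# Unlike the stateful PyTorch schedulers above, the Flax schedulers tested below are
# functional: mutable quantities live in an explicit state object that create_state builds
# and that set_timesteps/step thread through. A minimal sketch of that calling convention;
# the dummy model and shapes are assumptions mirroring the common-test helpers, not part of
# the diff.
import jax.numpy as jnp

from diffusers import FlaxDDIMScheduler


scheduler = FlaxDDIMScheduler(num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02)
state = scheduler.create_state()
state = scheduler.set_timesteps(state, 10)  # returns a new state; nothing is mutated

sample = jnp.ones((4, 3, 8, 8))
for t in state.timesteps:
    residual = sample * t / (t + 1)  # stand-in for a real denoising model
    output = scheduler.step(state, residual, t, sample)
    sample, state = output.prev_sample, output.state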
-import inspect -import tempfile -import unittest -from typing import Dict, List, Tuple - -from diffusers import FlaxDDIMScheduler, FlaxDDPMScheduler, FlaxPNDMScheduler -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from jax import random - - jax_device = jax.default_backend() - - -@require_flax -class FlaxSchedulerCommonTest(unittest.TestCase): - scheduler_classes = () - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - key1, key2 = random.split(random.PRNGKey(0)) - sample = random.uniform(key1, (batch_size, num_channels, height, width)) - - return sample, key2 - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = jnp.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - return jnp.transpose(sample, (3, 0, 1, 2)) - - def get_scheduler_config(self): - raise NotImplementedError - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, 
key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, 1, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, key = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(state, residual, 0, sample, key, **kwargs).prev_sample - output_1 = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, key = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, key, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, key, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def test_deprecated_kwargs(self): - for scheduler_class in self.scheduler_classes: - has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters - has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated" - " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if" - " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated" - " kwargs under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs`" - f" argument to {scheduler_class}.__init__ if there are deprecated arguments or remove the" - " deprecated argument from `_deprecated_kwargs = []`" - ) - - -@require_flax -class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxDDPMScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "variance_type": "fixed_small", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_variance_type(self): - for variance in ["fixed_small", "fixed_large", "other"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_time_indices(self): - for t in [0, 500, 999]: - self.check_over_forward(time_step=t) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0) - 0.0)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487) - 0.00979)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - key1, key2 = random.split(random.PRNGKey(0)) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2.
predict previous mean of sample x_t-1 - output = scheduler.step(state, residual, t, sample, key1) - pred_prev_sample = output.prev_sample - state = output.state - key1, key2 = random.split(key2) - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 255.0714) < 1e-2 - assert abs(result_mean - 0.332124) < 1e-3 - else: - assert abs(result_sum - 255.1113) < 1e-2 - assert abs(result_mean - 0.332176) < 1e-3 - - -@require_flax -class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxDDIMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - key1, key2 = random.split(random.PRNGKey(0)) - - num_inference_steps = 10 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - state = scheduler.set_timesteps(state, num_inference_steps) - - for t in state.timesteps: - residual = model(sample, t) - output = scheduler.step(state, residual, t, sample) - sample = output.prev_sample - state = output.state - key1, key2 = random.split(key2) - - return sample - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, 
"set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, 1, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(state, residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 500, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, 5) - assert jnp.equal(state.timesteps, jnp.array([801, 601, 401, 201, 1])).all() - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_time_indices(self): - for t in [1, 10, 49]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5 - assert 
jnp.sum(jnp.abs(scheduler._get_variance(state, 420, 400) - 0.14771)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 980, 960) - 0.32460)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487, 486) - 0.00979)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999, 998) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - sample = self.full_loop() - - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - assert abs(result_sum - 172.0067) < 1e-2 - assert abs(result_mean - 0.223967) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 149.8409) < 1e-2 - assert abs(result_mean - 0.1951) < 1e-3 - else: - assert abs(result_sum - 149.8295) < 1e-2 - assert abs(result_mean - 0.1951) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - pass - # FIXME: both result_sum and result_mean are nan on TPU - # assert jnp.isnan(result_sum) - # assert jnp.isnan(result_mean) - else: - assert abs(result_sum - 149.0784) < 1e-2 - assert abs(result_mean - 0.1941) < 1e-3 - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - -@require_flax -class FlaxPNDMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxPNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample, _ = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - # copy over dummy past residuals - state = state.replace(ets=dummy_past_residuals[:]) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape) - # copy over dummy past residuals - new_state = new_state.replace(ets=dummy_past_residuals[:]) - - (prev_sample, state) = scheduler.step_prk(state, residual, time_step, sample, **kwargs) - (new_prev_sample, new_state) = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(prev_sample - new_prev_sample)) < 1e-5, "Scheduler outputs are not identical" - - output, _ = 
scheduler.step_plms(state, residual, time_step, sample, **kwargs) - new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." - ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample, _ = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - # copy over dummy past residuals (must be after setting timesteps) - state = state.replace(ets=dummy_past_residuals[:]) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape) - - # copy over dummy
past residual (must be after setting timesteps) - new_state = new_state.replace(ets=dummy_past_residuals[:]) - - output, state = scheduler.step_prk(state, residual, time_step, sample, **kwargs) - new_output, new_state = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs) - new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - for i, t in enumerate(state.prk_timesteps): - residual = model(sample, t) - sample, state = scheduler.step_prk(state, residual, t, sample) - - for i, t in enumerate(state.plms_timesteps): - residual = model(sample, t) - sample, state = scheduler.step_plms(state, residual, t, sample) - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - state = state.replace(ets=dummy_past_residuals[:]) - - output_0, state = scheduler.step_prk(state, residual, 0, sample, **kwargs) - output_1, state = scheduler.step_prk(state, residual, 1, sample, **kwargs) - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0, state = scheduler.step_plms(state, residual, 0, sample, **kwargs) - output_1, state = scheduler.step_plms(state, residual, 1, sample, **kwargs) - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, 10, shape=()) - assert jnp.equal( - state.timesteps, - jnp.array([901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1]), - ).all() - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
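- # check_over_configs round-trips each config through save_config / from_pretrained and asserts matching step outputs -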
self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_time_indices(self): - for t in [1, 5, 10]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps) - - def test_pow_of_3_inference_steps(self): - # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3 - num_inference_steps = 27 - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - # before power of 3 fix, would error on first step, so we only need to do two - for i, t in enumerate(state.prk_timesteps[:2]): - sample, state = scheduler.step_prk(state, residual, t, sample) - - def test_inference_plms_no_past_residuals(self): - with self.assertRaises(ValueError): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - scheduler.step_plms(state, self.dummy_sample, 1, self.dummy_sample).prev_sample - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 198.1275) < 1e-2 - assert abs(result_mean - 0.2580) < 1e-3 - else: - assert abs(result_sum - 198.1318) < 1e-2 - assert abs(result_mean - 0.2580) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 186.83226) < 1e-2 - assert abs(result_mean - 0.24327) < 1e-3 - else: - assert abs(result_sum - 186.9466) < 1e-2 - assert abs(result_mean - 0.24342) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 186.83226) < 1e-2 - assert abs(result_mean - 0.24327) < 1e-3 - else: - assert abs(result_sum - 186.9482) < 1e-2 - assert abs(result_mean - 0.2434) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_heun.py b/diffusers/tests/schedulers/test_scheduler_heun.py deleted file mode 100644 index 7d38c8e2374c26e49c52f3430a3e595b35771436..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_heun.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch - -from diffusers import HeunDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class HeunDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (HeunDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 
0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 4.6934e-07) < 1e-2 - assert abs(result_mean.item() - 6.1112e-10) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if str(torch_device).startswith("cpu"): - # The following sum varies between 148 and 156 on mps. Why? 
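- # (the variance noted above presumably explains why the mps branch below checks only the mean, with a looser tolerance)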
- assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - elif str(torch_device).startswith("mps"): - # Larger tolerance on mps - assert abs(result_mean.item() - 0.0002) < 1e-2 - else: - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_ipndm.py b/diffusers/tests/schedulers/test_scheduler_ipndm.py deleted file mode 100644 index 549caed47fe8f100c2bc4164329210209595ba7f..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ipndm.py +++ /dev/null @@ -1,161 +0,0 @@ -import tempfile - -import torch - -from diffusers import IPNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class IPNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (IPNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = {"num_train_timesteps": 1000} - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - 
new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps, time_step=None) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 2540529) < 10 diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py deleted file mode 100644 index 45371121e66b8ffdcecb5cc86a91758e436b2955..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ 
/dev/null @@ -1,123 +0,0 @@ -import torch - -from diffusers import KDPM2AncestralDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2AncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_full_loop_no_noise(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 13849.3877) < 1e-2 - assert abs(result_mean.item() - 18.0331) < 5e-3 - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - generator = torch.manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 328.9970) < 1e-2 - assert abs(result_mean.item() - 0.4284) < 1e-3 - - def test_full_loop_device(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, 
generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 13849.3818) < 1e-1 - assert abs(result_mean.item() - 18.0331) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py deleted file mode 100644 index 4f1bd1f8aeb78a9266a319fe1f097e7c4a5d0e2a..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ /dev/null @@ -1,132 +0,0 @@ -import torch - -from diffusers import KDPM2DiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2DiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2DiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 4.6934e-07) < 1e-2 - assert abs(result_mean.item() - 6.1112e-10) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_no_noise(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - else: - # CUDA - assert 
abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - - def test_full_loop_device(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if str(torch_device).startswith("cpu"): - # The following sum varies between 148 and 156 on mps. Why? - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_lms.py b/diffusers/tests/schedulers/test_scheduler_lms.py deleted file mode 100644 index ca3574e9ee638546d313e5256feba804522da65b..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_lms.py +++ /dev/null @@ -1,115 +0,0 @@ -import torch - -from diffusers import LMSDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class LMSDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (LMSDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 800]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = 
scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 0.0017) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_pndm.py b/diffusers/tests/schedulers/test_scheduler_pndm.py deleted file mode 100644 index c1519f7c7e8e113aca61c8749c3a08f6f390309f..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_pndm.py +++ /dev/null @@ -1,242 +0,0 @@ -import tempfile - -import torch - -from diffusers import PNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class PNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (PNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output 
- new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.prk_timesteps): - residual = model(sample, t) - sample = scheduler.step_prk(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.plms_timesteps): - residual = model(sample, t) - sample = scheduler.step_plms(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) 
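- # consecutive step_plms outputs must also agree in shape, mirroring the step_prk checks above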
- self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(10) - assert torch.equal( - scheduler.timesteps, - torch.LongTensor( - [901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1] - ), - ) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [1, 5, 10]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps) - - def test_pow_of_3_inference_steps(self): - # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3 - num_inference_steps = 27 - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(num_inference_steps) - - # before power of 3 fix, would error on first step, so we only need to do two - for i, t in enumerate(scheduler.prk_timesteps[:2]): - sample = scheduler.step_prk(residual, t, sample).prev_sample - - def test_inference_plms_no_past_residuals(self): - with self.assertRaises(ValueError): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 198.1318) < 1e-2 - assert abs(result_mean.item() - 0.2580) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 67.3986) < 1e-2 - assert abs(result_mean.item() - 0.0878) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 230.0399) < 1e-2 - assert abs(result_mean.item() - 0.2995) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 186.9482) < 1e-2 - 
assert abs(result_mean.item() - 0.2434) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py deleted file mode 100644 index 08c30f9b1e0c2ce1f7baab82f5076efabe465a69..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py +++ /dev/null @@ -1,189 +0,0 @@ -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import ScoreSdeVeScheduler - - -class ScoreSdeVeSchedulerTest(unittest.TestCase): - # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration) - scheduler_classes = (ScoreSdeVeScheduler,) - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = torch.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - sample = sample.permute(3, 0, 1, 2) - - return sample - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 2000, - "snr": 0.15, - "sigma_min": 0.01, - "sigma_max": 1348, - "sampling_eps": 1e-5, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler 
outputs are not identical" - - output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def test_timesteps(self): - for timesteps in [10, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_sigmas(self): - for sigma_min, sigma_max in zip([0.0001, 0.001, 0.01], [1, 100, 1000]): - self.check_over_configs(sigma_min=sigma_min, sigma_max=sigma_max) - - def test_time_indices(self): - for t in [0.1, 0.5, 0.75]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - kwargs = dict(self.forward_default_kwargs) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 3 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_sigmas(num_inference_steps) - scheduler.set_timesteps(num_inference_steps) - generator = torch.manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sigma_t = scheduler.sigmas[i] - - for _ in range(scheduler.config.correct_steps): - with torch.no_grad(): - model_output = model(sample, sigma_t) - sample = scheduler.step_correct(model_output, sample, generator=generator, **kwargs).prev_sample - - with torch.no_grad(): - model_output = model(sample, sigma_t) - - output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs) - sample, _ = output.prev_sample, output.prev_sample_mean - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert np.isclose(result_sum.item(), 14372758528.0) - assert np.isclose(result_mean.item(), 18714530.0) - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step_pred(residual, 0, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - output_1 = scheduler.step_pred(residual, 1, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) diff --git a/diffusers/tests/schedulers/test_scheduler_unclip.py b/diffusers/tests/schedulers/test_scheduler_unclip.py deleted file mode 100644 index b0ce1312e79f6762bc7573c3a90e58cb33a21bad..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_unclip.py +++ /dev/null @@ -1,137 +0,0 @@ -import torch - -from diffusers import UnCLIPScheduler - -from .test_schedulers import SchedulerCommonTest - - -# UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration. 
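- # The tests below exercise both variance types ("fixed_small_log" and "learned_range"), both prediction types ("epsilon" and "sample"), and full denoising loops with and without skipped timesteps.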
-class UnCLIPSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UnCLIPScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "variance_type": "fixed_small_log", - "clip_sample": True, - "clip_sample_range": 1.0, - "prediction_type": "epsilon", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_variance_type(self): - for variance in ["fixed_small_log", "learned_range"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_clip_sample_range(self): - for clip_sample_range in [1, 5, 10, 20]: - self.check_over_configs(clip_sample_range=clip_sample_range) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for time_step in [0, 500, 999]: - for prev_timestep in [None, 5, 100, 250, 500, 750]: - if prev_timestep is not None and prev_timestep >= time_step: - continue - - self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep) - - def test_variance_fixed_small_log(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log") - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 - - def test_variance_learned_range(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(variance_type="learned_range") - scheduler = scheduler_class(**scheduler_config) - - predicted_variance = 0.5 - - assert abs(scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790) < 1e-5 - assert abs(scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052) < 1e-5 - assert abs(scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011) < 1e-5 - - def test_full_loop(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = scheduler.timesteps - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for i, t in enumerate(timesteps): - # 1. predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 252.2682495) < 1e-2 - assert abs(result_mean.item() - 0.3284743) < 1e-3 - - def test_full_loop_skip_timesteps(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(25) - - timesteps = scheduler.timesteps - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for i, t in enumerate(timesteps): - # 1.
predict noise residual - residual = model(sample, t) - - if i + 1 == timesteps.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps[i + 1] - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 258.2044983) < 1e-2 - assert abs(result_mean.item() - 0.3362038) < 1e-3 - - def test_trained_betas(self): - pass - - def test_add_noise_device(self): - pass diff --git a/diffusers/tests/schedulers/test_scheduler_unipc.py b/diffusers/tests/schedulers/test_scheduler_unipc.py deleted file mode 100644 index 6154c8e2d625506f138c28da7a605e5739e6ffd3..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_unipc.py +++ /dev/null @@ -1,231 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class UniPCMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UniPCMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "solver_type": "bh1", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: 
scheduler.config.solver_order]
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-                # copy over dummy past residuals
-                new_scheduler.set_timesteps(num_inference_steps)
-
-                # copy over dummy past residuals (must be after setting timesteps)
-                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
-
-            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-
-            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
-
-    def full_loop(self, scheduler=None, **config):
-        # only build a default scheduler when the caller did not pass one in;
-        # unconditionally rebuilding it here would silently ignore the argument
-        if scheduler is None:
-            scheduler_class = self.scheduler_classes[0]
-            scheduler_config = self.get_scheduler_config(**config)
-            scheduler = scheduler_class(**scheduler_config)
-
-        num_inference_steps = 10
-        model = self.dummy_model()
-        sample = self.dummy_sample_deter
-        scheduler.set_timesteps(num_inference_steps)
-
-        for i, t in enumerate(scheduler.timesteps):
-            residual = model(sample, t)
-            sample = scheduler.step(residual, t, sample).prev_sample
-
-        return sample
-
-    def test_step_shape(self):
-        kwargs = dict(self.forward_default_kwargs)
-
-        num_inference_steps = kwargs.pop("num_inference_steps", None)
-
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config)
-
-            sample = self.dummy_sample
-            residual = 0.1 * sample
-
-            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
-                scheduler.set_timesteps(num_inference_steps)
-            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
-                kwargs["num_inference_steps"] = num_inference_steps
-
-            # copy over dummy past residuals (must be done after set_timesteps)
-            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
-
-            time_step_0 = scheduler.timesteps[5]
-            time_step_1 = scheduler.timesteps[6]
-
-            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
-            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
-
-            self.assertEqual(output_0.shape, sample.shape)
-            self.assertEqual(output_0.shape, output_1.shape)
-
-    def test_switch(self):
-        # make sure that switching between compatible schedulers created from the
-        # same config gives the same results for the defaults
-        scheduler = UniPCMultistepScheduler(**self.get_scheduler_config())
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = torch.mean(torch.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
-        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
-
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = torch.mean(torch.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-    def test_timesteps(self):
-        for timesteps in [25, 50, 100, 999, 1000]:
-            self.check_over_configs(num_train_timesteps=timesteps)
-
-    def test_thresholding(self):
-        self.check_over_configs(thresholding=False)
-        for order
in [1, 2, 3]: - for solver_type in ["bh1", "bh2"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for solver_type in ["bh1", "bh2"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2521) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.1096) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py deleted file mode 100644 index 74437ad4548074a488917d3ea9b5eef4f0ac1532..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -import torch.nn.functional as F - -from diffusers import VQDiffusionScheduler - -from .test_schedulers import SchedulerCommonTest - - -class VQDiffusionSchedulerTest(SchedulerCommonTest): - scheduler_classes = (VQDiffusionScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_vec_classes": 4097, - "num_train_timesteps": 100, - } - - config.update(**kwargs) - return config - - def dummy_sample(self, num_vec_classes): - batch_size = 4 - height = 8 - width = 8 - - sample = torch.randint(0, num_vec_classes, (batch_size, height * width)) - - return sample - - @property - def dummy_sample_deter(self): - assert False - - def dummy_model(self, num_vec_classes): - def model(sample, t, *args): - batch_size, num_latent_pixels = sample.shape - logits = torch.rand((batch_size, num_vec_classes - 1, num_latent_pixels)) - return_value = F.log_softmax(logits.double(), dim=1).float() - return return_value - - return model - - def test_timesteps(self): - for timesteps in [2, 5, 100, 1000]: - 
self.check_over_configs(num_train_timesteps=timesteps) - - def test_num_vec_classes(self): - for num_vec_classes in [5, 100, 1000, 4000]: - self.check_over_configs(num_vec_classes=num_vec_classes) - - def test_time_indices(self): - for t in [0, 50, 99]: - self.check_over_forward(time_step=t) - - def test_add_noise_device(self): - pass diff --git a/diffusers/tests/schedulers/test_schedulers.py b/diffusers/tests/schedulers/test_schedulers.py deleted file mode 100755 index bfbf5cbc798f52b2eceba8ba17747dc4d8ae8bc3..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_schedulers.py +++ /dev/null @@ -1,598 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import inspect -import json -import os -import tempfile -import unittest -from typing import Dict, List, Tuple - -import numpy as np -import torch - -import diffusers -from diffusers import ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - IPNDMScheduler, - LMSDiscreteScheduler, - VQDiffusionScheduler, - logging, -) -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.schedulers.scheduling_utils import SchedulerMixin -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import CaptureLogger - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SchedulerObject(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class SchedulerObject2(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], - ): - pass - - -class SchedulerObject3(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], - ): - pass - - -class SchedulerBaseTests(unittest.TestCase): - def test_save_load_from_different_config(self): - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - logger = logging.get_logger("diffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_1 = SchedulerObject2.from_config(config) - - # now save a config parameter that is not expected - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject.load_config(tmpdirname) - new_obj_2 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = 
SchedulerObject2.load_config(tmpdirname) - new_obj_3 = SchedulerObject2.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject2 - assert new_obj_2.__class__ == SchedulerObject - assert new_obj_3.__class__ == SchedulerObject2 - - assert cap_logger_1.out == "" - assert ( - cap_logger_2.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. Please verify your config.json configuration file.\n" - ) - assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out - - def test_save_load_compatible_schedulers(self): - SchedulerObject2._compatibles = ["SchedulerObject"] - SchedulerObject._compatibles = ["SchedulerObject2"] - - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - setattr(diffusers, "SchedulerObject2", SchedulerObject2) - logger = logging.get_logger("diffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - # now save a config parameter that is expected by another class, but not origin class - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["f"] = [0, 0] - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger: - config = SchedulerObject.load_config(tmpdirname) - new_obj = SchedulerObject.from_config(config) - - assert new_obj.__class__ == SchedulerObject - - assert ( - cap_logger.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. Please verify your config.json configuration file.\n" - ) - - def test_save_load_from_different_config_comp_schedulers(self): - SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"] - SchedulerObject2._compatibles = ["SchedulerObject", "SchedulerObject3"] - SchedulerObject._compatibles = ["SchedulerObject2", "SchedulerObject3"] - - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - setattr(diffusers, "SchedulerObject2", SchedulerObject2) - setattr(diffusers, "SchedulerObject3", SchedulerObject3) - logger = logging.get_logger("diffusers.configuration_utils") - logger.setLevel(diffusers.logging.INFO) - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject.load_config(tmpdirname) - new_obj_1 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_2 = SchedulerObject2.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = SchedulerObject3.load_config(tmpdirname) - new_obj_3 = SchedulerObject3.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject - assert new_obj_2.__class__ == SchedulerObject2 - assert new_obj_3.__class__ == SchedulerObject3 - - assert cap_logger_1.out == "" - assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" - assert cap_logger_3.out == "{'f'} was not found in config. 
Values will be initialized to default values.\n" - - -class SchedulerCommonTest(unittest.TestCase): - scheduler_classes = () - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = torch.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - sample = sample.permute(3, 0, 1, 2) - - return sample - - def get_scheduler_config(self): - raise NotImplementedError - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Make sure `scale_model_input` is invoked to prevent a warning - if scheduler_class != VQDiffusionScheduler: - _ = scheduler.scale_model_input(sample, 0) - _ = new_scheduler.scale_model_input(sample, 0) - - # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - 
if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - timestep = 1 - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_compatibles(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - assert all(c is not None for c in scheduler.compatibles) - - for comp_scheduler_cls in scheduler.compatibles: - comp_scheduler = comp_scheduler_cls.from_config(scheduler.config) - assert comp_scheduler is not None - - new_scheduler = scheduler_class.from_config(comp_scheduler.config) - - new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in 
scheduler.config} - scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config} - - # make sure that configs are essentially identical - assert new_scheduler_config == dict(scheduler.config) - - # make sure that only differences are for configs that are not in init - init_keys = inspect.signature(scheduler_class.__init__).parameters.keys() - assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set() - - def test_from_pretrained(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_pretrained(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - assert scheduler.config == new_scheduler.config - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - timestep_0 = 0 - timestep_1 = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep_0 = float(timestep_0) - timestep_1 = float(timestep_1) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep_0) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - torch.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" - f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", 50) - - timestep = 0 - if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler: - timestep = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - outputs_dict = scheduler.step(residual, timestep, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple, outputs_dict) - - def test_scheduler_public_api(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class != VQDiffusionScheduler: - self.assertTrue( - hasattr(scheduler, "init_noise_sigma"), - f"{scheduler_class} does not implement a required attribute `init_noise_sigma`", - ) - self.assertTrue( - hasattr(scheduler, "scale_model_input"), - ( - f"{scheduler_class} does not implement a required class method `scale_model_input(sample," - " timestep)`" - ), - ) - self.assertTrue( - hasattr(scheduler, "step"), - f"{scheduler_class} does not implement a required class method `step(...)`", - ) - - if scheduler_class != VQDiffusionScheduler: - sample = self.dummy_sample - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - def test_add_noise_device(self): - for scheduler_class in self.scheduler_classes: - if scheduler_class == IPNDMScheduler: - continue - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(100) - - sample = self.dummy_sample.to(torch_device) - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - noise = torch.randn_like(scaled_sample).to(torch_device) - t = scheduler.timesteps[5][None] - noised = scheduler.add_noise(scaled_sample, noise, t) - 
self.assertEqual(noised.shape, scaled_sample.shape)
-
-    def test_deprecated_kwargs(self):
-        for scheduler_class in self.scheduler_classes:
-            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
-            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
-
-            if has_kwarg_in_model_class and not has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
-                    " there are no deprecated arguments or add the deprecated arguments to the `_deprecated_kwargs`"
-                    " class attribute"
-                )
-
-            if not has_kwarg_in_model_class and has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
-                    f" argument to {scheduler_class}.__init__ if there are deprecated arguments or remove the"
-                    " deprecated arguments from `_deprecated_kwargs`"
-                )
-
-    def test_trained_betas(self):
-        for scheduler_class in self.scheduler_classes:
-            if scheduler_class == VQDiffusionScheduler:
-                continue
-
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3]))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_pretrained(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-
-            assert scheduler.betas.tolist() == new_scheduler.betas.tolist()
diff --git a/diffusers/tests/test_config.py b/diffusers/tests/test_config.py
deleted file mode 100644
index 95b0cdf9a597ef8ff26fab3ada4a2deeac156b8e..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_config.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
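The contract that `test_deprecated_kwargs` above enforces is worth spelling out: a scheduler that still accepts a deprecated argument must both take `**kwargs` in `__init__` and list that argument in the `_deprecated_kwargs` class attribute; having one without the other fails the test. A minimal sketch of a conforming class (the `old_beta` argument and the plain `FutureWarning` are illustrative, not the library's actual deprecation helper):

    import warnings

    class ToySchedulerWithDeprecation:
        # Arguments listed here are still accepted via **kwargs but slated for removal.
        _deprecated_kwargs = ["old_beta"]

        def __init__(self, beta=0.1, **kwargs):
            if "old_beta" in kwargs:
                warnings.warn("`old_beta` is deprecated, pass `beta` instead.", FutureWarning)
                beta = kwargs.pop("old_beta")
            self.beta = beta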
- -import tempfile -import unittest - -from diffusers import ( - DDIMScheduler, - DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - PNDMScheduler, - logging, -) -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.utils.testing_utils import CaptureLogger - - -class SampleObject(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class SampleObject2(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], - ): - pass - - -class SampleObject3(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], - ): - pass - - -class ConfigTester(unittest.TestCase): - def test_load_not_from_mixin(self): - with self.assertRaises(ValueError): - ConfigMixin.load_config("dummy_path") - - def test_register_to_config(self): - obj = SampleObject() - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # init ignore private arguments - obj = SampleObject(_name_or_path="lalala") - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can override default - obj = SampleObject(c=6) - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can use positional arguments. 
- obj = SampleObject(1, c=6) - config = obj.config - assert config["a"] == 1 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - def test_save_load(self): - obj = SampleObject() - config = obj.config - - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname)) - new_config = new_obj.config - - # unfreeze configs - config = dict(config) - new_config = dict(new_config) - - assert config.pop("c") == (2, 5) # instantiated as tuple - assert new_config.pop("c") == [2, 5] # saved & loaded as list because of json - assert config == new_config - - def test_load_ddim_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - ddim = DDIMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert ddim.__class__ == DDIMScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_euler_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - euler = EulerDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert euler.__class__ == EulerDiscreteScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_euler_ancestral_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - euler = EulerAncestralDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert euler.__class__ == EulerAncestralDiscreteScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - pndm = PNDMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert pndm.__class__ == PNDMScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_overwrite_config_on_load(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - ddpm = DDPMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler", - prediction_type="sample", - beta_end=8, - ) - - with CaptureLogger(logger) as cap_logger_2: - ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88) - - assert ddpm.__class__ == DDPMScheduler - assert ddpm.config.prediction_type == "sample" - assert ddpm.config.beta_end == 8 - assert ddpm_2.config.beta_start == 88 - - # no warning should be thrown - assert cap_logger.out == "" - assert cap_logger_2.out == "" - - def test_load_dpmsolver(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - dpm = DPMSolverMultistepScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert dpm.__class__ == DPMSolverMultistepScheduler - # no warning should be thrown - assert cap_logger.out == "" diff --git 
a/diffusers/tests/test_ema.py b/diffusers/tests/test_ema.py deleted file mode 100644 index 812d83e2f2418817f4d7e0e1c81d1b1dedfa611d..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_ema.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile -import unittest - -import torch - -from diffusers import UNet2DConditionModel -from diffusers.training_utils import EMAModel -from diffusers.utils.testing_utils import skip_mps, torch_device - - -class EMAModelTests(unittest.TestCase): - model_id = "hf-internal-testing/tiny-stable-diffusion-pipe" - batch_size = 1 - prompt_length = 77 - text_encoder_hidden_dim = 32 - num_in_channels = 4 - latent_height = latent_width = 64 - generator = torch.manual_seed(0) - - def get_models(self, decay=0.9999): - unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet") - unet = unet.to(torch_device) - ema_unet = EMAModel(unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, model_config=unet.config) - return unet, ema_unet - - def get_dummy_inputs(self): - noisy_latents = torch.randn( - self.batch_size, self.num_in_channels, self.latent_height, self.latent_width, generator=self.generator - ).to(torch_device) - timesteps = torch.randint(0, 1000, size=(self.batch_size,), generator=self.generator).to(torch_device) - encoder_hidden_states = torch.randn( - self.batch_size, self.prompt_length, self.text_encoder_hidden_dim, generator=self.generator - ).to(torch_device) - return noisy_latents, timesteps, encoder_hidden_states - - def simulate_backprop(self, unet): - updated_state_dict = {} - for k, param in unet.state_dict().items(): - updated_param = torch.randn_like(param) + (param * torch.randn_like(param)) - updated_state_dict.update({k: updated_param}) - unet.load_state_dict(updated_state_dict) - return unet - - def test_optimization_steps_updated(self): - unet, ema_unet = self.get_models() - # Take the first (hypothetical) EMA step. - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 1 - - # Take two more. - for _ in range(2): - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 3 - - def test_shadow_params_not_updated(self): - unet, ema_unet = self.get_models() - # Since the `unet` is not being updated (i.e., backprop'd) - # there won't be any difference between the `params` of `unet` - # and `ema_unet` even if we call `ema_unet.step(unet.parameters())`. - ema_unet.step(unet.parameters()) - orig_params = list(unet.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert torch.allclose(s_param, param) - - # The above holds true even if we call `ema.step()` multiple times since - # `unet` params are still not being updated. 
-        for _ in range(4):
-            ema_unet.step(unet.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert torch.allclose(s_param, param)
-
-    def test_shadow_params_updated(self):
-        unet, ema_unet = self.get_models()
-        # Here we simulate the parameter updates for `unet`. Since there might
-        # be some parameters which are initialized to zero we take extra care to
-        # initialize their values to something non-zero before the multiplication.
-        unet_pseudo_updated_step_one = self.simulate_backprop(unet)
-
-        # Take the EMA step.
-        ema_unet.step(unet_pseudo_updated_step_one.parameters())
-
-        # Now the EMA'd parameters won't be equal to the original model parameters.
-        # (`not torch.allclose(...)` rather than `~torch.allclose(...)`: bitwise-negating
-        # a Python bool yields a nonzero int, so the old form could never fail.)
-        orig_params = list(unet_pseudo_updated_step_one.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert not torch.allclose(s_param, param)
-
-        # Ensure this is the case when we take multiple EMA steps.
-        for _ in range(4):
-            ema_unet.step(unet.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert not torch.allclose(s_param, param)
-
-    def test_consecutive_shadow_params_updated(self):
-        # If we take an EMA step after each of two consecutive backpropagations,
-        # the shadow params from those two steps should be different. The shadow
-        # params are updated in place, so we snapshot clones after the first step.
-        unet, ema_unet = self.get_models()
-
-        # First backprop + EMA
-        unet_step_one = self.simulate_backprop(unet)
-        ema_unet.step(unet_step_one.parameters())
-        step_one_shadow_params = [p.clone() for p in ema_unet.shadow_params]
-
-        # Second backprop + EMA
-        unet_step_two = self.simulate_backprop(unet_step_one)
-        ema_unet.step(unet_step_two.parameters())
-        step_two_shadow_params = ema_unet.shadow_params
-
-        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
-            assert not torch.allclose(step_one, step_two)
-
-    def test_zero_decay(self):
-        # If there's no decay even if there are backprops, EMA steps
-        # won't take any effect i.e., the shadow params would remain the
-        # same.
-        unet, ema_unet = self.get_models(decay=0.0)
-        unet_step_one = self.simulate_backprop(unet)
-        ema_unet.step(unet_step_one.parameters())
-        step_one_shadow_params = ema_unet.shadow_params
-
-        unet_step_two = self.simulate_backprop(unet_step_one)
-        ema_unet.step(unet_step_two.parameters())
-        step_two_shadow_params = ema_unet.shadow_params
-
-        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
-            assert torch.allclose(step_one, step_two)
-
-    @skip_mps
-    def test_serialization(self):
-        unet, ema_unet = self.get_models()
-        noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs()
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            ema_unet.save_pretrained(tmpdir)
-            loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel)
-            loaded_unet = loaded_unet.to(unet.device)
-
-        # Since no EMA step has been performed the outputs should match.
-        output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-        output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-        assert torch.allclose(output, output_loaded, atol=1e-4)
diff --git a/diffusers/tests/test_hub_utils.py b/diffusers/tests/test_hub_utils.py
deleted file mode 100644
index e8b8ea3a2fd9b114ff184291e7ec73928ba885d7..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_hub_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from unittest.mock import Mock, patch
-
-import diffusers.utils.hub_utils
-
-
-class CreateModelCardTest(unittest.TestCase):
-    @patch("diffusers.utils.hub_utils.get_full_repo_name")
-    def test_create_model_card(self, repo_name_mock: Mock) -> None:
-        repo_name_mock.return_value = "full_repo_name"
-        with TemporaryDirectory() as tmpdir:
-            # Dummy args values
-            args = Mock()
-            args.output_dir = tmpdir
-            args.local_rank = 0
-            args.hub_token = "hub_token"
-            args.dataset_name = "dataset_name"
-            args.learning_rate = 0.01
-            args.train_batch_size = 100000
-            args.eval_batch_size = 10000
-            args.gradient_accumulation_steps = 0.01
-            args.adam_beta1 = 0.02
-            args.adam_beta2 = 0.03
-            args.adam_weight_decay = 0.0005
-            args.adam_epsilon = 0.000001
-            args.lr_scheduler = 1
-            args.lr_warmup_steps = 10
-            args.ema_inv_gamma = 0.001
-            args.ema_power = 0.1
-            args.ema_max_decay = 0.2
-            args.mixed_precision = True
-
-            # Model card must be rendered and saved
-            diffusers.utils.hub_utils.create_model_card(args, model_name="model_name")
-            self.assertTrue((Path(tmpdir) / "README.md").is_file())
diff --git a/diffusers/tests/test_image_processor.py b/diffusers/tests/test_image_processor.py
deleted file mode 100644
index 4f0e2c5aecfdaa63126e6d64fabf7cae4fba4b57..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_image_processor.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
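The `VaeImageProcessor` tests that follow all probe the same invariant from different input types: with resizing and normalization disabled, `postprocess(preprocess(x))` should reproduce `x` (up to the uint8 rounding the PIL path introduces). A minimal sketch of that roundtrip on the tensor path, using the same constructor arguments as the tests below:

    import torch

    from diffusers.image_processor import VaeImageProcessor

    processor = VaeImageProcessor(do_resize=False, do_normalize=False)

    image = torch.rand(1, 3, 8, 8)  # NCHW, values in [0, 1]
    roundtrip = processor.postprocess(processor.preprocess(image), output_type="pt")

    # With do_resize/do_normalize off, the pt -> pt roundtrip is (near-)lossless.
    assert torch.abs(image - roundtrip).max() < 1e-6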
- -import unittest - -import numpy as np -import PIL -import torch - -from diffusers.image_processor import VaeImageProcessor - - -class ImageProcessorTest(unittest.TestCase): - @property - def dummy_sample(self): - batch_size = 1 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - def to_np(self, image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt = self.dummy_sample - input_np = self.to_np(input_pt) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_pt), - output_type=output_type, - ) - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - input_pil = image_processor.numpy_to_pil(input_np) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_preprocess_input_3d(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt_4d = self.dummy_sample - input_pt_3d = input_pt_4d.squeeze(0) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) - out_pt_3d = image_processor.postprocess( - image_processor.preprocess(input_pt_3d), - output_type="np", - ) - - input_np_4d = self.to_np(self.dummy_sample) - input_np_3d = input_np_4d.squeeze(0) - - out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_np_4d), - output_type="np", - ) - out_np_3d = image_processor.postprocess( - image_processor.preprocess(input_np_3d), - output_type="np", - ) - - assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 - assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 - - def test_preprocess_input_list(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt_4d = self.dummy_sample - input_pt_list = list(input_pt_4d) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) - - out_pt_list = image_processor.postprocess( - 
image_processor.preprocess(input_pt_list),
-            output_type="np",
-        )
-
-        # exercise list-of-arrays inputs on the numpy path as well
-        input_np_4d = self.to_np(self.dummy_sample)
-        input_np_list = list(input_np_4d)
-
-        out_np_4d = image_processor.postprocess(
-            image_processor.preprocess(input_np_4d),
-            output_type="np",
-        )
-
-        out_np_list = image_processor.postprocess(
-            image_processor.preprocess(input_np_list),
-            output_type="np",
-        )
-
-        assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6
-        assert np.abs(out_np_4d - out_np_list).max() < 1e-6
diff --git a/diffusers/tests/test_layers_utils.py b/diffusers/tests/test_layers_utils.py
deleted file mode 100644
index d0e2102b539eed99d2a3c0910c1c7d2d9def4c6f..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_layers_utils.py
+++ /dev/null
@@ -1,586 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-import torch
-from torch import nn
-
-from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU, AttentionBlock
-from diffusers.models.embeddings import get_timestep_embedding
-from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
-from diffusers.models.transformer_2d import Transformer2DModel
-from diffusers.utils import torch_device
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-class EmbeddingsTests(unittest.TestCase):
-    def test_timestep_embeddings(self):
-        embedding_dim = 256
-        timesteps = torch.arange(16)
-
-        t1 = get_timestep_embedding(timesteps, embedding_dim)
-
-        # first vector should always be composed only of 0's and 1's
-        assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-5
-        assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-5
-
-        # last element of each vector should be one
-        assert (t1[:, -1] - 1).abs().sum() < 1e-5
-
-        # For large embeddings (e.g.
128) the frequency of every vector is higher - # than the previous one which means that the gradients of later vectors are - # ALWAYS higher than the previous ones - grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1) - - prev_grad = 0.0 - for grad in grad_mean: - assert grad > prev_grad - prev_grad = grad - - def test_timestep_defaults(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim) - t2 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, max_period=10_000 - ) - - assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3) - - def test_timestep_flip_sin_cos(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True) - t1 = torch.cat([t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], dim=-1) - - t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False) - - assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3) - - def test_timestep_downscale_freq_shift(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0) - t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1) - - # get cosine half (vectors that are wrapped into cosine) - cosine_half = (t1 - t2)[:, embedding_dim // 2 :] - - # cosine needs to be negative - assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-5 - - def test_sinoid_embeddings_hardcoded(self): - embedding_dim = 64 - timesteps = torch.arange(128) - - # standard unet, score_vde - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False) - # glide, ldm - t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True) - # grad-tts - t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000) - - assert torch.allclose( - t1[23:26, 47:50].flatten().cpu(), - torch.tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]), - 1e-3, - ) - assert torch.allclose( - t2[23:26, 47:50].flatten().cpu(), - torch.tensor([0.3019, 0.2280, 0.1716, 0.3146, 0.2377, 0.1790, 0.3272, 0.2474, 0.1864]), - 1e-3, - ) - assert torch.allclose( - t3[23:26, 47:50].flatten().cpu(), - torch.tensor([-0.9801, -0.9464, -0.9349, -0.3952, 0.8887, -0.9709, 0.5299, -0.2853, -0.9927]), - 1e-3, - ) - - -class Upsample2DBlockTests(unittest.TestCase): - def test_upsample_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=False) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.2173, -1.2079, -1.2079, 0.2952, 1.1254, 1.1254, 0.2952, 1.1254, 1.1254]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_conv(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=True) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.7145, 1.3773, 0.3492, 0.8448, 1.0839, -0.3341, 0.5956, 0.1250, -0.4841]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_conv_out_dim(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 
32) - upsample = Upsample2D(channels=32, use_conv=True, out_channels=64) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 64, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.2703, 0.1656, -0.2538, -0.0553, -0.2984, 0.1044, 0.1155, 0.2579, 0.7755]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_transpose(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.3028, -0.1582, 0.0071, 0.0350, -0.4799, -0.1139, 0.1056, -0.1153, -0.1046]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class Downsample2DBlockTests(unittest.TestCase): - def test_downsample_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=False) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.0513, -0.3889, 0.0640, 0.0836, -0.5460, -0.0341, -0.0169, -0.6967, 0.1179]) - max_diff = (output_slice.flatten() - expected_slice).abs().sum().item() - assert max_diff <= 1e-3 - # assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-1) - - def test_downsample_with_conv(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913], - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_downsample_with_conv_pad1(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True, padding=1) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_downsample_with_conv_out_dim(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True, out_channels=16) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 16, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.6586, 0.5985, 0.0721, 0.1256, -0.1492, 0.4436, -0.2544, 0.5021, 1.1522]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class ResnetBlock2DTests(unittest.TestCase): - def test_resnet_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, 
-3:, -3:] - expected_slice = torch.tensor( - [-1.9010, -0.2974, -0.8245, -1.3533, 0.8742, -0.9645, -2.0584, 1.3387, -0.4746], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_use_in_shortcut(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [0.2226, -1.0791, -0.1629, 0.3659, -0.2889, -1.2376, 0.0582, 0.9206, 0.0044], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_up(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, up=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 128, 128) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [1.2130, -0.8753, -0.9027, 1.5783, -0.5362, -0.5001, 1.0726, -0.7732, -0.4182], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_down(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_kernel_fir(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.0934, -0.5729, 0.0909, -0.2710, -0.5044, 0.0243, -0.0665, -0.5267, -0.3136], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_kernel_sde_vp(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class AttentionBlockTests(unittest.TestCase): - @unittest.skipIf( - torch_device ==
"mps", "Matmul crashes on MPS, see https://github.com/pytorch/pytorch/issues/84039" - ) - def test_attention_block_default(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - attentionBlock = AttentionBlock( - channels=32, - num_head_channels=1, - rescale_output_factor=1.0, - eps=1e-6, - norm_num_groups=32, - ).to(torch_device) - with torch.no_grad(): - attention_scores = attentionBlock(sample) - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.4975, -0.0038, -0.7847, -1.4567, 1.1220, -0.8962, -1.7394, 1.1319, -0.5427], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_attention_block_sd(self): - # This version uses SD params and is compatible with mps - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 512, 64, 64).to(torch_device) - attentionBlock = AttentionBlock( - channels=512, - rescale_output_factor=1.0, - eps=1e-6, - norm_num_groups=32, - ).to(torch_device) - with torch.no_grad(): - attention_scores = attentionBlock(sample) - - assert attention_scores.shape == (1, 512, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-0.6621, -0.0156, -3.2766, 0.8025, -0.8609, 0.2820, 0.0905, -1.1179, -3.2126], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class Transformer2DModelTests(unittest.TestCase): - def test_spatial_transformer_default(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - in_channels=32, - num_attention_heads=1, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=None, - ).to(torch_device) - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.9455, -0.0066, -1.3933, -1.5878, 0.5325, -0.6486, -1.8648, 0.7515, -0.9689], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_cross_attention_dim(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 64, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - in_channels=64, - num_attention_heads=2, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=64, - ).to(torch_device) - with torch.no_grad(): - context = torch.randn(1, 4, 64).to(torch_device) - attention_scores = spatial_transformer_block(sample, context).sample - - assert attention_scores.shape == (1, 64, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-0.2555, -0.8877, -2.4739, -2.2251, 1.2714, 0.0807, -0.4161, -1.6408, -0.0471], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_timestep(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_embeds_ada_norm = 5 - - sample = torch.randn(1, 64, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - 
in_channels=64, - num_attention_heads=2, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=64, - num_embeds_ada_norm=num_embeds_ada_norm, - ).to(torch_device) - with torch.no_grad(): - timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device) - timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device) - attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample - attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample - - assert attention_scores_1.shape == (1, 64, 64, 64) - assert attention_scores_2.shape == (1, 64, 64, 64) - - output_slice_1 = attention_scores_1[0, -1, -3:, -3:] - output_slice_2 = attention_scores_2[0, -1, -3:, -3:] - - expected_slice_1 = torch.tensor( - [-0.1874, -0.9704, -1.4290, -1.3357, 1.5138, 0.3036, -0.0976, -1.1667, 0.1283], device=torch_device - ) - expected_slice_2 = torch.tensor( - [-0.3493, -1.0924, -1.6161, -1.5016, 1.4245, 0.1367, -0.2526, -1.3109, -0.0547], device=torch_device - ) - - assert torch.allclose(output_slice_1.flatten(), expected_slice_1, atol=1e-3) - assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3) - - def test_spatial_transformer_dropout(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - spatial_transformer_block = ( - Transformer2DModel( - in_channels=32, - num_attention_heads=2, - attention_head_dim=16, - dropout=0.3, - cross_attention_dim=None, - ) - .to(torch_device) - .eval() - ) - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.9380, -0.0083, -1.3771, -1.5819, 0.5209, -0.6441, -1.8545, 0.7563, -0.9615], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - @unittest.skipIf(torch_device == "mps", "MPS does not support float64") - def test_spatial_transformer_discrete(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_embed = 5 - - sample = torch.randint(0, num_embed, (1, 32)).to(torch_device) - spatial_transformer_block = ( - Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - num_vector_embeds=num_embed, - sample_size=16, - ) - .to(torch_device) - .eval() - ) - - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, num_embed - 1, 32) - - output_slice = attention_scores[0, -2:, -3:] - - expected_slice = torch.tensor([-1.7648, -1.0241, -2.0985, -1.8035, -1.6404, -1.2098], device=torch_device) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_default_norm_layers(self): - spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) - - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == nn.LayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm - - def test_spatial_transformer_ada_norm_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - num_embeds_ada_norm=5, - ) - - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == 
nn.LayerNorm - - def test_spatial_transformer_default_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - ) - - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear - - dim = 32 - inner_dim = 128 - - # First dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim - # NOTE: inner_dim * 2 because GEGLU - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim * 2 - - # Second dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim - - def test_spatial_transformer_geglu_approx_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - activation_fn="geglu-approximate", - ) - - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear - - dim = 32 - inner_dim = 128 - - # First dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim - - # Second dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim - - def test_spatial_transformer_attention_bias(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32, attention_bias=True - ) - - assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None diff --git a/diffusers/tests/test_modeling_common.py b/diffusers/tests/test_modeling_common.py deleted file mode 100644 index 40aba3b24967683b2e64b53402d9f8bdc93be2a8..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_modeling_common.py +++ /dev/null @@ -1,447 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
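The feed-forward assertions above rest on one detail of GEGLU: a single linear layer projects from `dim` to `inner_dim * 2`, and the result is split into hidden states and a gate, with the GELU of the gate multiplying the hidden states. A minimal sketch of that gating, assuming plain PyTorch (a hypothetical standalone module for illustration, not the library's exact `GEGLU` class):

import torch
import torch.nn.functional as F
from torch import nn


class GEGLUSketch(nn.Module):
    # One matmul produces both halves, which is why the tests expect
    # ff.net[0].proj.out_features == inner_dim * 2.
    def __init__(self, dim: int, inner_dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, inner_dim * 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)  # split the doubled projection
        return hidden * F.gelu(gate)

With `dim = 32` and `inner_dim = 128` as in the tests, `proj.in_features` is 32 and `proj.out_features` is 256, which is exactly what the shape checks on `ff.net[0]` assert.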
- -import inspect -import tempfile -import unittest -import unittest.mock as mock -from typing import Dict, List, Tuple - -import numpy as np -import requests_mock -import torch -from requests.exceptions import HTTPError - -from diffusers.models import UNet2DConditionModel -from diffusers.training_utils import EMAModel -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -class ModelUtilsTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - - import diffusers - - diffusers.utils.import_utils._safetensors_available = True - - def test_accelerate_loading_error_message(self): - with self.assertRaises(ValueError) as error_context: - UNet2DConditionModel.from_pretrained("hf-internal-testing/stable-diffusion-broken", subfolder="unet") - - # make sure that error message states what keys are missing - assert "conv_out.bias" in str(error_context.exception) - - def test_cached_files_are_used_when_no_internet(self): - # A mock response for an HTTP head request to emulate server down - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - - # Download this model to make sure it's in the cache. - orig_model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet" - ) - - # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("requests.request", return_value=response_mock): - # Download this model to make sure it's in the cache. - model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", local_files_only=True - ) - - for p1, p2 in zip(orig_model.parameters(), model.parameters()): - if p1.data.ne(p2.data).sum() > 0: - assert False, "Parameters not the same!" - - def test_one_request_upon_cached(self): - # TODO: For some reason this test fails on MPS where no HEAD call is made. 
- if torch_device == "mps": - return - - import diffusers - - diffusers.utils.import_utils._safetensors_available = False - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model" - assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model" - - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname - ) - - cache_requests = [r.method for r in m.request_history] - assert ( - "HEAD" == cache_requests[0] and len(cache_requests) == 1 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - diffusers.utils.import_utils._safetensors_available = True - - def test_weight_overwrite(self): - with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - ) - - # make sure that error message states what keys are missing - assert "Cannot load" in str(error_context.exception) - - with tempfile.TemporaryDirectory() as tmpdirname: - model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - low_cpu_mem_usage=False, - ignore_mismatched_sizes=True, - ) - - assert model.config.in_channels == 9 - - -class ModelTesterMixin: - def test_from_save_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.to(torch_device) - model.eval() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - new_model.to(torch_device) - - with torch.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - - new_image = new_model(**inputs_dict) - - if isinstance(new_image, dict): - new_image = new_image.sample - - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - - def test_from_save_pretrained_variant(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.to(torch_device) - model.eval() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, variant="fp16") - new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16") - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - - # non-variant cannot be loaded - with self.assertRaises(OSError) as error_context: - self.model_class.from_pretrained(tmpdirname) - - # make sure that error message states what keys are missing - assert "Error no file named 
diffusion_pytorch_model.bin found in directory" in str(error_context.exception) - - new_model.to(torch_device) - - with torch.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - - new_image = new_model(**inputs_dict) - - if isinstance(new_image, dict): - new_image = new_image.sample - - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - - @require_torch_gpu - def test_from_save_pretrained_dynamo(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model = torch.compile(model) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - - assert new_model.__class__ == self.model_class - - def test_from_save_pretrained_dtype(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - if torch_device == "mps" and dtype == torch.bfloat16: - continue - with tempfile.TemporaryDirectory() as tmpdirname: - model.to(dtype) - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype) - assert new_model.dtype == dtype - new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype) - assert new_model.dtype == dtype - - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 16 - init_dict["block_out_channels"] = (16, 32) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_signature(self): - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic 
- arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["sample", "timestep"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model_from_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - # test if the model can be loaded from the config - # and has all the expected shape - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - new_model.eval() - - # check if all parameters shape are the same - for param_name in model.state_dict().keys(): - param_1 = model.state_dict()[param_name] - param_2 = new_model.state_dict()[param_name] - self.assertEqual(param_1.shape, param_2.shape) - - with torch.no_grad(): - output_1 = model(**inputs_dict) - - if isinstance(output_1, dict): - output_1 = output_1.sample - - output_2 = new_model(**inputs_dict) - - if isinstance(output_2, dict): - output_2 = output_2.sample - - self.assertEqual(output_1.shape, output_2.shape) - - @unittest.skipIf(torch_device == "mps", "Training is not supported in mps") - def test_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - - @unittest.skipIf(torch_device == "mps", "Training is not supported in mps") - def test_ema_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - ema_model = EMAModel(model.parameters()) - - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - ema_model.step(model.parameters()) - - def test_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - # Temporary fallback until `aten::_index_put_impl_` is implemented in mps - # Track progress in https://github.com/pytorch/pytorch/issues/77764 - device = t.device - if device.type == "mps": - t = t.to("cpu") - t[t != t] = 0 - return t.to(device) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - torch.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has" - f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}."
- ), - ) - - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs_dict = model(**inputs_dict) - outputs_tuple = model(**inputs_dict, return_dict=False) - - recursive_check(outputs_tuple, outputs_dict) - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_enable_disable_gradient_checkpointing(self): - if not self.model_class._supports_gradient_checkpointing: - return # Skip test if model does not support gradient checkpointing - - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - # at init model should have gradient checkpointing disabled - model = self.model_class(**init_dict) - self.assertFalse(model.is_gradient_checkpointing) - - # check enable works - model.enable_gradient_checkpointing() - self.assertTrue(model.is_gradient_checkpointing) - - # check disable works - model.disable_gradient_checkpointing() - self.assertFalse(model.is_gradient_checkpointing) - - def test_deprecated_kwargs(self): - has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters - has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are" - " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs` argument to" - f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument" - " from `_deprecated_kwargs = []`" - ) diff --git a/diffusers/tests/test_modeling_common_flax.py b/diffusers/tests/test_modeling_common_flax.py deleted file mode 100644 index 8945aed7c93fb1e664c7b6d799f7e0a96525b1a2..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_modeling_common_flax.py +++ /dev/null @@ -1,66 +0,0 @@ -import inspect - -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - - -@require_flax -class FlaxModelTesterMixin: - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"]) - jax.lax.stop_gradient(variables) - - output = model.apply(variables, inputs_dict["sample"]) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 16 - init_dict["block_out_channels"] = (16, 32) - - model = self.model_class(**init_dict) - variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"]) - jax.lax.stop_gradient(variables) - - output = model.apply(variables, inputs_dict["sample"]) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_deprecated_kwargs(self): - has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters - has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are" - " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs` argument to" - f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument" - " from `_deprecated_kwargs = []`" - ) diff --git a/diffusers/tests/test_outputs.py b/diffusers/tests/test_outputs.py deleted file mode 100644 index 50cbd1d54ee403f2b8e79c8ada629b6b97b1be66..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_outputs.py +++ /dev/null @@ -1,60 +0,0 @@ -import unittest -from dataclasses import dataclass -from typing import List, Union - -import numpy as np -import PIL.Image - -from diffusers.utils.outputs import BaseOutput - - -@dataclass -class CustomOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - - -class ConfigTester(unittest.TestCase): - def test_outputs_single_attribute(self): - outputs = CustomOutput(images=np.random.rand(1, 3, 4, 4)) - - # check every way of getting the attribute - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - - # test with a non-tensor attribute - outputs = CustomOutput(images=[PIL.Image.new("RGB", (4, 4))]) - - # check every way of getting the attribute - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) - - def test_outputs_dict_init(self): - # test output reinitialization with a `dict` for compatibility with `accelerate` - outputs = CustomOutput({"images": np.random.rand(1, 3, 4, 4)}) - - # check every way of getting the attribute - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - - # test with a non-tensor attribute - outputs = CustomOutput({"images": [PIL.Image.new("RGB", (4, 4))]}) - - # check every way of getting the attribute - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) diff --git a/diffusers/tests/test_pipelines.py b/diffusers/tests/test_pipelines.py deleted file mode 100644 index 0525eaca50daa74b6118e9669d36451d761a42e8..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_pipelines.py +++ /dev/null @@ -1,1300 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
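The output tests above check three equivalent access paths on a `BaseOutput` subclass: attribute (`outputs.images`), key (`outputs["images"]`), and position (`outputs[0]`). A rough sketch of how a dataclass backed by an ordered dict can serve all three (simplified and assumed for illustration; the real `BaseOutput` in `diffusers.utils.outputs` also handles dict re-initialization, as exercised by `test_outputs_dict_init`, plus further edge cases):

from collections import OrderedDict
from dataclasses import dataclass, fields
from typing import Optional


class MiniBaseOutput(OrderedDict):
    # Runs after the generated dataclass __init__: mirror the set fields into the
    # dict so key access and iteration see the same data as attribute access.
    def __post_init__(self):
        for f in fields(self):
            value = getattr(self, f.name)
            if value is not None:
                self[f.name] = value

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)  # key access
        return self.to_tuple()[key]  # positional access

    def to_tuple(self):
        return tuple(self[k] for k in self.keys())


@dataclass
class MiniImageOutput(MiniBaseOutput):
    images: Optional[list] = None


out = MiniImageOutput(images=[1, 2, 3])
assert out.images is out["images"] is out[0]  # all three paths return the same object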
- -import gc -import json -import os -import random -import shutil -import sys -import tempfile -import unittest -import unittest.mock as mock - -import numpy as np -import PIL -import requests_mock -import safetensors.torch -import torch -from parameterized import parameterized -from PIL import Image -from requests.exceptions import HTTPError -from transformers import CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMPipeline, - DDIMScheduler, - DDPMPipeline, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionPipeline, - UNet2DConditionModel, - UNet2DModel, - UniPCMultistepScheduler, - logging, -) -from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import ( - CONFIG_NAME, - WEIGHTS_NAME, - floats_tensor, - is_flax_available, - nightly, - require_torch_2, - slow, - torch_device, -) -from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, load_numpy, require_compel, require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DownloadTests(unittest.TestCase): - def test_one_request_upon_cached(self): - # TODO: For some reason this test fails on MPS where no HEAD call is made. - if torch_device == "mps": - return - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 15, "15 calls to files" - assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json" - assert ( - len(download_requests) == 32 - ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json" - - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - cache_requests = [r.method for r in m.request_history] - assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" - assert cache_requests.count("GET") == 1, "model info is only GET" - assert ( - len(cache_requests) == 2 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - def test_download_only_pytorch(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a flax file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f.endswith(".msgpack") for f in files) - # We need to never convert this tiny model to safetensors for this test to pass - assert not any(f.endswith(".safetensors") for f in files) - - def test_force_safetensors_error(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - with 
self.assertRaises(EnvironmentError): - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-no-safetensors", - safety_checker=None, - cache_dir=tmpdirname, - use_safetensors=True, - ) - - def test_returned_cached_folder(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - _, local_path = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, return_cached_folder=True - ) - pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) - - pipe = pipe.to(torch_device) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - generator = torch.manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_download_safetensors(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-safetensors", - safety_checker=None, - cache_dir=tmpdirname, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a pytorch file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f.endswith(".bin") for f in files) - - def test_download_no_safety_checker(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - pipe_2 = pipe_2.to(torch_device) - generator = torch.manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_load_no_safety_checker_explicit_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_load_no_safety_checker_default_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - pipe = pipe.to(torch_device) - - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = 
StableDiffusionPipeline.from_pretrained(tmpdirname) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_cached_files_are_used_when_no_internet(self): - # A mock response for an HTTP head request to emulate server down - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - - # Download this model to make sure it's in the cache. - orig_pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")} - - # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("requests.request", return_value=response_mock): - # Download this model to make sure it's in the cache. - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, local_files_only=True - ) - comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} - - for m1, m2 in zip(orig_comps.values(), comps.values()): - for p1, p2 in zip(m1.parameters(), m2.parameters()): - if p1.data.ne(p2.data).sum() > 0: - assert False, "Parameters not the same!" - - def test_download_from_variant_folder(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - assert not any(f.endswith(other_format) for f in files) - # no variants - assert not any(len(f.split(".")) == 3 for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_all(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "fp16" - - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a non-variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # unet, vae, text_encoder, safety_checker - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4 - # all checkpoints should have variant ending - assert 
not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) - assert not any(f.endswith(other_format) for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_partly(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "no_ema" - - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - unet_files = os.listdir(os.path.join(tmpdirname, "unet")) - - # Some of the downloaded files should be a non-variant file, check: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # only unet has "no_ema" variant - assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1 - # vae, safety_checker and text_encoder should have no variant - assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3 - assert not any(f.endswith(other_format) for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_broken_variant(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - # text encoder is missing no variant and "no_ema" variant weights, so the following can't work - for variant in [None, "no_ema"]: - with self.assertRaises(OSError) as error_context: - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/stable-diffusion-broken-variants", - cache_dir=tmpdirname, - variant=variant, - ) - - assert "Error no file name" in str(error_context.exception) - - # text encoder has fp16 variants so we can load it - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-broken-variants", cache_dir=tmpdirname, variant="fp16" - ) - - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a non-variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # only unet has "no_ema" variant - - diffusers.utils.import_utils._safetensors_available = True - - def test_text_inversion_download(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - - num_tokens = len(pipe.tokenizer) - - # single token load local - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<*>": torch.ones((32,))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname) - - token = pipe.tokenizer.convert_tokens_to_ids("<*>") - assert token == num_tokens, 
"Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 - assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>" - - prompt = "hey <*>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # single token load local with weight name - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<**>": 2 * torch.ones((1, 32))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin") - - token = pipe.tokenizer.convert_tokens_to_ids("<**>") - assert token == num_tokens + 1, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 - assert pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>" - - prompt = "hey <**>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<***>": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname) - - token = pipe.tokenizer.convert_tokens_to_ids("<***>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<***>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<***>_2") - - assert token == num_tokens + 2, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***><***>_1<***>_2" - - prompt = "hey <***>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load a1111 - with tempfile.TemporaryDirectory() as tmpdirname: - ten = { - "string_to_param": { - "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))]) - }, - "name": "<****>", - } - torch.save(ten, os.path.join(tmpdirname, "a1111.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin") - - token = pipe.tokenizer.convert_tokens_to_ids("<****>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<****>_2") - - assert token == num_tokens + 5, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****><****>_1<****>_2" - - prompt = "hey <****>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - -class CustomPipelineTests(unittest.TestCase): - def test_load_custom_pipeline(self): - pipeline = 
DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline" - ) - pipeline = pipeline.to(torch_device) - # NOTE that `"CustomPipeline"` is not a class that is defined in this library, but solely on the Hub - # under https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L24 - assert pipeline.__class__.__name__ == "CustomPipeline" - - def test_load_custom_github(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="main" - ) - - # make sure that on "main" pipeline gives only ones because of: https://github.com/huggingface/diffusers/pull/1690 - with torch.no_grad(): - output = pipeline() - - assert output.numel() == output.sum() - - # hack since Python doesn't like overwriting modules: https://stackoverflow.com/questions/3105801/unload-a-module-in-python - # Could in the future work with hashes instead. - del sys.modules["diffusers_modules.git.one_step_unet"] - - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="0.10.2" - ) - with torch.no_grad(): - output = pipeline() - - assert output.numel() != output.sum() - - assert pipeline.__class__.__name__ == "UnetSchedulerOneForwardPipeline" - - def test_run_custom_pipeline(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline" - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert images[0].shape == (1, 32, 32, 3) - - # compare output to https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L102 - assert output_str == "This is a test" - - def test_local_custom_pipeline_repo(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102 - assert output_str == "This is a local test" - - def test_local_custom_pipeline_file(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102 - assert output_str == "This is a local test" - - @slow - @require_torch_gpu - def test_download_from_git(self): - clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - - feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id) - clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) - - pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - 
custom_pipeline="clip_guided_stable_diffusion",
-            clip_model=clip_model,
-            feature_extractor=feature_extractor,
-            torch_dtype=torch.float16,
-        )
-        pipeline.enable_attention_slicing()
-        pipeline = pipeline.to(torch_device)
-
-        # NOTE that `"CLIPGuidedStableDiffusion"` is not a class that is defined in the PyPI package of the library, but solely in the community examples folder on GitHub under:
-        # https://github.com/huggingface/diffusers/blob/main/examples/community/clip_guided_stable_diffusion.py
-        assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion"
-
-        image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
-        assert image.shape == (512, 512, 3)
-
-
-class PipelineFastTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        import diffusers
-
-        diffusers.utils.import_utils._safetensors_available = True
-
-    def dummy_image(self):
-        batch_size = 1
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
-        return image
-
-    def dummy_uncond_unet(self, sample_size=32):
-        torch.manual_seed(0)
-        model = UNet2DModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=sample_size,
-            in_channels=3,
-            out_channels=3,
-            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
-            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
-        )
-        return model
-
-    def dummy_cond_unet(self, sample_size=32):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=sample_size,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        return model
-
-    @property
-    def dummy_vae(self):
-        torch.manual_seed(0)
-        model = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        return model
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModel(config)
-
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values.to(device)
-                    return self
-
-            return Out()
-
-        return extract
-
-    @parameterized.expand(
-        [
-            [DDIMScheduler, DDIMPipeline, 32],
-            [DDPMScheduler, DDPMPipeline, 32],
-            [DDIMScheduler, DDIMPipeline, (32, 64)],
-            [DDPMScheduler, DDPMPipeline, (64, 32)],
-        ]
-    )
-    def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
-        unet = self.dummy_uncond_unet(sample_size)
-        scheduler = scheduler_fn()
-        pipeline = pipeline_fn(unet, scheduler).to(torch_device)
-
-        generator = torch.manual_seed(0)
-        out_image = pipeline(
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size
-        assert out_image.shape == (1, *sample_size, 3)
-
-    def test_stable_diffusion_components(self):
-        """Test that components property
works correctly""" - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - inpaint = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ).to(torch_device) - img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device) - text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.manual_seed(0) - image_inpaint = inpaint( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - image_img2img = img2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - ).images - image_text2img = text2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - ).images - - assert image_inpaint.shape == (1, 32, 32, 3) - assert image_img2img.shape == (1, 32, 32, 3) - assert image_text2img.shape == (1, 64, 64, 3) - - @require_torch_gpu - def test_pipe_false_offload_warn(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - sd.enable_model_cpu_offload() - - logger = logging.get_logger("diffusers.pipelines.pipeline_utils") - with CaptureLogger(logger) as cap_logger: - sd.to("cuda") - - assert "It is strongly recommended against doing so" in str(cap_logger) - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - def test_set_scheduler(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDIMScheduler) - sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDPMScheduler) - sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, PNDMScheduler) - sd.scheduler = LMSDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, LMSDiscreteScheduler) - sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, EulerDiscreteScheduler) - sd.scheduler = 
EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config)
-        assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler)
-        sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config)
-        assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
-
-    def test_set_scheduler_consistency(self):
-        unet = self.dummy_cond_unet()
-        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=pndm,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-
-        pndm_config = sd.scheduler.config
-        sd.scheduler = DDPMScheduler.from_config(pndm_config)
-        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
-        pndm_config_2 = sd.scheduler.config
-        pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config}
-
-        assert dict(pndm_config) == dict(pndm_config_2)
-
-        sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=ddim,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-
-        ddim_config = sd.scheduler.config
-        sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config)
-        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
-        ddim_config_2 = sd.scheduler.config
-        ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config}
-
-        assert dict(ddim_config) == dict(ddim_config_2)
-
-    def test_save_safe_serialization(self):
-        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipeline.save_pretrained(tmpdirname, safe_serialization=True)
-
-            # Validate that the VAE safetensors file exists and is of the correct format
-            vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors")
-            assert os.path.exists(vae_path), f"Could not find {vae_path}"
-            _ = safetensors.torch.load_file(vae_path)
-
-            # Validate that the UNet safetensors file exists and is of the correct format
-            unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors")
-            assert os.path.exists(unet_path), f"Could not find {unet_path}"
-            _ = safetensors.torch.load_file(unet_path)
-
-            # Validate that the text encoder safetensors file exists and is of the correct format
-            text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors")
-            assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}"
-            _ = safetensors.torch.load_file(text_encoder_path)
-
-            pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname)
-            assert pipeline.unet is not None
-            assert pipeline.vae is not None
-            assert pipeline.text_encoder is not None
-            assert pipeline.scheduler is not None
-            assert pipeline.feature_extractor is not None
-
-    def test_no_pytorch_download_when_doing_safetensors(self):
-        # by default we don't download
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = StableDiffusionPipeline.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname
-            )
-
-            path = os.path.join(
-                tmpdirname,
-                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
-                "snapshots",
-                "07838d72e12f9bcec1375b0482b80c1d399be843",
-                "unet",
-            )
-            # safetensors exists
-            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
-            # pytorch does not
-            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
-
-    def test_no_safetensors_download_when_doing_pytorch(self):
-        # mock diffusers safetensors not available
-        import diffusers
-
-        diffusers.utils.import_utils._safetensors_available = False
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = StableDiffusionPipeline.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname
-            )
-
-            path = os.path.join(
-                tmpdirname,
-                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
-                "snapshots",
-                "07838d72e12f9bcec1375b0482b80c1d399be843",
-                "unet",
-            )
-            # safetensors does not exist
-            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
-            # pytorch does
-            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
-
-        diffusers.utils.import_utils._safetensors_available = True
-
-    def test_optional_components(self):
-        unet = self.dummy_cond_unet()
-        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        orig_sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=pndm,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=unet,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd = orig_sd
-
-        assert sd.config.requires_safety_checker is True
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that passing None works
-            sd = StableDiffusionPipeline.from_pretrained(
-                tmpdirname, feature_extractor=None, safety_checker=None, requires_safety_checker=False
-            )
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that loading previous None works
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-            orig_sd.save_pretrained(tmpdirname)
-
-            # Test that loading without any directory works
-            shutil.rmtree(os.path.join(tmpdirname, "safety_checker"))
-            with open(os.path.join(tmpdirname, sd.config_name)) as f:
-                config = json.load(f)
-            config["safety_checker"] = [None, None]
-            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
-                json.dump(config, f)
-
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False)
-            sd.save_pretrained(tmpdirname)
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-            # Test that loading from deleted model index works
-            with open(os.path.join(tmpdirname, sd.config_name)) as f:
-                config = json.load(f)
-            del config["safety_checker"]
-            del config["feature_extractor"]
-            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
-                json.dump(config, f)
-
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that partially loading works
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-            # Test that partially loading works
-            sd = StableDiffusionPipeline.from_pretrained(
-                tmpdirname,
-                feature_extractor=self.dummy_extractor,
-                safety_checker=unet,
-                requires_safety_checker=[True, True],
-            )
-
-            assert sd.config.requires_safety_checker == [True, True]
-            assert sd.config.safety_checker != (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
-
-            assert sd.config.requires_safety_checker == [True, True]
-            assert sd.config.safety_checker != (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-
-@slow
-@require_torch_gpu
-class PipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_smart_download(self):
-        model_id = "hf-internal-testing/unet-pipeline-dummy"
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
-            local_repo_name = "--".join(["models"] + model_id.split("/"))
-            snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
-            snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
-
-            # inspect all downloaded files to make sure that everything is included
-            assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name))
-            assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, WEIGHTS_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, "unet", WEIGHTS_NAME))
-            # let's make sure the super large numpy file:
-            # https://huggingface.co/hf-internal-testing/unet-pipeline-dummy/blob/main/big_array.npy
-            # is not downloaded, while all the expected files are
-            assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy"))
-
-    def test_warning_unused_kwargs(self):
-        model_id = "hf-internal-testing/unet-pipeline-dummy"
-        logger = logging.get_logger("diffusers.pipelines")
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            with CaptureLogger(logger) as cap_logger:
-                DiffusionPipeline.from_pretrained(
-                    model_id,
-                    not_used=True,
-                    cache_dir=tmpdirname,
-                    force_download=True,
-                )
-
-        assert (
-            cap_logger.out.strip().split("\n")[-1]
-            == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored."
-        )
-
-    def test_from_save_pretrained(self):
-        # 1.
Load models - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - new_ddpm.to(torch_device) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - @require_torch_2 - def test_from_save_pretrained_dynamo(self): - # 1. Load models - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - model = torch.compile(model) - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - new_ddpm.to(torch_device) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - def test_from_pretrained_hub(self): - model_path = "google/ddpm-cifar10-32" - - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm = ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm_from_hub = ddpm_from_hub.to(torch_device) - ddpm_from_hub.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - def test_from_pretrained_hub_pass_model(self): - model_path = "google/ddpm-cifar10-32" - - scheduler = DDPMScheduler(num_train_timesteps=10) - - # pass unet into DiffusionPipeline - unet = UNet2DModel.from_pretrained(model_path) - ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler) - ddpm_from_hub_custom_model = ddpm_from_hub_custom_model.to(torch_device) - ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) - - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - 
ddpm_from_hub = ddpm_from_hub.to(torch_device)
-        ddpm_from_hub.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
-
-    def test_output_format(self):
-        model_path = "google/ddpm-cifar10-32"
-
-        scheduler = DDIMScheduler.from_pretrained(model_path)
-        pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        images = pipe(output_type="numpy").images
-        assert images.shape == (1, 32, 32, 3)
-        assert isinstance(images, np.ndarray)
-
-        images = pipe(output_type="pil", num_inference_steps=4).images
-        assert isinstance(images, list)
-        assert len(images) == 1
-        assert isinstance(images[0], PIL.Image.Image)
-
-        # use PIL by default
-        images = pipe(num_inference_steps=4).images
-        assert isinstance(images, list)
-        assert isinstance(images[0], PIL.Image.Image)
-
-    def test_from_flax_from_pt(self):
-        pipe_pt = StableDiffusionPipeline.from_pretrained(
-            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
-        )
-        pipe_pt.to(torch_device)
-
-        if not is_flax_available():
-            raise ImportError("Make sure flax is installed.")
-
-        from diffusers import FlaxStableDiffusionPipeline
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipe_pt.save_pretrained(tmpdirname)
-
-            pipe_flax, params = FlaxStableDiffusionPipeline.from_pretrained(
-                tmpdirname, safety_checker=None, from_pt=True
-            )
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipe_flax.save_pretrained(tmpdirname, params=params)
-            pipe_pt_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None, from_flax=True)
-            pipe_pt_2.to(torch_device)
-
-        prompt = "Hello"
-
-        generator = torch.manual_seed(0)
-        image_0 = pipe_pt(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images[0]
-
-        generator = torch.manual_seed(0)
-        image_1 = pipe_pt_2(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images[0]
-
-        assert np.abs(image_0 - image_1).sum() < 1e-5, "Models don't give the same forward pass"
-
-    @require_compel
-    def test_weighted_prompts_compel(self):
-        from compel import Compel
-
-        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.enable_attention_slicing()
-
-        compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
-
-        prompt = "a red cat playing with a ball{}"
-
-        prompts = [prompt.format(s) for s in ["", "++", "--"]]
-
-        prompt_embeds = compel(prompts)
-
-        generator = [torch.Generator(device="cpu").manual_seed(33) for _ in range(prompt_embeds.shape[0])]
-
-        images = pipe(
-            prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20, output_type="numpy"
-        ).images
-
-        for i, image in enumerate(images):
-            expected_image = load_numpy(
-                "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-                f"/compel/forest_{i}.npy"
-            )
-
-            assert np.abs(image - expected_image).max() < 1e-2
-
-
-@nightly
-@require_torch_gpu
-class PipelineNightlyTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_ddpm_ddim_equality_batched(self):
-        seed = 0
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        ddpm_scheduler = DDPMScheduler()
-        ddim_scheduler = DDIMScheduler()
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
-        ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images
-
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
-        ddim_images = ddim(
-            batch_size=2,
-            generator=generator,
-            num_inference_steps=1000,
-            eta=1.0,
-            output_type="numpy",
-            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
-        ).images
-
-        # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(ddpm_images - ddim_images).max() < 1e-1
diff --git a/diffusers/tests/test_pipelines_common.py b/diffusers/tests/test_pipelines_common.py
deleted file mode 100644
index 13fbe924c799a6b427d1ed55b4d8f6cb4dd824fb..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_pipelines_common.py
+++ /dev/null
@@ -1,590 +0,0 @@
-import contextlib
-import gc
-import inspect
-import io
-import re
-import tempfile
-import unittest
-from typing import Callable, Union
-
-import numpy as np
-import torch
-
-import diffusers
-from diffusers import DiffusionPipeline
-from diffusers.utils import logging
-from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
-from diffusers.utils.testing_utils import require_torch, torch_device
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
-@require_torch
-class PipelineTesterMixin:
-    """
-    This mixin is designed to be used with unittest.TestCase classes.
-    It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline,
-    equivalence of dict and tuple outputs, etc.
-    """
-
-    # Canonical parameters that are passed to `__call__` regardless
-    # of the type of pipeline. They are always optional and have
-    # common-sense default values.
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_images_per_prompt",
-            "generator",
-            "latents",
-            "output_type",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-
-    # set these parameters to False in the child class if the pipeline does not support the corresponding functionality
-    test_attention_slicing = True
-    test_cpu_offload = True
-    test_xformers_attention = True
-
-    def get_generator(self, seed):
-        device = torch_device if torch_device != "mps" else "cpu"
-        generator = torch.Generator(device).manual_seed(seed)
-        return generator
-
-    @property
-    def pipeline_class(self) -> Union[Callable, DiffusionPipeline]:
-        raise NotImplementedError(
-            "You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_components(self):
-        raise NotImplementedError(
-            "You need to implement `get_dummy_components(self)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_inputs(self, device, seed=0):
-        raise NotImplementedError(
-            "You need to implement `get_dummy_inputs(self, device, seed)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    @property
-    def params(self) -> frozenset:
-        raise NotImplementedError(
-            "You need to set the attribute `params` in the child test class. "
-            "`params` is checked to ensure that all its values are present in the signature of `__call__`. "
-            "You can set `params` using one of the common sets of parameters defined in `pipeline_params.py`, "
-            "e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text-to-image pipelines, "
-            "including prompts and prompt embedding overrides. "
-            "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, "
-            "do not make modifications to the existing common sets of arguments. E.g., a text-to-image pipeline "
-            "with non-configurable height and width arguments should set the attribute as "
-            "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. "
-            "See existing pipeline tests for reference."
-        )
-
-    @property
-    def batch_params(self) -> frozenset:
-        raise NotImplementedError(
-            "You need to set the attribute `batch_params` in the child test class. "
-            "`batch_params` are the parameters required to be batched when passed to the pipeline's "
-            "`__call__` method. `pipeline_params.py` provides some common sets of parameters such as "
-            "`TEXT_TO_IMAGE_BATCH_PARAMS`, `IMAGE_VARIATION_BATCH_PARAMS`, etc... If your pipeline's "
-            "set of batch arguments has minor changes from one of the common sets of batch arguments, "
-            "do not make modifications to the existing common sets of batch arguments. E.g., a text-to-image "
-            "pipeline in which `negative_prompt` is not batched should set the attribute as "
-            "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. "
-            "See existing pipeline tests for reference."
- ) - - def tearDown(self): - # clean up the VRAM after each test in case of CUDA runtime errors - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) - - def test_pipeline_call_signature(self): - self.assertTrue( - hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method" - ) - - parameters = inspect.signature(self.pipeline_class.__call__).parameters - - optional_parameters = set() - - for k, v in parameters.items(): - if v.default != inspect._empty: - optional_parameters.add(k) - - parameters = set(parameters.keys()) - parameters.remove("self") - parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated - - remaining_required_parameters = set() - - for param in self.params: - if param not in parameters: - remaining_required_parameters.add(param) - - self.assertTrue( - len(remaining_required_parameters) == 0, - f"Required parameters not present: {remaining_required_parameters}", - ) - - remaining_required_optional_parameters = set() - - for param in self.required_optional_params: - if param not in optional_parameters: - remaining_required_optional_parameters.add(param) - - self.assertTrue( - len(remaining_required_optional_parameters) == 0, - f"Required optional parameters not present: {remaining_required_optional_parameters}", - ) - - def test_inference_batch_consistent(self): - self._test_inference_batch_consistent() - - def _test_inference_batch_consistent( - self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"] - ): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=diffusers.logging.FATAL) - - # batchify inputs - for batch_size in batch_sizes: - batched_inputs = {} - for name, value in inputs.items(): - if name in self.batch_params: - # prompt is string - if name == "prompt": - len_prompt = len(value) - # make unequal batch sizes - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - - # make last batch super long - batched_inputs[name][-1] = 2000 * "very long" - # or else we have images - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - else: - batched_inputs[name] = value - - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - - batched_inputs["output_type"] = None - - if self.pipeline_class.__name__ == "DanceDiffusionPipeline": - batched_inputs.pop("output_type") - - output = pipe(**batched_inputs) - - assert len(output[0]) == batch_size - - batched_inputs["output_type"] = "np" - - if self.pipeline_class.__name__ 
== "DanceDiffusionPipeline": - batched_inputs.pop("output_type") - - output = pipe(**batched_inputs)[0] - - assert output.shape[0] == batch_size - - logger.setLevel(level=diffusers.logging.WARNING) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() - - def _test_inference_batch_single_identical( - self, - test_max_difference=None, - test_mean_pixel_difference=None, - relax_max_difference=False, - expected_max_diff=1e-4, - additional_params_copy_to_batched_inputs=["num_inference_steps"], - ): - if test_max_difference is None: - # TODO(Pedro) - not sure why, but not at all reproducible at the moment it seems - # make sure that batched and non-batched is identical - test_max_difference = torch_device != "mps" - - if test_mean_pixel_difference is None: - # TODO same as above - test_mean_pixel_difference = torch_device != "mps" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=diffusers.logging.FATAL) - - # batchify inputs - batched_inputs = {} - batch_size = 3 - for name, value in inputs.items(): - if name in self.batch_params: - # prompt is string - if name == "prompt": - len_prompt = len(value) - # make unequal batch sizes - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - - # make last batch super long - batched_inputs[name][-1] = 2000 * "very long" - # or else we have images - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - elif name == "generator": - batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)] - else: - batched_inputs[name] = value - - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - - if self.pipeline_class.__name__ != "DanceDiffusionPipeline": - batched_inputs["output_type"] = "np" - - output_batch = pipe(**batched_inputs) - assert output_batch[0].shape[0] == batch_size - - inputs["generator"] = self.get_generator(0) - - output = pipe(**inputs) - - logger.setLevel(level=diffusers.logging.WARNING) - if test_max_difference: - if relax_max_difference: - # Taking the median of the largest differences - # is resilient to outliers - diff = np.abs(output_batch[0][0] - output[0][0]) - diff = diff.flatten() - diff.sort() - max_diff = np.median(diff[-5:]) - else: - max_diff = np.abs(output_batch[0][0] - output[0][0]).max() - assert max_diff < expected_max_diff - - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_batch[0][0], output[0][0]) - - def test_dict_tuple_outputs_equivalent(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0] - - max_diff = np.abs(to_np(output) - to_np(output_tuple)).max() - self.assertLess(max_diff, 1e-4) - - def test_components_function(self): - init_components = self.get_dummy_components() - pipe = self.pipeline_class(**init_components) - - self.assertTrue(hasattr(pipe, "components")) - self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def 
test_float16_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(torch_device, torch.float16) - pipe_fp16.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] - - max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.") - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_save_load_float16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for name, component in pipe_loaded.components.items(): - if hasattr(component, "dtype"): - self.assertTrue( - component.dtype == torch.float16, - f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.") - - def test_save_load_optional_components(self): - if not hasattr(self.pipeline_class, "_optional_components"): - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # set all optional components to None - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) - - @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices") - def test_to_device(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - pipe.to("cpu") - model_devices = [component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == "cpu" for device in model_devices)) - - output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] - self.assertTrue(np.isnan(output_cpu).sum() == 0) - - pipe.to("cuda") - model_devices = 
[component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == "cuda" for device in model_devices)) - - output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0] - self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) - - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - - pipe.to(torch_dtype=torch.float16) - model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass() - - def _test_attention_slicing_forward_pass( - self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3 - ): - if not self.test_attention_slicing: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_slicing = pipe(**inputs)[0] - - pipe.enable_attention_slicing(slice_size=1) - inputs = self.get_dummy_inputs(torch_device) - output_with_slicing = pipe(**inputs)[0] - - if test_max_difference: - max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() - self.assertLess(max_diff, expected_max_diff, "Attention slicing should not affect the inference results") - - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) - - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", - ) - def test_cpu_offload_forward_pass(self): - if not self.test_cpu_offload: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_sequential_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() - self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results") - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass() - - def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=True, expected_max_diff=1e-4): - if not self.test_xformers_attention: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_xformers_memory_efficient_attention() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - if 
test_max_difference: - max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results") - - assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0]) - - def test_progress_bar(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - - inputs = self.get_dummy_inputs(torch_device) - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - stderr = stderr.getvalue() - # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img, - # so we just match "5" in "#####| 1/5 [00:01<00:00]" - max_steps = re.search("/(.*?) ", stderr).group(1) - self.assertTrue(max_steps is not None and len(max_steps) > 0) - self.assertTrue( - f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step" - ) - - pipe.set_progress_bar_config(disable=True) - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled") - - def test_num_images_per_prompt(self): - sig = inspect.signature(self.pipeline_class.__call__) - - if "num_images_per_prompt" not in sig.parameters: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - batch_sizes = [1, 2] - num_images_per_prompts = [1, 2] - - for batch_size in batch_sizes: - for num_images_per_prompt in num_images_per_prompts: - inputs = self.get_dummy_inputs(torch_device) - - for key in inputs.keys(): - if key in self.batch_params: - inputs[key] = batch_size * [inputs[key]] - - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - - assert images.shape[0] == batch_size * num_images_per_prompt - - -# Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used. -# This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a -# reference image. -def assert_mean_pixel_difference(image, expected_image): - image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) - expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) - avg_diff = np.abs(image - expected_image).mean() - assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average" diff --git a/diffusers/tests/test_pipelines_flax.py b/diffusers/tests/test_pipelines_flax.py deleted file mode 100644 index a461930f3a83ecfc8134d50ce5978d329d79f5c9..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_pipelines_flax.py +++ /dev/null @@ -1,226 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
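The Flax pipeline tests below all rely on the same multi-device idiom: `jax.pmap` compiles the pipeline call once per device, `flax.jax_utils.replicate` copies the parameters onto every device, and `shard` splits the global batch across devices, while `static_broadcasted_argnums=(3,)` marks `num_inference_steps` as a compile-time constant rather than a sharded input. Here is a minimal, self-contained sketch of that idiom, with a toy `apply` function standing in for `pipeline.__call__` (the function and its names are illustrative, not part of the test suite):

import jax
import jax.numpy as jnp
from flax.jax_utils import replicate
from flax.training.common_utils import shard


def apply(params, batch):
    # toy stand-in for pipeline.__call__: scale the batch by a replicated parameter
    return params["scale"] * batch


params = {"scale": jnp.float32(2.0)}
inputs = jnp.ones((jax.device_count() * 4, 3))  # global batch across all devices

p_apply = jax.pmap(apply)   # compile once, run on every device
params = replicate(params)  # add a leading device axis to every leaf
inputs = shard(inputs)      # reshape to (num_devices, 4, 3)

out = p_apply(params, inputs)  # executes in parallel, one shard per device
assert out.shape == (jax.device_count(), 4, 3)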
- -import os -import tempfile -import unittest - -import numpy as np - -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax, slow - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - from jax import pmap - - from diffusers import FlaxDDIMScheduler, FlaxDiffusionPipeline, FlaxStableDiffusionPipeline - - -@require_flax -class DownloadTests(unittest.TestCase): - def test_download_only_pytorch(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - _ = FlaxDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a PyTorch file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_pytorch_model.bin - assert not any(f.endswith(".bin") for f in files) - - -@slow -@require_flax -class FlaxPipelineTests(unittest.TestCase): - def test_dummy_all_tpus(self): - pipeline, params = FlaxStableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None - ) - - prompt = ( - "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of" - " field, close up, split lighting, cinematic" - ) - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 4 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = pipeline.prepare_inputs(prompt) - - p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,)) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, num_samples) - prompt_ids = shard(prompt_ids) - - images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images - - assert images.shape == (num_samples, 1, 64, 64, 3) - if jax.device_count() == 8: - assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3 - assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1 - - images_pil = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) - - assert len(images_pil) == num_samples - - def test_stable_diffusion_v1_4(self): - pipeline, params = FlaxStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", revision="flax", safety_checker=None - ) - - prompt = ( - "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of" - " field, close up, split lighting, cinematic" - ) - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 50 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = pipeline.prepare_inputs(prompt) - - p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,)) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, num_samples) - prompt_ids = shard(prompt_ids) - - images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images - - assert images.shape == (num_samples, 1, 512, 512, 3) - if jax.device_count() == 8: - assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3 - assert 
np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16(self):
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="bf16", dtype=jnp.bfloat16, safety_checker=None
-        )
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,))
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="bf16", dtype=jnp.bfloat16
-        )
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
-        scheduler = FlaxDDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4",
-            revision="bf16",
-            dtype=jnp.bfloat16,
-            scheduler=scheduler,
-            safety_checker=None,
-        )
-        scheduler_state = scheduler.create_state()
-
-        params["scheduler"] = scheduler_state
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,))
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
diff --git a/diffusers/tests/test_pipelines_onnx_common.py b/diffusers/tests/test_pipelines_onnx_common.py
deleted file mode 100644
index 575ecd0075318e8ec62ab7cd76bff5b0b1ca82ad..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_pipelines_onnx_common.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from diffusers.utils.testing_utils import require_onnxruntime
-
-
-@require_onnxruntime
-class OnnxPipelineTesterMixin:
-    """
-    This mixin is designed to be used with unittest.TestCase classes.
-    It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
-    equivalence of dict and tuple outputs, etc.
-    """
-
-    pass
diff --git a/diffusers/tests/test_training.py b/diffusers/tests/test_training.py
deleted file mode 100644
index d540f997622148082874272ff7cebffea4d4450d..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_training.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers import DDIMScheduler, DDPMScheduler, UNet2DModel
-from diffusers.training_utils import set_seed
-from diffusers.utils.testing_utils import slow
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-class TrainingTests(unittest.TestCase):
-    def get_model_optimizer(self, resolution=32):
-        set_seed(0)
-        model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3)
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
-        return model, optimizer
-
-    @slow
-    def test_training_step_equality(self):
-        device = "cpu"  # ensure full determinism without setting the CUBLAS_WORKSPACE_CONFIG env variable
-        ddpm_scheduler = DDPMScheduler(
-            num_train_timesteps=1000,
-            beta_start=0.0001,
-            beta_end=0.02,
-            beta_schedule="linear",
-            clip_sample=True,
-        )
-        ddim_scheduler = DDIMScheduler(
-            num_train_timesteps=1000,
-            beta_start=0.0001,
-            beta_end=0.02,
-            beta_schedule="linear",
-            clip_sample=True,
-        )
-
-        assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps
-
-        # shared batches for DDPM and DDIM
-        set_seed(0)
-        clean_images = [torch.randn((4, 3, 32, 32)).clip(-1, 1).to(device) for _ in range(4)]
-        noise = [torch.randn((4, 3, 32, 32)).to(device) for _ in range(4)]
-        timesteps = [torch.randint(0, 1000, (4,)).long().to(device) for _ in range(4)]
-
-        # train with a DDPM scheduler
-        model, optimizer = self.get_model_optimizer(resolution=32)
-        model.train().to(device)
-        for i in range(4):
-            optimizer.zero_grad()
-            ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
-            ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample
-            loss = torch.nn.functional.mse_loss(ddpm_noise_pred, noise[i])
-            loss.backward()
-            optimizer.step()
-        del model, optimizer
-
-        # recreate the model and optimizer, and retry with DDIM
-        model, optimizer = self.get_model_optimizer(resolution=32)
-        model.train().to(device)
-        for i in range(4):
-            optimizer.zero_grad()
-            ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
-            ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample
-            loss = torch.nn.functional.mse_loss(ddim_noise_pred, noise[i])
-            loss.backward()
-            optimizer.step()
-        del model, optimizer
-
-        self.assertTrue(torch.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-5))
-        self.assertTrue(torch.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-5))
diff --git a/diffusers/tests/test_unet_2d_blocks.py b/diffusers/tests/test_unet_2d_blocks.py
deleted file mode 100644
index e560240422ace376e8ccca989da9144ee8e8d98d..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_unet_2d_blocks.py
+++ /dev/null
@@ -1,337 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-from diffusers.models.unet_2d_blocks import *  # noqa F403
-from diffusers.utils import torch_device
-
-from .test_unet_blocks_common import UNetBlockTesterMixin
-
-
-class DownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = DownBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [-0.0232, -0.9869, 0.8054, -0.0637, -0.1688, -1.4264, 0.4470, -1.3394, 0.0904]
-        super().test_output(expected_slice)
-
-
-class ResnetDownsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = ResnetDownsampleBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [0.0710, 0.2410, -0.7320, -1.0757, -1.1343, 0.3540, -0.0133, -0.2576, 0.0948]
-        super().test_output(expected_slice)
-
-
-class AttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [0.0636, 0.8964, -0.6234, -1.0131, 0.0844, 0.4935, 0.3437, 0.0911, -0.2957]
-        super().test_output(expected_slice)
-
-
-class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = CrossAttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.2440, -0.6953, -0.2140, -0.3874, 0.1966, 1.2077, 0.0441, -0.7718, 0.2800]
-        super().test_output(expected_slice)
-
-
-class SimpleCrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SimpleCrossAttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
-    def test_output(self):
-        expected_slice = [0.7921, -0.0992, -0.1962, -0.7695, -0.4242, 0.7804, 0.4737, 0.2765, 0.3338]
-        super().test_output(expected_slice)
-
-
-class SkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SkipDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_skip_sample=True)
-
-    def test_output(self):
-        expected_slice = [-0.0845, -0.2087, -0.2465, 0.0971, 0.1900, -0.0484, 0.2664, 0.4179, 0.5069]
-        super().test_output(expected_slice)
-
-
-class AttnSkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnSkipDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_skip_sample=True)
-
-    def test_output(self):
-        expected_slice = [0.5539, 0.1609, 0.4924, 0.0537, -0.1995, 0.4050, 0.0979, -0.2721, -0.0642]
-        super().test_output(expected_slice)
-
-
-class DownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = DownEncoderBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [1.1102, 0.5302, 0.4872, -0.0023, -0.8042, 0.0483, -0.3489, -0.5632, 0.7626]
-        super().test_output(expected_slice)
-
-
-class AttnDownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnDownEncoderBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.8966, -0.1486, 0.8568, 0.8141, -0.9046, -0.1342, -0.0972, -0.7417, 0.1538]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2D  # noqa F405
-    block_type = "mid"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "temb_channels": 128,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [-0.1062, 1.7248, 0.3494, 1.4569, -0.0910, -1.2421, -0.9984, 0.6736, 1.0028]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2DCrossAttn  # noqa F405
-    block_type = "mid"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.1879, 2.2653, 0.5987, 1.1568, -0.8454, -1.6109, -0.8919, 0.8306, 1.6758]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2DSimpleCrossAttn  # noqa F405
-    block_type = "mid"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.7143, 1.9974, 0.5448, 1.3977, 0.1282, -1.1237, -1.4238, 0.5530, 0.8880]
-        super().test_output(expected_slice)
-
-
-class UpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [-0.2041, -0.4165, -0.3022, 0.0041, -0.6628, -0.7053, 0.1928, -0.0325, 0.0523]
-        super().test_output(expected_slice)
-
-
-class ResnetUpsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = ResnetUpsampleBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [0.2287, 0.3549, -0.1346, 0.4797, -0.1715, -0.9649, 0.7305, -0.5864, -0.6244]
-        super().test_output(expected_slice)
-
-
-class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = CrossAttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [-0.2796, -0.4364, -0.1067, -0.2693, 0.1894, 0.3869, -0.3470, 0.4584, 0.5091]
-        super().test_output(expected_slice)
-
-
-class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SimpleCrossAttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.2645, 0.1480, 0.0909, 0.8044, -0.9758, -0.9083, 0.0994, -1.1453, -0.7402]
-        super().test_output(expected_slice)
-
-
-class AttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
-    def test_output(self):
-        expected_slice = [0.0979, 0.1326, 0.0021, 0.0659, 0.2249, 0.0059, 0.1132, 0.5952, 0.1033]
-        super().test_output(expected_slice)
-
-
-class SkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SkipUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [-0.0893, -0.1234, -0.1506, -0.0332, 0.0123, -0.0211, 0.0566, 0.0143, 0.0362]
-        super().test_output(expected_slice)
-
-
-class AttnSkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnSkipUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [0.0361, 0.0617, 0.2787, -0.0350, 0.0342, 0.3421, -0.0843, 0.0913, 0.3015]
-        super().test_output(expected_slice)
-
-
-class UpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UpDecoderBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {"in_channels": 32, "out_channels": 32}
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.4404, 0.1998, -0.9886, -0.3320, -0.3128, -0.7034, -0.6955, -0.2338, -0.3137]
-        super().test_output(expected_slice)
-
-
-class AttnUpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnUpDecoderBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {"in_channels": 32, "out_channels": 32}
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.6738, 0.4491, 0.1055, 1.0710, 0.7316, 0.3339, 0.3352, 0.1023, 0.3568]
-        super().test_output(expected_slice)
diff --git a/diffusers/tests/test_unet_blocks_common.py b/diffusers/tests/test_unet_blocks_common.py
deleted file mode 100644
index 17b7f65d6da31c43f062eaa6bed7284ce85e471f..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_unet_blocks_common.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from typing import Tuple
-
-import torch
-
-from diffusers.utils import floats_tensor, randn_tensor, torch_all_close, torch_device
-from diffusers.utils.testing_utils import require_torch
-
-
-@require_torch
-class UNetBlockTesterMixin:
-    @property
-    def dummy_input(self):
-        return self.get_dummy_input()
-
-    @property
-    def output_shape(self):
-        if self.block_type == "down":
-            return (4, 32, 16, 16)
-        elif self.block_type == "mid":
-            return (4, 32, 32, 32)
-        elif self.block_type == "up":
-            return (4, 32, 64, 64)
-
-        raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.")
-
-    def get_dummy_input(
-        self,
-        include_temb=True,
-        include_res_hidden_states_tuple=False,
-        include_encoder_hidden_states=False,
-        include_skip_sample=False,
-    ):
-        batch_size = 4
-        num_channels = 32
-        sizes = (32, 32)
-
-        generator = torch.manual_seed(0)
-        device = torch.device(torch_device)
-        shape = (batch_size, num_channels) + sizes
-        hidden_states = randn_tensor(shape, generator=generator, device=device)
-        dummy_input = {"hidden_states": hidden_states}
-
-        if include_temb:
-            temb_channels = 128
-            dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator, device=device)
-
-        if include_res_hidden_states_tuple:
-            generator_1 = torch.manual_seed(1)
-            dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1, device=device),)
-
-        if include_encoder_hidden_states:
-            dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)).to(torch_device)
-
-        if include_skip_sample:
-            dummy_input["skip_sample"] = randn_tensor(((batch_size, 3) + sizes), generator=generator, device=device)
-
-        return dummy_input
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-            "temb_channels": 128,
-        }
-        if self.block_type == "up":
-            init_dict["prev_output_channel"] = 32
-
-        if self.block_type == "mid":
-            init_dict.pop("out_channels")
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self, expected_slice):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        unet_block = self.block_class(**init_dict)
-        unet_block.to(torch_device)
-        unet_block.eval()
-
-        with torch.no_grad():
-            output = unet_block(**inputs_dict)
-
-        if isinstance(output, Tuple):
-            output = output[0]
-
-        self.assertEqual(output.shape, self.output_shape)
-
-        output_slice = output[0, -1, -3:, -3:]
-        expected_slice = torch.tensor(expected_slice).to(torch_device)
-        assert torch_all_close(output_slice.flatten(), expected_slice, atol=5e-3)
-
-    @unittest.skipIf(torch_device == "mps", "Training is not supported in mps")
-    def test_training(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.block_class(**init_dict)
-        model.to(torch_device)
-        model.train()
-        output = model(**inputs_dict)
-
-        if isinstance(output, Tuple):
-            output = output[0]
-
-        device = torch.device(torch_device)
-        noise = randn_tensor(output.shape, device=device)
-        loss = torch.nn.functional.mse_loss(output, noise)
-        loss.backward()
diff --git a/diffusers/tests/test_utils.py b/diffusers/tests/test_utils.py
deleted file mode 100755
index 4fc4e1a06638ae14848424db24f212ae24afbf34..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_utils.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from diffusers import __version__
-from diffusers.utils import deprecate
-
-
-class DeprecateTester(unittest.TestCase):
-    higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:])
-    lower_version = "0.0.1"
-
-    def test_deprecate_function_arg(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertWarns(FutureWarning) as warning:
-            output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs)
-
-        assert output == 4
-        assert (
-            str(warning.warning)
-            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
-            " message"
-        )
-
-    def test_deprecate_function_arg_tuple(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertWarns(FutureWarning) as warning:
-            output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs)
-
-        assert output == 4
-        assert (
-            str(warning.warning)
-            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
-            " message"
-        )
-
-    def test_deprecate_function_args(self):
-        kwargs = {"deprecated_arg_1": 4, "deprecated_arg_2": 8}
-        with self.assertWarns(FutureWarning) as warning:
-            output_1, output_2 = deprecate(
-                ("deprecated_arg_1", self.higher_version, "Hey"),
-                ("deprecated_arg_2", self.higher_version, "Hey"),
-                take_from=kwargs,
-            )
-        assert output_1 == 4
-        assert output_2 == 8
-        assert (
-            str(warning.warnings[0].message)
-            == "The `deprecated_arg_1` argument is deprecated and will be removed in version"
-            f" {self.higher_version}. Hey"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == "The `deprecated_arg_2` argument is deprecated and will be removed in version"
-            f" {self.higher_version}. Hey"
-        )
-
-    def test_deprecate_function_incorrect_arg(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertRaises(TypeError) as error:
-            deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs)
-
-        assert "test_deprecate_function_incorrect_arg in" in str(error.exception)
-        assert "line" in str(error.exception)
-        assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception)
-
-    def test_deprecate_arg_no_kwarg(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "message"))
-
-        assert (
-            str(warning.warning)
-            == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_args_no_kwarg(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(
-                ("deprecated_arg_1", self.higher_version, "Hey"),
-                ("deprecated_arg_2", self.higher_version, "Hey"),
-            )
-        assert (
-            str(warning.warnings[0].message)
-            == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey"
-        )
-
-    def test_deprecate_class_obj(self):
-        class Args:
-            arg = 5
-
-        with self.assertWarns(FutureWarning) as warning:
-            arg = deprecate(("arg", self.higher_version, "message"), take_from=Args())
-
-        assert arg == 5
-        assert (
-            str(warning.warning)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_class_objs(self):
-        class Args:
-            arg = 5
-            foo = 7
-
-        with self.assertWarns(FutureWarning) as warning:
-            arg_1, arg_2 = deprecate(
-                ("arg", self.higher_version, "message"),
-                ("foo", self.higher_version, "message"),
-                ("does not exist", self.higher_version, "message"),
-                take_from=Args(),
-            )
-
-        assert arg_1 == 5
-        assert arg_2 == 7
-        assert (
-            str(warning.warning)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-        assert (
-            str(warning.warnings[0].message)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_incorrect_version(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertRaises(ValueError) as error:
-            deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs)
-
-        assert (
-            str(error.exception)
-            == "The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since diffusers' version"
-            f" {__version__} is >= {self.lower_version}"
-        )
-
-    def test_deprecate_incorrect_no_standard_warn(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
-
-        assert str(warning.warning) == "This message is better!!!"
-
-    def test_deprecate_stacklevel(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
-        assert str(warning.warning) == "This message is better!!!"
-        assert "diffusers/tests/test_utils.py" in warning.filename