diff --git a/diffusers/tests/__init__.py b/diffusers/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/conftest.py b/diffusers/tests/conftest.py deleted file mode 100644 index 6a02a38163ab01b1c2d0d12d5578e06d91b77cc8..0000000000000000000000000000000000000000 --- a/diffusers/tests/conftest.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# tests directory-specific settings - this file is run automatically -# by pytest before any tests are run - -import sys -import warnings -from os.path import abspath, dirname, join - - -# allow having multiple repository checkouts and not needing to remember to rerun -# 'pip install -e .[dev]' when switching between checkouts and running tests. -git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) -sys.path.insert(1, git_repo_path) - -# silence FutureWarning warnings in tests since often we can't act on them until -# they become normal warnings - i.e. the tests still need to test the current functionality -warnings.simplefilter(action="ignore", category=FutureWarning) - - -def pytest_addoption(parser): - from diffusers.utils.testing_utils import pytest_addoption_shared - - pytest_addoption_shared(parser) - - -def pytest_terminal_summary(terminalreporter): - from diffusers.utils.testing_utils import pytest_terminal_summary_main - - make_reports = terminalreporter.config.getoption("--make-reports") - if make_reports: - pytest_terminal_summary_main(terminalreporter, id=make_reports) diff --git a/diffusers/tests/fixtures/custom_pipeline/pipeline.py b/diffusers/tests/fixtures/custom_pipeline/pipeline.py deleted file mode 100644 index 9119ae30f42f58aab8a52f303c1879e4b3803468..0000000000000000000000000000000000000000 --- a/diffusers/tests/fixtures/custom_pipeline/pipeline.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import torch - -from diffusers import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
- - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[torch.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`torch.Generator`, *optional*): - A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation - deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), - generator=generator, - ) - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2. predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/diffusers/tests/fixtures/custom_pipeline/what_ever.py b/diffusers/tests/fixtures/custom_pipeline/what_ever.py deleted file mode 100644 index a8af08d3980a6e9dbd5af240792edf013cef7313..0000000000000000000000000000000000000000 --- a/diffusers/tests/fixtures/custom_pipeline/what_ever.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -# limitations under the License. - - -from typing import Optional, Tuple, Union - -import torch - -from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput - - -class CustomLocalPipeline(DiffusionPipeline): - r""" - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Parameters: - unet ([`UNet2DModel`]): U-Net architecture to denoise the encoded image. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of - [`DDPMScheduler`], or [`DDIMScheduler`]. - """ - - def __init__(self, unet, scheduler): - super().__init__() - self.register_modules(unet=unet, scheduler=scheduler) - - @torch.no_grad() - def __call__( - self, - batch_size: int = 1, - generator: Optional[torch.Generator] = None, - num_inference_steps: int = 50, - output_type: Optional[str] = "pil", - return_dict: bool = True, - **kwargs, - ) -> Union[ImagePipelineOutput, Tuple]: - r""" - Args: - batch_size (`int`, *optional*, defaults to 1): - The number of images to generate. - generator (`torch.Generator`, *optional*): - A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation - deterministic. - eta (`float`, *optional*, defaults to 0.0): - The eta parameter which controls the scale of the variance (0 is DDIM and 1 is one type of DDPM). - num_inference_steps (`int`, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generated image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. - - Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the - generated images. - """ - - # Sample gaussian noise to begin loop - image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), - generator=generator, - ) - image = image.to(self.device) - - # set step values - self.scheduler.set_timesteps(num_inference_steps) - - for t in self.progress_bar(self.scheduler.timesteps): - # 1. predict noise model_output - model_output = self.unet(image, t).sample - - # 2.
predict previous mean of image x_t-1 and add variance depending on eta - # eta corresponds to η in paper and should be between [0, 1] - # do x_t -> x_t-1 - image = self.scheduler.step(model_output, t, image).prev_sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == "pil": - image = self.numpy_to_pil(image) - - if not return_dict: - return (image,), "This is a local test" - - return ImagePipelineOutput(images=image), "This is a local test" diff --git a/diffusers/tests/fixtures/elise_format0.mid b/diffusers/tests/fixtures/elise_format0.mid deleted file mode 100644 index 33dbabe7ab1d4d28e43d9911255a510a8a672d77..0000000000000000000000000000000000000000 Binary files a/diffusers/tests/fixtures/elise_format0.mid and /dev/null differ diff --git a/diffusers/tests/models/__init__.py b/diffusers/tests/models/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/models/test_models_unet_1d.py b/diffusers/tests/models/test_models_unet_1d.py deleted file mode 100644 index b814f5f88a302c7c0bdc869ab7674c5657eee775..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_1d.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import torch - -from diffusers import UNet1DModel -from diffusers.utils import floats_tensor, slow, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class UNet1DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - - noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device) - time_step = torch.tensor([10] * batch_size).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 14, 16) - - @property - def output_shape(self): - return (4, 14, 16) - - def test_ema_training(self): - pass - - def test_training(self): - pass - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_determinism(self): - super().test_determinism() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained_variant(self): - super().test_from_save_pretrained_variant() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output(self): - super().test_output() - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64, 128, 256), - "in_channels": 14, - "out_channels": 14, - "time_embedding_type": "positional", - "use_timestep_embedding": True, - "flip_sin_to_cos": False, - "freq_shift": 1.0, - "out_block_type": "OutConv1DBlock", - "mid_block_type": "MidResTemporalBlock1D", - "down_block_types": ("DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"), - "up_block_types": ("UpResnetBlock1D", "UpResnetBlock1D", "UpResnetBlock1D"), - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_pretrained_hub(self): - model, loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="unet" - ) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output_pretrained(self): - model = UNet1DModel.from_pretrained("bglick13/hopper-medium-v2-value-function-hor32", subfolder="unet") - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_features = model.in_channels - seq_len = 16 - noise = torch.randn((1, seq_len, num_features)).permute( - 0, 2, 1 - ) # match original, we can update values and remove - time_step = torch.full((num_features,), 0) - - with torch.no_grad(): - output = model(noise, time_step).sample.permute(0, 2, 1) - - output_slice = output[0, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-2.137172, 
1.1426016, 0.3688687, -0.766922, 0.7303146, 0.11038864, -0.4760633, 0.13270172, 0.02591348]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-3)) - - def test_forward_with_norm_groups(self): - # Not implemented yet for this UNet - pass - - @slow - def test_unet_1d_maestro(self): - model_id = "harmonai/maestro-150k" - model = UNet1DModel.from_pretrained(model_id, subfolder="unet") - model.to(torch_device) - - sample_size = 65536 - noise = torch.sin(torch.arange(sample_size)[None, None, :].repeat(1, 2, 1)).to(torch_device) - timestep = torch.tensor([1]).to(torch_device) - - with torch.no_grad(): - output = model(noise, timestep).sample - - output_sum = output.abs().sum() - output_max = output.abs().max() - - assert (output_sum - 224.0896).abs() < 4e-2 - assert (output_max - 0.0607).abs() < 4e-4 - - -class UNetRLModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet1DModel - - @property - def dummy_input(self): - batch_size = 4 - num_features = 14 - seq_len = 16 - - noise = floats_tensor((batch_size, num_features, seq_len)).to(torch_device) - time_step = torch.tensor([10] * batch_size).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 14, 16) - - @property - def output_shape(self): - return (4, 14, 1) - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_determinism(self): - super().test_determinism() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_outputs_equivalence(self): - super().test_outputs_equivalence() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained(self): - super().test_from_save_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_from_save_pretrained_variant(self): - super().test_from_save_pretrained_variant() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_model_from_pretrained(self): - super().test_model_from_pretrained() - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output(self): - # UNetRL is a value function, so the output shape is different - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = torch.Size((inputs_dict["sample"].shape[0], 1)) - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_ema_training(self): - pass - - def test_training(self): - pass - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "in_channels": 14, - "out_channels": 14, - "down_block_types": ["DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D", "DownResnetBlock1D"], - "up_block_types": [], - "out_block_type": "ValueFunction", - "mid_block_type": "ValueFunctionMidBlock1D", - "block_out_channels": [32, 64, 128, 256], - "layers_per_block": 1, - "downsample_each_block": True, - "use_timestep_embedding": True, - "freq_shift": 1.0, - "flip_sin_to_cos": False, - "time_embedding_type": "positional", - "act_fn": "mish", - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def
test_from_pretrained_hub(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - self.assertIsNotNone(value_function) - self.assertEqual(len(vf_loading_info["missing_keys"]), 0) - - value_function.to(torch_device) - image = value_function(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device == "mps", "mish op not supported in MPS") - def test_output_pretrained(self): - value_function, vf_loading_info = UNet1DModel.from_pretrained( - "bglick13/hopper-medium-v2-value-function-hor32", output_loading_info=True, subfolder="value_function" - ) - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_features = value_function.in_channels - seq_len = 14 - noise = torch.randn((1, seq_len, num_features)).permute( - 0, 2, 1 - ) # match original, we can update values and remove - time_step = torch.full((num_features,), 0) - - with torch.no_grad(): - output = value_function(noise, time_step).sample - - # fmt: off - expected_output_slice = torch.tensor([165.25] * seq_len) - # fmt: on - self.assertTrue(torch.allclose(output, expected_output_slice, rtol=1e-3)) - - def test_forward_with_norm_groups(self): - # Not implemented yet for this UNet - pass diff --git a/diffusers/tests/models/test_models_unet_2d.py b/diffusers/tests/models/test_models_unet_2d.py deleted file mode 100644 index 8f831fcf7cbfb298c5e4deb489cc0edae1f76a51..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d.py +++ /dev/null @@ -1,297 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import math -import unittest - -import torch - -from diffusers import UNet2DModel -from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -class Unet2DModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("DownBlock2D", "AttnDownBlock2D"), - "up_block_types": ("AttnUpBlock2D", "UpBlock2D"), - "attention_head_dim": None, - "out_channels": 3, - "in_channels": 3, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - -class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 32, 32) - - @property - def output_shape(self): - return (4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "sample_size": 32, - "in_channels": 4, - "out_channels": 4, - "layers_per_block": 2, - "block_out_channels": (32, 64), - "attention_head_dim": 32, - "down_block_types": ("DownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "UpBlock2D"), - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input).sample - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") - def test_from_pretrained_accelerate(self): - model, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - model.to(torch_device) - image = model(**self.dummy_input).sample - - assert image is not None, "Make sure output is not None" - - @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") - def test_from_pretrained_accelerate_wont_change_results(self): - # by default model loading will use accelerate as `low_cpu_mem_usage=True` - model_accelerate, _ = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update", output_loading_info=True) - model_accelerate.to(torch_device) - model_accelerate.eval() - - noise = torch.randn( - 1, - model_accelerate.config.in_channels, - model_accelerate.config.sample_size, - model_accelerate.config.sample_size, - generator=torch.manual_seed(0), - ) - noise = noise.to(torch_device) - time_step = torch.tensor([10] * noise.shape[0]).to(torch_device) - - arr_accelerate =
model_accelerate(noise, time_step)["sample"] - - # the two models don't need to stay on the device at the same time - del model_accelerate - torch.cuda.empty_cache() - gc.collect() - - model_normal_load, _ = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True, low_cpu_mem_usage=False - ) - model_normal_load.to(torch_device) - model_normal_load.eval() - arr_normal_load = model_normal_load(noise, time_step)["sample"] - - assert torch_all_close(arr_accelerate, arr_normal_load, rtol=1e-3) - - def test_output_pretrained(self): - model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") - model.eval() - model.to(torch_device) - - noise = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - noise = noise.to(torch_device) - time_step = torch.tensor([10] * noise.shape[0]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3)) - - -class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [10]).to(dtype=torch.int32, device=torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64, 64, 64], - "in_channels": 3, - "layers_per_block": 1, - "out_channels": 3, - "time_embedding_type": "fourier", - "norm_eps": 1e-6, - "mid_block_scale_factor": math.sqrt(2.0), - "norm_num_groups": None, - "down_block_types": [ - "SkipDownBlock2D", - "AttnSkipDownBlock2D", - "SkipDownBlock2D", - "SkipDownBlock2D", - ], - "up_block_types": [ - "SkipUpBlock2D", - "SkipUpBlock2D", - "AttnSkipUpBlock2D", - "SkipUpBlock2D", - ], - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @slow - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - inputs = self.dummy_input - noise = floats_tensor((4, 3) + (256, 256)).to(torch_device) - inputs["sample"] = noise - image = model(**inputs) - - assert image is not None, "Make sure output is not None" - - @slow - def test_output_pretrained_ve_mid(self): - model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (256, 256) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253,
-10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step).sample - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256]) - # fmt: on - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - def test_forward_with_norm_groups(self): - # not required for this model - pass diff --git a/diffusers/tests/models/test_models_unet_2d_condition.py b/diffusers/tests/models/test_models_unet_2d_condition.py deleted file mode 100644 index c0cb9d3d8ebde3322c89dbec44f58f16985f6243..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d_condition.py +++ /dev/null @@ -1,944 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import os -import tempfile -import unittest - -import torch -from parameterized import parameterized - -from diffusers import UNet2DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor -from diffusers.utils import ( - floats_tensor, - load_hf_numpy, - logging, - require_torch_gpu, - slow, - torch_all_close, - torch_device, -) -from diffusers.utils.import_utils import is_xformers_available - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -def create_lora_layers(model): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - return lora_attn_procs - - -class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device) - - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return (4, 32, 32) - - @property - def output_shape(self): - return (4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"), - "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 2, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - model.enable_xformers_memory_efficient_attention() - - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_gradient_checkpointing(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - 
model.to(torch_device) - - assert not model.is_gradient_checkpointing and model.training - - out = model(**inputs_dict).sample - # run the backwards pass on the model. For simplicity, instead of a real loss - # we backprop on the mean difference from random targets - model.zero_grad() - - labels = torch.randn_like(out) - loss = (out - labels).mean() - loss.backward() - - # re-instantiate the model now enabling gradient checkpointing - model_2 = self.model_class(**init_dict) - # clone model - model_2.load_state_dict(model.state_dict()) - model_2.to(torch_device) - model_2.enable_gradient_checkpointing() - - assert model_2.is_gradient_checkpointing and model_2.training - - out_2 = model_2(**inputs_dict).sample - # run the backwards pass on the model. For simplicity, instead of a real loss - # we backprop on the mean difference from random targets - model_2.zero_grad() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - - # compare the output and parameters gradients - self.assertTrue((loss - loss_2).abs() < 1e-5) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - for name, param in named_params.items(): - self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5)) - - def test_model_with_attention_head_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_use_linear_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["use_linear_projection"] = True - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_cross_attention_dim_tuple(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["cross_attention_dim"] = (32, 32) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_simple_projection(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output)
- expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_with_class_embeddings_concat(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - batch_size, _, _, sample_size = inputs_dict["sample"].shape - - init_dict["class_embed_type"] = "simple_projection" - init_dict["projection_class_embeddings_input_dim"] = sample_size - init_dict["class_embeddings_concat"] = True - - inputs_dict["class_labels"] = floats_tensor((batch_size, sample_size)).to(torch_device) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - model.set_attention_slice("auto") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice("max") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice(2) - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - def test_model_sliceable_head_dim(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - - def check_sliceable_dim_attr(module: torch.nn.Module): - if hasattr(module, "set_attention_slice"): - assert isinstance(module.sliceable_head_dim, int) - - for child in module.children(): - check_sliceable_dim_attr(child) - - # retrieve number of attention layers - for module in model.children(): - check_sliceable_dim_attr(module) - - def test_special_attn_proc(self): - class AttnEasyProc(torch.nn.Module): - def __init__(self, num): - super().__init__() - self.weight = torch.nn.Parameter(torch.tensor(num)) - self.is_run = False - self.number = 0 - self.counter = 0 - - def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, number=None): - batch_size, sequence_length, _ = hidden_states.shape - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - query = attn.to_q(hidden_states) - - encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states - key = attn.to_k(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # linear proj - hidden_states = attn.to_out[0](hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - hidden_states += self.weight - - self.is_run = True - self.counter += 1 - self.number = number - - return hidden_states - - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 
(8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - - processor = AttnEasyProc(5.0) - - model.set_attn_processor(processor) - model(**inputs_dict, cross_attention_kwargs={"number": 123}).sample - - assert processor.counter == 12 - assert processor.is_run - assert processor.number == 123 - - def test_lora_processors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - sample1 = model(**inputs_dict).sample - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - # make sure we can set a list of attention processors - model.set_attn_processor(lora_attn_procs) - model.to(torch_device) - - # test that attn processors can be set to itself - model.set_attn_processor(model.attn_processors) - - with torch.no_grad(): - sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample1 - sample2).abs().max() < 1e-4 - assert (sample3 - sample4).abs().max() < 1e-4 - - # sample 2 and sample 3 should be different - assert (sample2 - sample3).abs().max() > 1e-4 - - def test_lora_save_load(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname) - - with torch.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_load_safetensors(self): - # enable deterministic behavior for gradient checkpointing - init_dict, 
inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname, safe_serialization=True) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname) - - with torch.no_grad(): - new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - - assert (sample - new_sample).abs().max() < 1e-4 - - # LoRA and no LoRA should NOT be the same - assert (sample - old_sample).abs().max() > 1e-4 - - def test_lora_save_safetensors_load_torch(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - model.set_attn_processor(lora_attn_procs) - # Saving in torch format; it reloads properly when given the filename directly - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - new_model.load_attn_procs(tmpdirname, weight_name="pytorch_lora_weights.bin") - - def
test_lora_save_torch_force_load_safetensors_error(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - model.set_attn_processor(lora_attn_procs) - # Saving in torch format; forcing a safetensors load must then raise an error - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_attn_procs(tmpdirname) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) - torch.manual_seed(0) - new_model = self.model_class(**init_dict) - new_model.to(torch_device) - with self.assertRaises(IOError) as e: - new_model.load_attn_procs(tmpdirname, use_safetensors=True) - self.assertIn("Error no file named pytorch_lora_weights.safetensors", str(e.exception)) - - def test_lora_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - - with torch.no_grad(): - old_sample = model(**inputs_dict).sample - - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - with torch.no_grad(): - sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample - - model.set_default_attn_processor() - - with torch.no_grad(): - new_sample = model(**inputs_dict).sample - - assert (sample - new_sample).abs().max() < 1e-4 - assert (sample - old_sample).abs().max() < 1e-4 - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_lora_xformers_on_off(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = (8, 16) - - torch.manual_seed(0) - model = self.model_class(**init_dict) - model.to(torch_device) - lora_attn_procs = create_lora_layers(model) - model.set_attn_processor(lora_attn_procs) - - # default - with torch.no_grad(): - sample = model(**inputs_dict).sample - - model.enable_xformers_memory_efficient_attention() - on_sample = model(**inputs_dict).sample - - model.disable_xformers_memory_efficient_attention() - off_sample = model(**inputs_dict).sample - - assert (sample - on_sample).abs().max() < 1e-4 - assert (sample - off_sample).abs().max() < 1e-4 - - -@slow -class UNet2DConditionModelIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def
tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = UNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", torch_dtype=torch_dtype, revision=revision - ) - model.to(torch_device).eval() - - return model - - def test_set_attention_slice_auto(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice("auto") - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_max(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice("max") - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_int(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - unet = self.get_unet_model() - unet.set_attention_slice(2) - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def test_set_attention_slice_list(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - # there are 32 sliceable layers - slice_list = 16 * [2, 3] - unet = self.get_unet_model() - unet.set_attention_slice(slice_list) - - latents = self.get_latents(33) - encoder_hidden_states = self.get_encoder_hidden_states(33) - timestep = 1 - - with torch.no_grad(): - _ = unet(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - mem_bytes = torch.cuda.max_memory_allocated() - - assert mem_bytes < 5 * 10**9 - - def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - hidden_states = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return hidden_states - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.4424, 0.1510, -0.1937, 0.2118, 0.3746, -0.3957, 0.0160, -0.0435]], - [47, 0.55, [-0.1508, 0.0379, -0.3075, 0.2540, 0.3633, -0.0821, 0.1719, -0.0207]], - [21, 0.89, [-0.6479, 0.6364, -0.3464, 0.8697, 0.4443, -0.6289, -0.0091, 0.1778]], - [9, 1000, [0.8888, -0.5659, 0.5834, -0.7469, 1.1912, -0.3923, 1.1241, -0.4424]], - # fmt: on - ] 
- ) - @require_torch_gpu - def test_compvis_sd_v1_4(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]], - [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]], - [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]], - [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_4_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.4430, 0.1570, -0.1867, 0.2376, 0.3205, -0.3681, 0.0525, -0.0722]], - [47, 0.55, [-0.1415, 0.0129, -0.3136, 0.2257, 0.3430, -0.0536, 0.2114, -0.0436]], - [21, 0.89, [-0.7091, 0.6664, -0.3643, 0.9032, 0.4499, -0.6541, 0.0139, 0.1750]], - [9, 1000, [0.8878, -0.5659, 0.5844, -0.7442, 1.1883, -0.3927, 1.1192, -0.4423]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_5(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5") - latents = self.get_latents(seed) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2695, -0.1669, 0.0073, -0.3181, -0.1187, -0.1676, -0.1395, -0.5972]], - [17, 0.55, [-0.1290, -0.2588, 0.0551, -0.0916, 0.3286, 0.0238, -0.3669, 0.0322]], - [8, 0.89, [-0.5283, 0.1198, 0.0870, -0.1141, 0.9189, -0.0150, 0.5474, 0.4319]], - [3, 1000, [-0.5601, 0.2411, -0.5435, 0.1268, 1.1338, -0.2427, -0.0280, -1.0020]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_v1_5_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-v1-5", fp16=True) - latents = self.get_latents(seed, fp16=True) - 
encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, 4, [-0.7639, 0.0106, -0.1615, -0.3487, -0.0423, -0.7972, 0.0085, -0.4858]], - [47, 0.55, [-0.6564, 0.0795, -1.9026, -0.6258, 1.8235, 1.2056, 1.2169, 0.9073]], - [21, 0.89, [0.0327, 0.4399, -0.6358, 0.3417, 0.4120, -0.5621, -0.0397, -1.0430]], - [9, 1000, [0.1600, 0.7303, -1.0556, -0.3515, -0.7440, -1.2037, -1.8149, -1.8931]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_inpaint(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting") - latents = self.get_latents(seed, shape=(4, 9, 64, 64)) - encoder_hidden_states = self.get_encoder_hidden_states(seed) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == (4, 4, 64, 64) - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.1047, -1.7227, 0.1067, 0.0164, -0.5698, -0.4172, -0.1388, 1.1387]], - [17, 0.55, [0.0975, -0.2856, -0.3508, -0.4600, 0.3376, 0.2930, -0.2747, -0.7026]], - [8, 0.89, [-0.0952, 0.0183, -0.5825, -0.1981, 0.1131, 0.4668, -0.0395, -0.3486]], - [3, 1000, [0.4790, 0.4949, -1.0732, -0.7158, 0.7959, -0.9478, 0.1105, -0.9741]], - # fmt: on - ] - ) - @require_torch_gpu - def test_compvis_sd_inpaint_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="runwayml/stable-diffusion-inpainting", fp16=True) - latents = self.get_latents(seed, shape=(4, 9, 64, 64), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == (4, 4, 64, 64) - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]], - [17, 0.55, [0.1164, -0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]], - [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]], - [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stabilityai_sd_v2_fp16(self, seed, timestep, expected_slice): - model = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) - latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - - 
timestep = torch.tensor([timestep], dtype=torch.long, device=torch_device) - - with torch.no_grad(): - sample = model(latents, timestep=timestep, encoder_hidden_states=encoder_hidden_states).sample - - assert sample.shape == latents.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) diff --git a/diffusers/tests/models/test_models_unet_2d_flax.py b/diffusers/tests/models/test_models_unet_2d_flax.py deleted file mode 100644 index 69a0704dca9dae32a7d612b82cbedc0454a0a1b5..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_2d_flax.py +++ /dev/null @@ -1,104 +0,0 @@ -import gc -import unittest - -from parameterized import parameterized - -from diffusers import FlaxUNet2DConditionModel -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import load_hf_numpy, require_flax, slow - - -if is_flax_available(): - import jax - import jax.numpy as jnp - - -@slow -@require_flax -class FlaxUNet2DConditionModelIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def get_latents(self, seed=0, shape=(4, 4, 64, 64), fp16=False): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - image = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype) - return image - - def get_unet_model(self, fp16=False, model_id="CompVis/stable-diffusion-v1-4"): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - revision = "bf16" if fp16 else None - - model, params = FlaxUNet2DConditionModel.from_pretrained( - model_id, subfolder="unet", dtype=dtype, revision=revision - ) - return model, params - - def get_encoder_hidden_states(self, seed=0, shape=(4, 77, 768), fp16=False): - dtype = jnp.bfloat16 if fp16 else jnp.float32 - hidden_states = jnp.array(load_hf_numpy(self.get_file_format(seed, shape)), dtype=dtype) - return hidden_states - - @parameterized.expand( - [ - # fmt: off - [83, 4, [-0.2323, -0.1304, 0.0813, -0.3093, -0.0919, -0.1571, -0.1125, -0.5806]], - [17, 0.55, [-0.0831, -0.2443, 0.0901, -0.0919, 0.3396, 0.0103, -0.3743, 0.0701]], - [8, 0.89, [-0.4863, 0.0859, 0.0875, -0.1658, 0.9199, -0.0114, 0.4839, 0.4639]], - [3, 1000, [-0.5649, 0.2402, -0.5518, 0.1248, 1.1328, -0.2443, -0.0325, -1.0078]], - # fmt: on - ] - ) - def test_compvis_sd_v1_4_flax_vs_torch_fp16(self, seed, timestep, expected_slice): - model, params = self.get_unet_model(model_id="CompVis/stable-diffusion-v1-4", fp16=True) - latents = self.get_latents(seed, fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, fp16=True) - - sample = model.apply( - {"params": params}, - latents, - jnp.array(timestep, dtype=jnp.int32), - encoder_hidden_states=encoder_hidden_states, - ).sample - - assert sample.shape == latents.shape - - output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32) - expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32) - - # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, in the same hardware - assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [83, 4, [0.1514, 0.0807, 0.1624, 0.1016, -0.1896, 0.0263, 0.0677, 0.2310]], - [17, 0.55, [0.1164, 
-0.0216, 0.0170, 0.1589, -0.3120, 0.1005, -0.0581, -0.1458]], - [8, 0.89, [-0.1758, -0.0169, 0.1004, -0.1411, 0.1312, 0.1103, -0.1996, 0.2139]], - [3, 1000, [0.1214, 0.0352, -0.0731, -0.1562, -0.0994, -0.0906, -0.2340, -0.0539]], - # fmt: on - ] - ) - def test_stabilityai_sd_v2_flax_vs_torch_fp16(self, seed, timestep, expected_slice): - model, params = self.get_unet_model(model_id="stabilityai/stable-diffusion-2", fp16=True) - latents = self.get_latents(seed, shape=(4, 4, 96, 96), fp16=True) - encoder_hidden_states = self.get_encoder_hidden_states(seed, shape=(4, 77, 1024), fp16=True) - - sample = model.apply( - {"params": params}, - latents, - jnp.array(timestep, dtype=jnp.int32), - encoder_hidden_states=encoder_hidden_states, - ).sample - - assert sample.shape == latents.shape - - output_slice = jnp.asarray(jax.device_get((sample[-1, -2:, -2:, :2].flatten())), dtype=jnp.float32) - expected_output_slice = jnp.array(expected_slice, dtype=jnp.float32) - - # Found torch (float16) and flax (bfloat16) outputs to be within this tolerance, on the same hardware - assert jnp.allclose(output_slice, expected_output_slice, atol=1e-2) diff --git a/diffusers/tests/models/test_models_unet_3d_condition.py b/diffusers/tests/models/test_models_unet_3d_condition.py deleted file mode 100644 index 5a0d74a3ea5ad6956e791029d4a3be2528ca4d28..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_unet_3d_condition.py +++ /dev/null @@ -1,241 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
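For reference, the Flax-vs-Torch equivalence checks above boil down to one call pattern. This is a minimal sketch that substitutes zero tensors for the `load_hf_numpy` fixtures; the checkpoint, revision, dtype, and shapes are taken directly from the tests:

    import jax.numpy as jnp
    from diffusers import FlaxUNet2DConditionModel

    # load the bf16 weights, as get_unet_model() does when fp16=True
    model, params = FlaxUNet2DConditionModel.from_pretrained(
        "CompVis/stable-diffusion-v1-4", subfolder="unet", dtype=jnp.bfloat16, revision="bf16"
    )

    # stand-ins for the gaussian-noise .npy fixtures the tests download
    latents = jnp.zeros((4, 4, 64, 64), dtype=jnp.bfloat16)
    encoder_hidden_states = jnp.zeros((4, 77, 768), dtype=jnp.bfloat16)

    # a Flax module is applied with explicit params rather than called directly
    sample = model.apply(
        {"params": params},
        latents,
        jnp.array(4, dtype=jnp.int32),  # timestep
        encoder_hidden_states=encoder_hidden_states,
    ).sample
    assert sample.shape == latents.shape
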
- -import unittest - -import numpy as np -import torch - -from diffusers.models import ModelMixin, UNet3DConditionModel -from diffusers.models.attention_processor import LoRAAttnProcessor -from diffusers.utils import ( - floats_tensor, - logging, - skip_mps, - torch_device, -) -from diffusers.utils.import_utils import is_xformers_available - -from ..test_modeling_common import ModelTesterMixin - - -logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False - - -def create_lora_layers(model): - lora_attn_procs = {} - for name in model.attn_processors.keys(): - cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim - if name.startswith("mid_block"): - hidden_size = model.config.block_out_channels[-1] - elif name.startswith("up_blocks"): - block_id = int(name[len("up_blocks.")]) - hidden_size = list(reversed(model.config.block_out_channels))[block_id] - elif name.startswith("down_blocks"): - block_id = int(name[len("down_blocks.")]) - hidden_size = model.config.block_out_channels[block_id] - - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) - lora_attn_procs[name] = lora_attn_procs[name].to(model.device) - - # add 1 to weights to mock trained weights - with torch.no_grad(): - lora_attn_procs[name].to_q_lora.up.weight += 1 - lora_attn_procs[name].to_k_lora.up.weight += 1 - lora_attn_procs[name].to_v_lora.up.weight += 1 - lora_attn_procs[name].to_out_lora.up.weight += 1 - - return lora_attn_procs - - -@skip_mps -class UNet3DConditionModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet3DConditionModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - num_frames = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels, num_frames) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - encoder_hidden_states = floats_tensor((batch_size, 4, 32)).to(torch_device) - - return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states} - - @property - def input_shape(self): - return (4, 4, 32, 32) - - @property - def output_shape(self): - return (4, 4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": (32, 64), - "down_block_types": ( - "CrossAttnDownBlock3D", - "DownBlock3D", - ), - "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"), - "cross_attention_dim": 32, - "attention_head_dim": 8, - "out_channels": 4, - "in_channels": 4, - "layers_per_block": 1, - "sample_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_enable_works(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - - model.enable_xformers_memory_efficient_attention() - - assert ( - model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__ - == "XFormersAttnProcessor" - ), "xformers is not enabled" - - # Overriding to set `norm_num_groups` needs to be different for this model. 
- def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 32 - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - # Overriding since the UNet3D outputs a different structure. - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - # Warmup pass when using mps (see #372) - if torch_device == "mps" and isinstance(model, ModelMixin): - model(**self.dummy_input) - - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_model_attention_slicing(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["attention_head_dim"] = 8 - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - model.set_attention_slice("auto") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice("max") - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - model.set_attention_slice(2) - with torch.no_grad(): - output = model(**inputs_dict) - assert output is not None - - # (`attn_processors`) needs to be implemented in this model for this test. - # def test_lora_processors(self): - - # (`attn_processors`) needs to be implemented in this model for this test. - # def test_lora_save_load(self): - - # (`attn_processors`) needs to be implemented for this test in the model. - # def test_lora_save_load_safetensors(self): - - # (`attn_processors`) needs to be implemented for this test in the model. - # def test_lora_save_safetensors_load_torch(self): - - # (`attn_processors`) needs to be implemented for this test. - # def test_lora_save_torch_force_load_safetensors_error(self): - - # (`attn_processors`) needs to be added for this test. 
-    # def test_lora_on_off(self):
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_lora_xformers_on_off(self):
-        # fix the seed so the xformers on/off comparison below is deterministic
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
-        init_dict["attention_head_dim"] = 4
-
-        torch.manual_seed(0)
-        model = self.model_class(**init_dict)
-        model.to(torch_device)
-        lora_attn_procs = create_lora_layers(model)
-        model.set_attn_processor(lora_attn_procs)
-
-        # default
-        with torch.no_grad():
-            sample = model(**inputs_dict).sample
-
-            model.enable_xformers_memory_efficient_attention()
-            on_sample = model(**inputs_dict).sample
-
-            model.disable_xformers_memory_efficient_attention()
-            off_sample = model(**inputs_dict).sample
-
-        assert (sample - on_sample).abs().max() < 1e-4
-        assert (sample - off_sample).abs().max() < 1e-4
-
-
-# (todo: sayakpaul) implement SLOW tests.
diff --git a/diffusers/tests/models/test_models_vae.py b/diffusers/tests/models/test_models_vae.py
deleted file mode 100644
index abd4a078e6922f8454bd6b3b7f8a35b53a834d80..0000000000000000000000000000000000000000
--- a/diffusers/tests/models/test_models_vae.py
+++ /dev/null
@@ -1,345 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
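One detail worth spelling out about `create_lora_layers` in the UNet3D tests above: diffusers zero-initializes each LoRA up-projection, so a freshly built processor leaves attention outputs unchanged and the on/off comparison would pass vacuously. The `+= 1` mock forces the LoRA branch to actually contribute. A small sketch of that property (the hidden sizes here are illustrative, not from the tests):

    import torch
    from diffusers.models.attention_processor import LoRAAttnProcessor

    proc = LoRAAttnProcessor(hidden_size=64, cross_attention_dim=32)
    assert torch.all(proc.to_q_lora.up.weight == 0)  # a fresh LoRA layer is a no-op

    with torch.no_grad():
        proc.to_q_lora.up.weight += 1  # now the LoRA path changes the attention output
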
- -import gc -import unittest - -import torch -from parameterized import parameterized - -from diffusers import AutoencoderKL -from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_gradient_checkpointing(self): - # enable deterministic behavior for gradient checkpointing - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - - assert not model.is_gradient_checkpointing and model.training - - out = model(**inputs_dict).sample - # run the backwards pass on the model. For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model.zero_grad() - - labels = torch.randn_like(out) - loss = (out - labels).mean() - loss.backward() - - # re-instantiate the model now enabling gradient checkpointing - model_2 = self.model_class(**init_dict) - # clone model - model_2.load_state_dict(model.state_dict()) - model_2.to(torch_device) - model_2.enable_gradient_checkpointing() - - assert model_2.is_gradient_checkpointing and model_2.training - - out_2 = model_2(**inputs_dict).sample - # run the backwards pass on the model. 
For backwards pass, for simplicity purpose, - # we won't calculate the loss and rather backprop on out.sum() - model_2.zero_grad() - loss_2 = (out_2 - labels).mean() - loss_2.backward() - - # compare the output and parameters gradients - self.assertTrue((loss - loss_2).abs() < 1e-5) - named_params = dict(model.named_parameters()) - named_params_2 = dict(model_2.named_parameters()) - for name, param in named_params.items(): - self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5)) - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") - model = model.to(torch_device) - model.eval() - - if torch_device == "mps": - generator = torch.manual_seed(0) - else: - generator = torch.Generator(device=torch_device).manual_seed(0) - - image = torch.randn( - 1, - model.config.in_channels, - model.config.sample_size, - model.config.sample_size, - generator=torch.manual_seed(0), - ) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image, sample_posterior=True, generator=generator).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - - # Since the VAE Gaussian prior's generator is seeded on the appropriate device, - # the expected output slices are not the same for CPU and GPU. - if torch_device == "mps": - expected_output_slice = torch.tensor( - [ - -4.0078e-01, - -3.8323e-04, - -1.2681e-01, - -1.1462e-01, - 2.0095e-01, - 1.0893e-01, - -8.8247e-02, - -3.0361e-01, - -9.8644e-03, - ] - ) - elif torch_device == "cpu": - expected_output_slice = torch.tensor( - [-0.1352, 0.0878, 0.0419, -0.0818, -0.1069, 0.0688, -0.1458, -0.4446, -0.0026] - ) - else: - expected_output_slice = torch.tensor( - [-0.2421, 0.4642, 0.2507, -0.0438, 0.0682, 0.3160, -0.2018, -0.0727, 0.2485] - ) - - self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) - - -@slow -class AutoencoderKLIntegrationTests(unittest.TestCase): - def get_file_format(self, seed, shape): - return f"gaussian_noise_s={seed}_shape={'_'.join([str(s) for s in shape])}.npy" - - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_sd_image(self, seed=0, shape=(4, 3, 512, 512), fp16=False): - dtype = torch.float16 if fp16 else torch.float32 - image = torch.from_numpy(load_hf_numpy(self.get_file_format(seed, shape))).to(torch_device).to(dtype) - return image - - def get_sd_vae_model(self, model_id="CompVis/stable-diffusion-v1-4", fp16=False): - revision = "fp16" if fp16 else None - torch_dtype = torch.float16 if fp16 else torch.float32 - - model = AutoencoderKL.from_pretrained( - model_id, - subfolder="vae", - torch_dtype=torch_dtype, - revision=revision, - ) - model.to(torch_device).eval() - - return model - - def get_generator(self, seed=0): - if torch_device == "mps": - return torch.manual_seed(seed) - return torch.Generator(device=torch_device).manual_seed(seed) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.1603, 0.9878, -0.0495, -0.0790, -0.2709, 0.8375, -0.2060, -0.0824], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]], - [47, [-0.2376, 0.1168, 
0.1332, -0.4840, -0.2508, -0.0791, -0.0493, -0.4089], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]], - # fmt: on - ] - ) - def test_stable_diffusion(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.0513, 0.0289, 1.3799, 0.2166, -0.2573, -0.0871, 0.5103, -0.0999]], - [47, [-0.4128, -0.1320, -0.3704, 0.1965, -0.4116, -0.2332, -0.3340, 0.2247]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - image = self.get_sd_image(seed, fp16=True) - generator = self.get_generator(seed) - - with torch.no_grad(): - sample = model(image, generator=generator, sample_posterior=True).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-2) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.1609, 0.9866, -0.0487, -0.0777, -0.2716, 0.8368, -0.2055, -0.0814], [-0.2395, 0.0098, 0.0102, -0.0709, -0.2840, -0.0274, -0.0718, -0.1824]], - [47, [-0.2377, 0.1147, 0.1333, -0.4841, -0.2506, -0.0805, -0.0491, -0.4085], [0.0350, 0.0847, 0.0467, 0.0344, -0.0842, -0.0547, -0.0633, -0.1131]], - # fmt: on - ] - ) - def test_stable_diffusion_mode(self, seed, expected_slice, expected_slice_mps): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - - with torch.no_grad(): - sample = model(image).sample - - assert sample.shape == image.shape - - output_slice = sample[-1, -2:, -2:, :2].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice_mps if torch_device == "mps" else expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [13, [-0.2051, -0.1803, -0.2311, -0.2114, -0.3292, -0.3574, -0.2953, -0.3323]], - [37, [-0.2632, -0.2625, -0.2199, -0.2741, -0.4539, -0.4990, -0.3720, -0.4925]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_decode(self, seed, expected_slice): - model = self.get_sd_vae_model() - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64)) - - with torch.no_grad(): - sample = model.decode(encoding).sample - - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) - - @parameterized.expand( - [ - # fmt: off - [27, [-0.0369, 0.0207, -0.0776, -0.0682, -0.1747, -0.1930, -0.1465, -0.2039]], - [16, [-0.1628, -0.2134, -0.2747, -0.2642, -0.3774, -0.4404, -0.3687, -0.4277]], - # fmt: on - ] - ) - @require_torch_gpu - def test_stable_diffusion_decode_fp16(self, seed, expected_slice): - model = self.get_sd_vae_model(fp16=True) - encoding = self.get_sd_image(seed, shape=(3, 4, 64, 64), fp16=True) - - with torch.no_grad(): - sample = model.decode(encoding).sample 
- - assert list(sample.shape) == [3, 3, 512, 512] - - output_slice = sample[-1, -2:, :2, -2:].flatten().float().cpu() - expected_output_slice = torch.tensor(expected_slice) - - assert torch_all_close(output_slice, expected_output_slice, atol=5e-3) - - @parameterized.expand( - [ - # fmt: off - [33, [-0.3001, 0.0918, -2.6984, -3.9720, -3.2099, -5.0353, 1.7338, -0.2065, 3.4267]], - [47, [-1.5030, -4.3871, -6.0355, -9.1157, -1.6661, -2.7853, 2.1607, -5.0823, 2.5633]], - # fmt: on - ] - ) - def test_stable_diffusion_encode_sample(self, seed, expected_slice): - model = self.get_sd_vae_model() - image = self.get_sd_image(seed) - generator = self.get_generator(seed) - - with torch.no_grad(): - dist = model.encode(image).latent_dist - sample = dist.sample(generator=generator) - - assert list(sample.shape) == [image.shape[0], 4] + [i // 8 for i in image.shape[2:]] - - output_slice = sample[0, -1, -3:, -3:].flatten().cpu() - expected_output_slice = torch.tensor(expected_slice) - - tolerance = 1e-3 if torch_device != "mps" else 1e-2 - assert torch_all_close(output_slice, expected_output_slice, atol=tolerance) diff --git a/diffusers/tests/models/test_models_vae_flax.py b/diffusers/tests/models/test_models_vae_flax.py deleted file mode 100644 index 8fedb85eccfc73e9a0900f7bb947887da3ffe4e9..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_vae_flax.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest - -from diffusers import FlaxAutoencoderKL -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - -from ..test_modeling_common_flax import FlaxModelTesterMixin - - -if is_flax_available(): - import jax - - -@require_flax -class FlaxAutoencoderKLTests(FlaxModelTesterMixin, unittest.TestCase): - model_class = FlaxAutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - prng_key = jax.random.PRNGKey(0) - image = jax.random.uniform(prng_key, ((batch_size, num_channels) + sizes)) - - return {"sample": image, "prng_key": prng_key} - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict diff --git a/diffusers/tests/models/test_models_vq.py b/diffusers/tests/models/test_models_vq.py deleted file mode 100644 index 66c33e07371e066bad3f0465ab923d67b79b4f52..0000000000000000000000000000000000000000 --- a/diffusers/tests/models/test_models_vq.py +++ /dev/null @@ -1,94 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
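The `test_stable_diffusion_encode_sample` assertion above encodes the Stable Diffusion VAE's shape contract: 4 latent channels at one-eighth the spatial resolution, i.e. `[batch, 4] + [i // 8 for i in image.shape[2:]]`. A minimal sketch of that contract, assuming the same `CompVis/stable-diffusion-v1-4` VAE the helper loads:

    import torch
    from diffusers import AutoencoderKL

    vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").eval()

    image = torch.randn(1, 3, 512, 512)
    with torch.no_grad():
        latents = vae.encode(image).latent_dist.sample()

    assert latents.shape == (1, 4, 64, 64)  # 4 channels, H/8 x W/8
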
- -import unittest - -import torch - -from diffusers import VQModel -from diffusers.utils import floats_tensor, torch_device - -from ..test_modeling_common import ModelTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VQModelTests(ModelTesterMixin, unittest.TestCase): - model_class = VQModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_out_channels": [32, 64], - "in_channels": 3, - "out_channels": 3, - "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], - "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], - "latent_channels": 3, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = VQModel.from_pretrained("fusing/vqgan-dummy") - model.to(torch_device).eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) - image = image.to(torch_device) - with torch.no_grad(): - output = model(image).sample - - output_slice = output[0, -1, -3:, -3:].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) diff --git a/diffusers/tests/pipeline_params.py b/diffusers/tests/pipeline_params.py deleted file mode 100644 index a0ac6c641c0bafef0f770409e9b75ec0aee013c1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipeline_params.py +++ /dev/null @@ -1,121 +0,0 @@ -# These are canonical sets of parameters for different types of pipelines. -# They are set on subclasses of `PipelineTesterMixin` as `params` and -# `batch_params`. -# -# If a pipeline's set of arguments has minor changes from one of the common sets -# of arguments, do not make modifications to the existing common sets of arguments. -# I.e. a text to image pipeline with non-configurable height and width arguments -# should set its attribute as `params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. 
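For instance, a hypothetical tester for a text-to-image pipeline with a fixed output resolution would subtract the unsupported keys rather than edit the common set. The class and pipeline names below are illustrative only; the imports are the relative ones the pipeline test modules actually use:

    import unittest

    from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
    from ...test_pipelines_common import PipelineTesterMixin


    class FixedSizePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        pipeline_class = FixedSizePipeline  # hypothetical pipeline without height/width args
        params = TEXT_TO_IMAGE_PARAMS - {"height", "width"}
        batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
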
- -TEXT_TO_IMAGE_PARAMS = frozenset( - [ - "prompt", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_IMAGE_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) - -IMAGE_VARIATION_PARAMS = frozenset( - [ - "image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_VARIATION_BATCH_PARAMS = frozenset(["image"]) - -TEXT_GUIDED_IMAGE_VARIATION_PARAMS = frozenset( - [ - "prompt", - "image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS = frozenset(["prompt", "image", "negative_prompt"]) - -TEXT_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - # Text guided image variation with an image mask - "prompt", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - ] -) - -TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["prompt", "image", "mask_image", "negative_prompt"]) - -IMAGE_INPAINTING_PARAMS = frozenset( - [ - # image variation with an image mask - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["image", "mask_image"]) - -IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS = frozenset( - [ - "example_image", - "image", - "mask_image", - "height", - "width", - "guidance_scale", - ] -) - -IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS = frozenset(["example_image", "image", "mask_image"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS = frozenset(["class_labels"]) - -CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS = frozenset(["class_labels"]) - -UNCONDITIONAL_IMAGE_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS = frozenset([]) - -UNCONDITIONAL_AUDIO_GENERATION_PARAMS = frozenset(["batch_size"]) - -UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS = frozenset([]) - -TEXT_TO_AUDIO_PARAMS = frozenset( - [ - "prompt", - "audio_length_in_s", - "guidance_scale", - "negative_prompt", - "prompt_embeds", - "negative_prompt_embeds", - "cross_attention_kwargs", - ] -) - -TEXT_TO_AUDIO_BATCH_PARAMS = frozenset(["prompt", "negative_prompt"]) -TOKENS_TO_AUDIO_GENERATION_PARAMS = frozenset(["input_tokens"]) - -TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS = frozenset(["input_tokens"]) diff --git a/diffusers/tests/pipelines/__init__.py b/diffusers/tests/pipelines/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/altdiffusion/__init__.py b/diffusers/tests/pipelines/altdiffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py b/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py deleted file mode 100644 index faa56e18f74835a6f1fa2f63717fc9ba5c0a7e29..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ /dev/null @@ -1,244 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, XLMRobertaTokenizer - -from diffusers import AltDiffusionPipeline, AutoencoderKL, DDIMScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AltDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AltDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - - # TODO: address the non-deterministic text encoder (fails for save-load tests) - # torch.manual_seed(0) - # text_encoder_config = RobertaSeriesConfig( - # hidden_size=32, - # project_dim=32, - # intermediate_size=37, - # layer_norm_eps=1e-05, - # num_attention_heads=4, - # num_hidden_layers=5, - # vocab_size=5002, - # ) - # text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5002, - ) - text_encoder = CLIPTextModel(text_encoder_config) - - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def 
test_alt_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = "A photo of an astronaut" - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.5748162, 0.60447145, 0.48821217, 0.50100636, 0.5431185, 0.45763683, 0.49657696, 0.48132733, 0.47573093] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - text_encoder_config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - vocab_size=5002, - ) - # TODO: remove after fixing the non-deterministic text encoder - text_encoder = RobertaSeriesModelWithTransformation(text_encoder_config) - components["text_encoder"] = text_encoder - alt_pipe = AltDiffusionPipeline(**components) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = alt_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [0.51605093, 0.5707241, 0.47365507, 0.50578886, 0.5633877, 0.4642503, 0.5182081, 0.48763484, 0.49084237] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class AltDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_alt_diffusion(self): - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = alt_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1010, 0.0800, 0.0794, 0.0885, 0.0843, 0.0762, 0.0769, 0.0729, 0.0586]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_alt_diffusion_fast_ddim(self): - scheduler = DDIMScheduler.from_pretrained("BAAI/AltDiffusion", subfolder="scheduler") - - alt_pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion", scheduler=scheduler, safety_checker=None) - alt_pipe = alt_pipe.to(torch_device) - 
alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - - output = alt_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy") - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4019, 0.4052, 0.3810, 0.4119, 0.3916, 0.3982, 0.4651, 0.4195, 0.5323]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py deleted file mode 100644 index 9396329434059db279d7b276af0301905fbc49cc..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ /dev/null @@ -1,299 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import XLMRobertaTokenizer - -from diffusers import ( - AltDiffusionImg2ImgPipeline, - AutoencoderKL, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.pipelines.alt_diffusion.modeling_roberta_series import ( - RobertaSeriesConfig, - RobertaSeriesModelWithTransformation, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = RobertaSeriesConfig( - hidden_size=32, - project_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=5006, - ) - return RobertaSeriesModelWithTransformation(config) - - 
@property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_img2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(device) - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) - alt_pipe = alt_pipe.to(device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = alt_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_fp16(self): - """Test that stable diffusion img2img works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta") - tokenizer.model_max_length = 77 - - init_image = self.dummy_image.to(torch_device) - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - alt_pipe = AltDiffusionImg2ImgPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False) - alt_pipe = alt_pipe.to(torch_device) - alt_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = alt_pipe( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - ).images - - assert image.shape == (1, 32, 32, 3) - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - # resize to resolution that is divisible by 8 but not 16 or 32 - init_image = init_image.resize((760, 504)) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 760, 3) - expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_img2img_pipeline_default(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy" - ) - - model_id = "BAAI/AltDiffusion" - pipe = AltDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 768, 3) - # img2img is flaky across GPUs even in fp32, so using MAE here - assert np.abs(expected_image - image).max() < 1e-3 diff --git a/diffusers/tests/pipelines/audio_diffusion/__init__.py b/diffusers/tests/pipelines/audio_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py deleted file mode 100644 index ba389d9c936df1d096a54b02d332cfa8ac520901..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ /dev/null @@ -1,191 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import ( - AudioDiffusionPipeline, - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - DiffusionPipeline, - Mel, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - sample_size=(32, 64), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return model - - @property - def dummy_unet_condition(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - cross_attention_dim=10, - ) - return model - - @property - def dummy_vqvae_and_unet(self): - torch.manual_seed(0) - vqvae = AutoencoderKL( - sample_size=(128, 64), - in_channels=1, - out_channels=1, - latent_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - ) - unet = UNet2DModel( - sample_size=(64, 32), - in_channels=1, - out_channels=1, - layers_per_block=2, - block_out_channels=(128, 128), - down_block_types=("AttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "AttnUpBlock2D"), - ) - return vqvae, unet - - @slow - def test_audio_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - mel = Mel() - - scheduler = DDPMScheduler() - pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4) - audio = output.audios[0] - image = output.images[0] - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator, steps=4, return_dict=False) - image_from_tuple = output[0][0] - - assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length) - assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0 - - scheduler = DDIMScheduler() - dummy_vqvae_and_unet = self.dummy_vqvae_and_unet - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - np.random.seed(0) - raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,)) - generator = 
torch.Generator(device=device).manual_seed(42) - output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10) - image = output.images[0] - - assert ( - image.height == self.dummy_vqvae_and_unet[0].sample_size[0] - and image.width == self.dummy_vqvae_and_unet[0].sample_size[1] - ) - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - dummy_unet_condition = self.dummy_unet_condition - pipe = AudioDiffusionPipeline( - vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler - ) - - np.random.seed(0) - encoding = torch.rand((1, 1, 10)) - output = pipe(generator=generator, encoding=encoding) - image = output.images[0] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 - - -@slow -@require_torch_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_audio_diffusion(self): - device = torch_device - - pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(42) - output = pipe(generator=generator) - audio = output.audios[0] - image = output.images[0] - - assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length) - assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1] - image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26]) - - assert np.abs(image_slice.flatten() - expected_slice).max() == 0 diff --git a/diffusers/tests/pipelines/audioldm/__init__.py b/diffusers/tests/pipelines/audioldm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/audioldm/test_audioldm.py b/diffusers/tests/pipelines/audioldm/test_audioldm.py deleted file mode 100644 index 10de5440eb007ae4cfc57953ea943eeee3500340..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/audioldm/test_audioldm.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
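The AudioLDM tests that follow cover a text-to-audio pipeline: prompt embedding (with optional L2-normalized prompt_embeds), negative prompts, multiple waveforms per prompt, and output length control. A minimal usage sketch, assuming the cvssp/audioldm checkpoint from the slow tests; the audio_length_in_s value is an assumption chosen to match the 81920-sample outputs asserted below at the vocoder's 16 kHz sampling rate:

import torch
from diffusers import AudioLDMPipeline

pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm").to("cuda")
audio = pipe(
    prompt="A hammer hitting a wooden surface",
    negative_prompt="egg cracking",  # negative prompt borrowed from the fast tests
    num_inference_steps=25,
    guidance_scale=2.5,
    audio_length_in_s=5.12,  # assumption: 5.12 s * 16000 Hz = 81920 samples
).audios[0]  # 1-D numpy waveform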
- - -import gc -import unittest - -import numpy as np -import torch -import torch.nn.functional as F -from transformers import ( - ClapTextConfig, - ClapTextModelWithProjection, - RobertaTokenizer, - SpeechT5HifiGan, - SpeechT5HifiGanConfig, -) - -from diffusers import ( - AudioLDMPipeline, - AutoencoderKL, - DDIMScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device - -from ...pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = AudioLDMPipeline - params = TEXT_TO_AUDIO_PARAMS - batch_params = TEXT_TO_AUDIO_BATCH_PARAMS - required_optional_params = frozenset( - [ - "num_inference_steps", - "num_waveforms_per_prompt", - "generator", - "latents", - "output_type", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=(32, 64), - class_embed_type="simple_projection", - projection_class_embeddings_input_dim=32, - class_embeddings_concat=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=1, - out_channels=1, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = ClapTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - projection_dim=32, - ) - text_encoder = ClapTextModelWithProjection(text_encoder_config) - tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77) - - vocoder_config = SpeechT5HifiGanConfig( - model_in_dim=8, - sampling_rate=16000, - upsample_initial_channel=16, - upsample_rates=[2, 2], - upsample_kernel_sizes=[4, 4], - resblock_kernel_sizes=[3, 7], - resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5]], - normalize_before=False, - ) - - vocoder = SpeechT5HifiGan(vocoder_config) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "vocoder": vocoder, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - } - return inputs - - def test_audioldm_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = 
audioldm_pipe(**inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0050, 0.0050, -0.0060, 0.0033, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0033] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = audioldm_pipe.tokenizer( - prompt, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - prompt_embeds = prompt_embeds.text_embeds - # additional L_2 normalization over each hidden-state - prompt_embeds = F.normalize(prompt_embeds, dim=-1) - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt_embeds(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = audioldm_pipe(**inputs) - audio_1 = output.audios[0] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = audioldm_pipe.tokenizer( - p, - padding="max_length", - max_length=audioldm_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - text_embeds = audioldm_pipe.text_encoder( - text_inputs, - ) - text_embeds = text_embeds.text_embeds - # additional L_2 normalization over each hidden-state - text_embeds = F.normalize(text_embeds, dim=-1) - - embeds.append(text_embeds) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = audioldm_pipe(**inputs) - audio_2 = output.audios[0] - - assert np.abs(audio_1 - audio_2).max() < 1e-2 - - def test_audioldm_negative_prompt(self): - device = "cpu"  # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "egg cracking" - output = audioldm_pipe(**inputs, negative_prompt=negative_prompt) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) == 256 - - audio_slice = audio[:10] - expected_slice = np.array( - [-0.0051, 0.0050,
-0.0060, 0.0034, -0.0026, 0.0033, -0.0027, 0.0033, -0.0028, 0.0032] - ) - - assert np.abs(audio_slice - expected_slice).max() < 1e-2 - - def test_audioldm_num_waveforms_per_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = "A hammer hitting a wooden surface" - - # test num_waveforms_per_prompt=1 (default) - audios = audioldm_pipe(prompt, num_inference_steps=2).audios - - assert audios.shape == (1, 256) - - # test num_waveforms_per_prompt=1 (default) for batch of prompts - batch_size = 2 - audios = audioldm_pipe([prompt] * batch_size, num_inference_steps=2).audios - - assert audios.shape == (batch_size, 256) - - # test num_waveforms_per_prompt for single prompt - num_waveforms_per_prompt = 2 - audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios - - assert audios.shape == (num_waveforms_per_prompt, 256) - - # test num_waveforms_per_prompt for batch of prompts - batch_size = 2 - audios = audioldm_pipe( - [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt - ).audios - - assert audios.shape == (batch_size * num_waveforms_per_prompt, 256) - - def test_audioldm_audio_length_in_s(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - vocoder_sampling_rate = audioldm_pipe.vocoder.config.sampling_rate - - inputs = self.get_dummy_inputs(device) - output = audioldm_pipe(audio_length_in_s=0.016, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.016 - - output = audioldm_pipe(audio_length_in_s=0.032, **inputs) - audio = output.audios[0] - - assert audio.ndim == 1 - assert len(audio) / vocoder_sampling_rate == 0.032 - - def test_audioldm_vocoder_model_in_dim(self): - components = self.get_dummy_components() - audioldm_pipe = AudioLDMPipeline(**components) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - prompt = ["hey"] - - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - assert audio_shape == (1, 256) - - config = audioldm_pipe.vocoder.config - config.model_in_dim *= 2 - audioldm_pipe.vocoder = SpeechT5HifiGan(config).to(torch_device) - output = audioldm_pipe(prompt, num_inference_steps=1) - audio_shape = output.audios.shape - # waveform shape is unchanged, we just have 2x the number of mel channels in the spectrogram - assert audio_shape == (1, 256) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(test_mean_pixel_difference=False) - - -@slow -# @require_torch_gpu -class AudioLDMPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = 
torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 8, 128, 16)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A hammer hitting a wooden surface", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 2.5, - } - return inputs - - def test_audioldm(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[77230:77240] - expected_slice = np.array( - [-0.4884, -0.4607, 0.0023, 0.5007, 0.5896, 0.5151, 0.3813, -0.0208, -0.3687, -0.4315] - ) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 - - def test_audioldm_lms(self): - audioldm_pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm") - audioldm_pipe.scheduler = LMSDiscreteScheduler.from_config(audioldm_pipe.scheduler.config) - audioldm_pipe = audioldm_pipe.to(torch_device) - audioldm_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - audio = audioldm_pipe(**inputs).audios[0] - - assert audio.ndim == 1 - assert len(audio) == 81920 - - audio_slice = audio[27780:27790] - expected_slice = np.array([-0.2131, -0.0873, -0.0124, -0.0189, 0.0569, 0.1373, 0.1883, 0.2886, 0.3297, 0.2212]) - max_diff = np.abs(expected_slice - audio_slice).max() - assert max_diff < 1e-2 diff --git a/diffusers/tests/pipelines/dance_diffusion/__init__.py b/diffusers/tests/pipelines/dance_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py deleted file mode 100644 index bbd4aa694b769a0903c505383d9634de8ebd4063..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ /dev/null @@ -1,160 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
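The dance-diffusion tests below exercise unconditional audio generation with a 1-D UNet. A minimal sketch of the integration-test call, assuming the harmonai/maestro-150k checkpoint is available:

import torch
from diffusers import DanceDiffusionPipeline

pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k").to("cuda")
generator = torch.manual_seed(0)
output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096)
audio = output.audios[0]  # stereo waveform of shape (2, pipe.unet.sample_size)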
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DanceDiffusionPipeline - params = UNCONDITIONAL_AUDIO_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - batch_params = UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS - test_attention_slicing = False - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet1DModel( - block_out_channels=(32, 32, 64), - extra_in_channels=16, - sample_size=512, - sample_rate=16_000, - in_channels=2, - out_channels=2, - flip_sin_to_cos=True, - use_timestep_embedding=False, - time_embedding_type="fourier", - mid_block_type="UNetMidBlock1D", - down_block_types=("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), - up_block_types=("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), - ) - scheduler = IPNDMScheduler() - - components = { - "unet": unet, - "scheduler": scheduler, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 4, - } - return inputs - - def test_dance_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = DanceDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, components["unet"].sample_size) - expected_slice = np.array([-0.7265, 1.0000, -0.8388, 0.1175, 0.9498, -1.0000]) - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_dance_diffusion(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, 
pipe.unet.sample_size) - expected_slice = np.array([-0.0192, -0.0231, -0.0318, -0.0059, 0.0002, -0.0020]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 - - def test_dance_diffusion_fp16(self): - device = torch_device - - pipe = DanceDiffusionPipeline.from_pretrained("harmonai/maestro-150k", torch_dtype=torch.float16) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe(generator=generator, num_inference_steps=100, audio_length_in_s=4.096) - audio = output.audios - - audio_slice = audio[0, -3:, -3:] - - assert audio.shape == (1, 2, pipe.unet.sample_size) - expected_slice = np.array([-0.0367, -0.0488, -0.0771, -0.0525, -0.0444, -0.0341]) - - assert np.abs(audio_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/ddim/__init__.py b/diffusers/tests/pipelines/ddim/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/ddim/test_ddim.py b/diffusers/tests/pipelines/ddim/test_ddim.py deleted file mode 100644 index 4d2c4e490d638861c4d06fb3c2ddff489a2773d3..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/ddim/test_ddim.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
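The DDIM tests below pair a pretrained DDPM UNet with a freshly constructed DDIMScheduler rather than loading a full pipeline. A minimal sketch, assuming the google/ddpm-cifar10-32 weights used by the integration test:

import torch
from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel

unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
pipe = DDIMPipeline(unet=unet, scheduler=DDIMScheduler()).to("cuda")
generator = torch.manual_seed(0)
# eta=0.0 selects fully deterministic DDIM sampling
image = pipe(generator=generator, eta=0.0, output_type="numpy").images[0]  # (32, 32, 3) float array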
- -import unittest - -import numpy as np -import torch - -from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device - -from ...pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DDIMPipeline - params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "latents", - "callback", - "callback_steps", - } - batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDIMScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "batch_size": 1, - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(image.shape, (1, 32, 32, 3)) - expected_slice = np.array( - [1.000e00, 5.717e-01, 4.717e-01, 1.000e00, 0.000e00, 1.000e00, 3.000e-04, 0.000e00, 9.000e-04] - ) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - -@slow -@require_torch_gpu -class DDIMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler() - - ddim = DDIMPipeline(unet=unet, scheduler=scheduler) - ddim.to(torch_device) - ddim.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddim(generator=generator, eta=0.0, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.1723, 0.1617, 0.1600, 0.1626, 0.1497, 0.1513, 0.1505, 0.1442, 0.1453]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_inference_ema_bedroom(self): - model_id = "google/ddpm-ema-bedroom-256" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler.from_pretrained(model_id) - - ddpm = DDIMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.0060, 0.0201, 0.0344, 0.0024, 0.0018, 0.0002, 0.0022, 0.0000, 0.0069]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff 
--git a/diffusers/tests/pipelines/ddpm/__init__.py b/diffusers/tests/pipelines/ddpm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/ddpm/test_ddpm.py b/diffusers/tests/pipelines/ddpm/test_ddpm.py deleted file mode 100644 index 5e3e47cb74fbe07bb9ddf73c40b200bcea945237..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/ddpm/test_ddpm.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch - -from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DDPMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_fast_inference(self): - device = "cpu" - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler() - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = ddpm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array( - [9.956e-01, 5.785e-01, 4.675e-01, 9.930e-01, 0.0, 1.000, 1.199e-03, 2.648e-04, 5.101e-04] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_inference_predict_sample(self): - unet = self.dummy_uncond_unet - scheduler = DDPMScheduler(prediction_type="sample") - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.manual_seed(0) - image_eps = ddpm(generator=generator, num_inference_steps=2, output_type="numpy")[0] - - image_slice = image[0, -3:, -3:, -1] - image_eps_slice = image_eps[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - assert np.abs(image_slice.flatten() - image_eps_slice.flatten()).max() < tolerance - - -@slow -@require_torch_gpu -class 
DDPMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDPMScheduler.from_pretrained(model_id) - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4200, 0.3588, 0.1939, 0.3847, 0.3382, 0.2647, 0.4155, 0.3582, 0.3385]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/dit/__init__.py b/diffusers/tests/pipelines/dit/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/dit/test_dit.py b/diffusers/tests/pipelines/dit/test_dit.py deleted file mode 100644 index c514c3c7fa1d7b7a83307a04c37ca63dece289e5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/dit/test_dit.py +++ /dev/null @@ -1,152 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
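The DiT tests below are class-conditional: human-readable ImageNet class names are first mapped to label ids, and those ids condition the transformer. A minimal sketch, assuming the facebook/DiT-XL-2-256 checkpoint from the integration test:

import torch
from diffusers import DiTPipeline

pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256").to("cuda")
ids = pipe.get_label_ids(["vase", "umbrella"])  # class names -> ImageNet label ids
generator = torch.manual_seed(0)
images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images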
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel -from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import ( - CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, - CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS, -) -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = DiTPipeline - params = CLASS_CONDITIONED_IMAGE_GENERATION_PARAMS - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - transformer = Transformer2DModel( - sample_size=16, - num_layers=2, - patch_size=4, - attention_head_dim=8, - num_attention_heads=2, - in_channels=4, - out_channels=8, - attention_bias=True, - activation_fn="gelu-approximate", - num_embeds_ada_norm=1000, - norm_type="ada_norm_zero", - norm_elementwise_affine=False, - ) - vae = AutoencoderKL() - scheduler = DDIMScheduler() - components = {"transformer": transformer.eval(), "vae": vae.eval(), "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "class_labels": [1], - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - self.assertEqual(image.shape, (1, 16, 16, 3)) - expected_slice = np.array([0.4380, 0.4141, 0.5159, 0.0000, 0.4282, 0.6680, 0.5485, 0.2545, 0.6719]) - max_diff = np.abs(image_slice.flatten() - expected_slice).max() - self.assertLessEqual(max_diff, 1e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(relax_max_difference=True, expected_max_diff=1e-3) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3) - - -@require_torch_gpu -@slow -class DiTPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_dit_256(self): - generator = torch.manual_seed(0) - - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256") - pipe.to("cuda") - - words = ["vase", "umbrella", "white shark", "white wolf"] - ids = pipe.get_label_ids(words) - - images = pipe(ids, generator=generator, num_inference_steps=40, output_type="np").images - - for word, image in zip(words, images): - expected_image = load_numpy( - 
f"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/dit/{word}.npy" - ) - assert np.abs((expected_image - image).max()) < 1e-2 - - def test_dit_512(self): - pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-512") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") - - words = ["vase", "umbrella"] - ids = pipe.get_label_ids(words) - - generator = torch.manual_seed(0) - images = pipe(ids, generator=generator, num_inference_steps=25, output_type="np").images - - for word, image in zip(words, images): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - f"/dit/{word}_512.npy" - ) - - assert np.abs((expected_image - image).max()) < 1e-1 diff --git a/diffusers/tests/pipelines/karras_ve/__init__.py b/diffusers/tests/pipelines/karras_ve/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/karras_ve/test_karras_ve.py b/diffusers/tests/pipelines/karras_ve/test_karras_ve.py deleted file mode 100644 index 391e61a2b9c90c58049270a192884bd358621c52..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/karras_ve/test_karras_ve.py +++ /dev/null @@ -1,86 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch - -from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class KarrasVePipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = KarrasVeScheduler() - - pipe = KarrasVePipeline(unet=unet, scheduler=scheduler) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pipe(num_inference_steps=2, generator=generator, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = pipe(num_inference_steps=2, generator=generator, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class KarrasVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-celebahq-256" - model = UNet2DModel.from_pretrained(model_id) - scheduler = KarrasVeScheduler() - - pipe = KarrasVePipeline(unet=model, scheduler=scheduler) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/latent_diffusion/__init__.py b/diffusers/tests/pipelines/latent_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py deleted file mode 100644 index 3f2dbe5cec2a324d80fe7bcca1efffe9bcd3ab02..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ /dev/null @@ -1,202 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
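The latent-diffusion text-to-image tests below assemble the pipeline from a UNet, a VQ-VAE (registered as "vqvae"), a BERT-style text encoder (registered as "bert"), and a tokenizer. A minimal end-to-end sketch, assuming the CompVis/ldm-text2im-large-256 checkpoint from the slow tests:

import torch
from diffusers import LDMTextToImagePipeline

pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to("cuda")
generator = torch.manual_seed(0)
image = pipe(
    prompt="A painting of a squirrel eating a burger",
    generator=generator,
    num_inference_steps=50,
    guidance_scale=6.0,
    output_type="numpy",
).images[0]  # (256, 256, 3)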
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel -from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = LDMTextToImagePipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "negative_prompt_embeds", - "cross_attention_kwargs", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - { - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=(32, 64), - in_channels=3, - out_channels=3, - down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"), - up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"), - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vqvae": vae, - "bert": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_inference_text2img(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = LDMTextToImagePipeline(**components) - pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 16, 16, 3) - expected_slice = np.array([0.59450, 0.64078, 0.55509, 0.51229, 0.69640, 0.36960, 0.59296, 0.60801, 0.49332]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class LDMTextToImagePipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, dtype=torch.float32, seed=0): - generator = 
torch.manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878]) - max_diff = np.abs(expected_slice - image_slice).max() - assert max_diff < 1e-3 - - -@nightly -@require_torch_gpu -class LDMTextToImagePipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, dtype=torch.float32, seed=0): - generator = torch.manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_ldm_default_ddim(self): - pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py deleted file mode 100644 index f1aa2f08efbaac9d5d8ce55b2a01ebf9fc538bd1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ /dev/null @@ -1,131 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
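The super-resolution tests below feed a small input image through an unconditional latent diffusion model that upscales each spatial dimension 4x (a 64x64 input yields a 256x256 output, per the integration test). A minimal sketch, assuming the duongna/ldm-super-resolution checkpoint and the test image it references:

import torch
from diffusers import LDMSuperResolutionPipeline
from diffusers.utils import load_image

pipe = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution").to("cuda")
low_res = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/vq_diffusion/teddy_bear_pool.png"
).resize((64, 64))
generator = torch.manual_seed(0)
image = pipe(image=low_res, generator=generator, num_inference_steps=20, output_type="numpy").images[0]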
- -import random -import unittest - -import numpy as np -import torch - -from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel -from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMSuperResolutionPipelineFastTests(unittest.TestCase): - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=6, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - def test_inference_superresolution(self): - device = "cpu" - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.to(device) - ldm.set_progress_bar_config(disable=None) - - init_image = self.dummy_image.to(device) - - generator = torch.Generator(device=device).manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8678, 0.8245, 0.6381, 0.6830, 0.4385, 0.5599, 0.4641, 0.6201, 0.5150]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_inference_superresolution_fp16(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vqvae = self.dummy_vq_model - - # put models in fp16 - unet = unet.half() - vqvae = vqvae.half() - - ldm = LDMSuperResolutionPipeline(unet=unet, vqvae=vqvae, scheduler=scheduler) - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - init_image = self.dummy_image.to(torch_device) - - image = ldm(init_image, num_inference_steps=2, output_type="numpy").images - - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_torch -class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase): - def test_inference_superresolution(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/vq_diffusion/teddy_bear_pool.png" - ) - init_image = init_image.resize((64, 64), resample=PIL_INTERPOLATION["lanczos"]) - - ldm = LDMSuperResolutionPipeline.from_pretrained("duongna/ldm-super-resolution", device_map="auto") - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.7644, 0.7679, 0.7642, 0.7633, 0.7666, 0.7560, 0.7425, 0.7257, 0.6907]) - - assert np.abs(image_slice.flatten() - expected_slice).max() 
< 1e-2 diff --git a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py deleted file mode 100644 index aa7b33730d1815d2b1de20b48c6106407cc41770..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ /dev/null @@ -1,116 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel - -from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class LDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - def test_inference_uncond(self): - unet = self.dummy_uncond_unet - scheduler = DDIMScheduler() - vae = self.dummy_vq_model - - ldm = LDMPipeline(unet=unet, vqvae=vae, scheduler=scheduler) - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=2, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = ldm(generator=generator, num_inference_steps=2, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.8512, 0.818, 0.6411, 0.6808, 0.4465, 0.5618, 0.46, 0.6231, 0.5172]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < tolerance - - -@slow -@require_torch -class LDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_uncond(self): - ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") - ldm.to(torch_device) - ldm.set_progress_bar_config(disable=None) 
- - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447]) - tolerance = 1e-2 if torch_device != "mps" else 3e-2 - - assert np.abs(image_slice.flatten() - expected_slice).max() < tolerance diff --git a/diffusers/tests/pipelines/paint_by_example/__init__.py b/diffusers/tests/pipelines/paint_by_example/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py b/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py deleted file mode 100644 index 81d1989200ac1ddbab305d5143ec98bcd654f46b..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/paint_by_example/test_paint_by_example.py +++ /dev/null @@ -1,210 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPImageProcessor, CLIPVisionConfig - -from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder -from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = PaintByExamplePipeline - params = IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = PaintByExampleImageEncoder(config, proj_size=32) - feature_extractor 
= CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "safety_checker": None, - "feature_extractor": feature_extractor, - } - return components - - def convert_to_pt(self, image): - image = np.array(image.convert("RGB")) - image = image[None].transpose(0, 3, 1, 2) - image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 - return image - - def get_dummy_inputs(self, device="cpu", seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - example_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "example_image": example_image, - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_paint_by_example_inpaint(self): - components = self.get_dummy_components() - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**components) - pipe = pipe.to("cpu") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - output = pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4701, 0.5555, 0.3994, 0.5107, 0.5691, 0.4517, 0.5125, 0.4769, 0.4539]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_paint_by_example_image_tensor(self): - device = "cpu" - inputs = self.get_dummy_inputs() - inputs.pop("mask_image") - image = self.convert_to_pt(inputs.pop("image")) - mask_image = image.clamp(0, 1) / 2 - - # make sure here that pndm scheduler skips prk - pipe = PaintByExamplePipeline(**self.get_dummy_components()) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(image=image, mask_image=mask_image[:, 0], **inputs) - out_1 = output.images - - image = image.cpu().permute(0, 2, 3, 1)[0] - mask_image = mask_image.cpu().permute(0, 2, 3, 1)[0] - - image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(mask_image)).convert("RGB") - - output = pipe(**self.get_dummy_inputs()) - out_2 = output.images - - assert out_1.shape == (1, 64, 64, 3) - assert np.abs(out_1.flatten() - out_2.flatten()).max() < 5e-2 - - -@slow -@require_torch_gpu -class PaintByExamplePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_paint_by_example(self): - # make sure here that pndm scheduler skips prk - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/dog_in_bucket.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/mask.png" - ) - example_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/paint_by_example/panda.jpg" - ) - - pipe = PaintByExamplePipeline.from_pretrained("Fantasy-Studio/Paint-by-Example") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(321) - output = pipe( - image=init_image, - mask_image=mask_image, - example_image=example_image, - generator=generator, - guidance_scale=5.0, - num_inference_steps=50, - output_type="np", - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4834, 0.4811, 0.4874, 0.5122, 0.5081, 0.5144, 0.5291, 0.5290, 0.5374]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/pndm/__init__.py b/diffusers/tests/pipelines/pndm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/pndm/test_pndm.py b/diffusers/tests/pipelines/pndm/test_pndm.py deleted file mode 100644 index bed5fea561dc670220c1864c614b68718e96a7ae..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/pndm/test_pndm.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch - -from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class PNDMPipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = PNDMScheduler() - - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.to(torch_device) - pndm.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = pndm(generator=generator, num_inference_steps=20, output_type="numpy").images - - generator = torch.manual_seed(0) - image_from_tuple = pndm(generator=generator, num_inference_steps=20, output_type="numpy", return_dict=False)[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class PNDMPipelineIntegrationTests(unittest.TestCase): - def test_inference_cifar10(self): - model_id = "google/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = PNDMScheduler() - - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - pndm.to(torch_device) - pndm.set_progress_bar_config(disable=None) - generator = torch.manual_seed(0) - image = pndm(generator=generator, output_type="numpy").images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/repaint/__init__.py b/diffusers/tests/pipelines/repaint/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/repaint/test_repaint.py b/diffusers/tests/pipelines/repaint/test_repaint.py deleted file mode 100644 index 060e6c9161baab099bc11b3d843dd4b48f7e2fb6..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/repaint/test_repaint.py +++ /dev/null @@ -1,162 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
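Condensed from the PNDM integration test above, the whole user-facing surface that file exercises fits in a few lines; this sketch assumes the google/ddpm-cifar10-32 checkpoint is reachable:

import torch
from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel

unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
pndm = PNDMPipeline(unet=unet, scheduler=PNDMScheduler())
# the seeded generator is what makes the hard-coded slice assertions reproducible
image = pndm(generator=torch.manual_seed(0), output_type="numpy").images
assert image.shape == (1, 32, 32, 3)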
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device - -from ...pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = RePaintPipeline - params = IMAGE_INPAINTING_PARAMS - {"width", "height", "guidance_scale"} - required_optional_params = PipelineTesterMixin.required_optional_params - { - "latents", - "num_images_per_prompt", - "callback", - "callback_steps", - } - batch_params = IMAGE_INPAINTING_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = RePaintScheduler() - components = {"unet": unet, "scheduler": scheduler} - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - image = np.random.RandomState(seed).standard_normal((1, 3, 32, 32)) - image = torch.from_numpy(image).to(device=device, dtype=torch.float32) - mask = (image > 0).to(device=device, dtype=torch.float32) - inputs = { - "image": image, - "mask_image": mask, - "generator": generator, - "num_inference_steps": 5, - "eta": 0.0, - "jump_length": 2, - "jump_n_sample": 2, - "output_type": "numpy", - } - return inputs - - def test_repaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = RePaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([1.0000, 0.5426, 0.5497, 0.2200, 1.0000, 1.0000, 0.5623, 1.0000, 0.6274]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - # RePaint can hardly be made deterministic since the scheduler is currently always - # nondeterministic - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@nightly -@require_torch_gpu -class RepaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_celebahq(self): - original_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256.png"
- ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "repaint/celeba_hq_256_result.npy" - ) - - model_id = "google/ddpm-ema-celebahq-256" - unet = UNet2DModel.from_pretrained(model_id) - scheduler = RePaintScheduler.from_pretrained(model_id) - - repaint = RePaintPipeline(unet=unet, scheduler=scheduler).to(torch_device) - repaint.set_progress_bar_config(disable=None) - repaint.enable_attention_slicing() - - generator = torch.manual_seed(0) - output = repaint( - original_image, - mask_image, - num_inference_steps=250, - eta=0.0, - jump_length=10, - jump_n_sample=10, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).mean() < 1e-2 diff --git a/diffusers/tests/pipelines/score_sde_ve/__init__.py b/diffusers/tests/pipelines/score_sde_ve/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py deleted file mode 100644 index 036ecc3f6bf3c3a61780933c0a404ca91abe5dc4..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ /dev/null @@ -1,91 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
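The jump_length and jump_n_sample arguments threaded through both RePaint tests control the resampling schedule from the RePaint paper: the sampler repeatedly jumps jump_length steps back up the noise schedule and re-denoises, jump_n_sample times per window, so the effective number of denoising steps is far larger than num_inference_steps. A quick way to see this, using the nightly test's settings (a sketch, not part of the deleted file):

from diffusers import RePaintScheduler

scheduler = RePaintScheduler()
scheduler.set_timesteps(num_inference_steps=250, jump_length=10, jump_n_sample=10)
# the resampling jumps expand the schedule well past 250 entries
print(len(scheduler.timesteps))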
- -import unittest - -import numpy as np -import torch - -from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class ScoreSdeVePipelineFastTests(unittest.TestCase): - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - def test_inference(self): - unet = self.dummy_uncond_unet - scheduler = ScoreSdeVeScheduler() - - sde_ve = ScoreSdeVePipeline(unet=unet, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator).images - - generator = torch.manual_seed(0) - image_from_tuple = sde_ve(num_inference_steps=2, output_type="numpy", generator=generator, return_dict=False)[ - 0 - ] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch -class ScoreSdeVePipelineIntegrationTests(unittest.TestCase): - def test_inference(self): - model_id = "google/ncsnpp-church-256" - model = UNet2DModel.from_pretrained(model_id) - - scheduler = ScoreSdeVeScheduler.from_pretrained(model_id) - - sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler) - sde_ve.to(torch_device) - sde_ve.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - - expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py b/diffusers/tests/pipelines/semantic_stable_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py deleted file mode 100644 index b312c8184390c0eb7df751cbbbf1e1b5146fb428..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ /dev/null @@ -1,601 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
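A recurring detail in the get_dummy_inputs helpers above and below: the generator is built differently on mps, because device-local torch.Generator objects were not supported there when these tests were written, so the global default generator returned by torch.manual_seed is used instead. Extracted as a sketch (the function name make_generator is hypothetical):

import torch

def make_generator(device, seed: int = 0) -> torch.Generator:
    # mps lacks device-local generators; fall back to the default (CPU)
    # generator, seeded globally via torch.manual_seed
    if str(device).startswith("mps"):
        return torch.manual_seed(seed)
    return torch.Generator(device=device).manual_seed(seed)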
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - # Tensor.to is not in-place; reassign so the move actually sticks - self.pixel_values = self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_semantic_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # assemble the pipeline around the DDIM scheduler - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice =
image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_semantic_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_semantic_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a 
squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_torch_gpu -class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_positive_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34673113, - 0.38492733, - 0.37597352, - 0.34086335, - 0.35650748, - 0.35579205, - 0.3384763, - 0.34340236, - 0.3573271, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.41887826, - 0.37728766, - 0.30138272, - 0.41416335, - 0.41664985, - 0.36283392, - 0.36191246, - 0.43364465, - 0.43001732, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_negative_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "an image of a crowded boulevard, realistic, 4k" - edit = { - "editing_prompt": "crowd, crowded, people", - "reverse_editing_direction": True, - "edit_warmup_steps": 10, - "edit_guidance_scale": 8.3, - "edit_threshold": 0.9, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 9 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.43497998, - 0.91814065, - 0.7540739, - 0.55580205, - 0.8467265, - 0.5389691, - 0.62574506, - 0.58897763, - 0.50926757, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 
0.3089719, - 0.30500144, - 0.29016042, - 0.30630964, - 0.325687, - 0.29419225, - 0.2908091, - 0.28723598, - 0.27696294, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_multi_cond_guidance(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a castle next to a river" - edit = { - "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"], - "reverse_editing_direction": False, - "edit_warmup_steps": [15, 18], - "edit_guidance_scale": 6, - "edit_threshold": [0.9, 0.8], - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 48 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.75163555, - 0.76037145, - 0.61785, - 0.9189673, - 0.8627701, - 0.85189694, - 0.8512813, - 0.87012076, - 0.8312857, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.73553365, - 0.7537271, - 0.74341905, - 0.66480356, - 0.6472925, - 0.63039416, - 0.64812905, - 0.6749717, - 0.6517102, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_guidance_fp16(self): - torch_device = "cuda" - pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photo of a cat" - edit = { - "editing_prompt": ["sunglasses"], - "reverse_editing_direction": [False], - "edit_warmup_steps": 10, - "edit_guidance_scale": 6, - "edit_threshold": 0.95, - "edit_momentum_scale": 0.5, - "edit_mom_beta": 0.6, - } - - seed = 3 - guidance_scale = 7 - - # no sega enabled - generator = torch.Generator(torch_device) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.34887695, - 0.3876953, - 0.375, - 0.34423828, - 0.3581543, - 0.35717773, - 0.3383789, - 0.34570312, - 0.359375, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # with sega enabled - # generator = torch.manual_seed(seed) - generator.manual_seed(seed) - output = pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - **edit, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [ - 0.42285156, - 0.36914062, - 0.29077148, - 0.42041016, - 0.41918945, - 0.35498047, - 0.3618164, - 
0.4423828, - 0.43115234, - ] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py b/diffusers/tests/pipelines/spectrogram_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py deleted file mode 100644 index 594d7c598f7507d07973e9e2cd8f62a5f0a1b7fd..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ /dev/null @@ -1,235 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch - -from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline -from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder -from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime - -from ...pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -MIDI_FILE = "./tests/fixtures/elise_format0.mid" - - -class SpectrogramDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = SpectrogramDiffusionPipeline - required_optional_params = PipelineTesterMixin.required_optional_params - { - "callback", - "latents", - "callback_steps", - "output_type", - "num_images_per_prompt", - } - test_attention_slicing = False - test_cpu_offload = False - batch_params = TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS - params = TOKENS_TO_AUDIO_GENERATION_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - notes_encoder = SpectrogramNotesEncoder( - max_length=2048, - vocab_size=1536, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - continuous_encoder = SpectrogramContEncoder( - input_dims=128, - targets_context_length=256, - d_model=768, - dropout_rate=0.1, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - feed_forward_proj="gated-gelu", - ) - - decoder = T5FilmDecoder( - input_dims=128, - targets_length=256, - max_decoder_noise_time=20000.0, - d_model=768, - num_layers=1, - num_heads=1, - d_kv=4, - d_ff=2048, - dropout_rate=0.1, - ) - - scheduler = DDPMScheduler() - - components = { - "notes_encoder": notes_encoder.eval(), - "continuous_encoder": continuous_encoder.eval(), - "decoder": decoder.eval(), - "scheduler": scheduler, - "melgan": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if
str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "input_tokens": [ - [1134, 90, 1135, 1133, 1080, 112, 1132, 1080, 1133, 1079, 133, 1132, 1079, 1133, 1] + [0] * 2033 - ], - "generator": generator, - "num_inference_steps": 4, - "output_type": "mel", - } - return inputs - - def test_spectrogram_diffusion(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = SpectrogramDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - mel = output.audios - - mel_slice = mel[0, -3:, -3:] - - assert mel_slice.shape == (3, 3) - expected_slice = np.array( - [-11.512925, -4.788215, -0.46172905, -2.051715, -10.539147, -10.970963, -9.091634, 4.0, 4.0] - ) - assert np.abs(mel_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - def test_inference_batch_single_identical(self): - pass - - def test_inference_batch_consistent(self): - pass - - @skip_mps - def test_progress_bar(self): - return super().test_progress_bar() - - -@slow -@require_torch_gpu -@require_onnxruntime -@require_note_seq -class PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_callback(self): - # TODO - test that pipeline can decode tokens in a callback - # so that music can be played live - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - melgan = pipe.melgan - pipe.melgan = None - - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - def callback(step, mel_output): - # decode mel to audio - audio = melgan(input_features=mel_output.astype(np.float32))[0] - assert len(audio[0]) == 81920 * (step + 1) - # simulate that audio is played - return audio - - processor = MidiProcessor() - input_tokens = processor(MIDI_FILE) - - input_tokens = input_tokens[:3] - generator = torch.manual_seed(0) - pipe(input_tokens, num_inference_steps=5, generator=generator, callback=callback, output_type="mel") - - def test_spectrogram_fast(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - processor = MidiProcessor() - - input_tokens = processor(MIDI_FILE) - # just run two denoising loops - input_tokens = input_tokens[:2] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=2, generator=generator) - - audio = output.audios[0] - - assert abs(np.abs(audio).sum() - 3612.841) < 1e-1 - - def test_spectrogram(self): - device = torch_device - - pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - processor = MidiProcessor() - - input_tokens = 
processor(MIDI_FILE) - - # just run 4 denoising loops - input_tokens = input_tokens[:4] - - generator = torch.manual_seed(0) - output = pipe(input_tokens, num_inference_steps=100, generator=generator) - - audio = output.audios[0] - assert abs(np.abs(audio).sum() - 9389.1111) < 5e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/__init__.py b/diffusers/tests/pipelines/stable_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py deleted file mode 100644 index 5282cfd8dd2472ca8bf1bb785c6ee69268d4be52..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = CycleDiffusionPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - num_train_timesteps=1000, - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - 
pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "An astronaut riding an elephant", - "source_prompt": "An astronaut riding a horse", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "eta": 0.1, - "strength": 0.8, - "guidance_scale": 3, - "source_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_cycle(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4459, 0.4943, 0.4544, 0.6643, 0.5474, 0.4327, 0.5701, 0.5959, 0.5179]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_cycle_fp16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.half() - pipe = CycleDiffusionPipeline(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs) - images = output.images - - image_slice = images[0, -3:, -3:, -1] - - assert images.shape == (1, 32, 32, 3) - expected_slice = np.array([0.3506, 0.4543, 0.446, 0.4575, 0.5195, 0.4155, 0.5273, 0.518, 0.4116]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class CycleDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_cycle_diffusion_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car_fp16.npy" - ) - init_image = 
init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained( - model_id, scheduler=scheduler, safety_checker=None, torch_dtype=torch.float16, revision="fp16" - ) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - # the values aren't exactly equal, but the images look the same visually - assert np.abs(image - expected_image).max() < 5e-1 - - def test_cycle_diffusion_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/cycle-diffusion/black_colored_car.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/cycle-diffusion/blue_colored_car.npy" - ) - init_image = init_image.resize((512, 512)) - - model_id = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = CycleDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, safety_checker=None) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - source_prompt = "A black colored car" - prompt = "A blue colored car" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - source_prompt=source_prompt, - image=init_image, - num_inference_steps=100, - eta=0.1, - strength=0.85, - guidance_scale=3, - source_guidance_scale=1, - generator=generator, - output_type="np", - ) - image = output.images - - assert np.abs(image - expected_image).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py deleted file mode 100644 index 74783faae421cb0a10a89fda4f19454f4cf834a8..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion.py +++ /dev/null @@ -1,306 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
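For reference, the full-precision CycleDiffusion test above reduces to this usage pattern; the checkpoint, prompts, and sampler settings are copied from the test, so only the assertion scaffolding is stripped away:

import torch
from diffusers import CycleDiffusionPipeline, DDIMScheduler
from diffusers.utils import load_image

model_id = "CompVis/stable-diffusion-v1-4"
init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/cycle-diffusion/black_colored_car.png"
).resize((512, 512))

pipe = CycleDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=DDIMScheduler.from_pretrained(model_id, subfolder="scheduler"),
    safety_checker=None,
)
# source_prompt describes the input image; prompt describes the desired edit
image = pipe(
    prompt="A blue colored car",
    source_prompt="A black colored car",
    image=init_image,
    num_inference_steps=100,
    eta=0.1,
    strength=0.85,
    guidance_scale=3,
    source_guidance_scale=1,
    generator=torch.manual_seed(0),
    output_type="np",
).images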
- -import tempfile -import unittest - -import numpy as np - -from diffusers import ( - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionPipeline, - PNDMScheduler, -) -from diffusers.utils.testing_utils import is_onnx_available, nightly, require_onnxruntime, require_torch_gpu - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - - def get_dummy_inputs(self, seed=0): - generator = np.random.RandomState(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddim(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65072, 0.58492, 0.48219, 0.55521, 0.53180, 0.55939, 0.50697, 0.39800, 0.46455]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65863, 0.59425, 0.49326, 0.56313, 0.53875, 0.56627, 0.51065, 0.39777, 0.46330]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_lms(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53755, 0.60786, 0.47402, 0.49488, 0.51869, 0.49819, 0.47985, 0.38957, 0.44279]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - 
pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53817, 0.60812, 0.47384, 0.49530, 0.51894, 0.49814, 0.47984, 0.38958, 0.44271]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.53895, 0.60808, 0.47933, 0.49608, 0.51886, 0.49950, 0.48053, 0.38957, 0.44200]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - # using the PNDM scheduler by default - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - np.random.seed(0) - output = sd_pipe([prompt], guidance_scale=6.0, num_inference_steps=10, output_type="np") - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0452, 0.0390, 0.0087, 0.0350, 0.0617, 0.0364, 0.0544, 0.0523, 0.0720]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_ddim(self): - ddim_scheduler = DDIMScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=ddim_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "open neural network exchange" - generator = np.random.RandomState(0) - output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2867, 0.1974, 0.1481, 0.7294, 0.7251, 0.6667, 0.4194, 0.5642, 0.6486]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_k_lms(self): - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - sd_pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - 
sess_options=self.gpu_options, - ) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "open neural network exchange" - generator = np.random.RandomState(0) - output = sd_pipe([prompt], guidance_scale=7.5, num_inference_steps=10, generator=generator, output_type="np") - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2306, 0.1959, 0.1593, 0.6549, 0.6394, 0.5408, 0.5065, 0.6010, 0.6161]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_intermediate_state(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.6772, -0.3835, -1.2456, 0.1905, -1.0974, 0.6967, -1.9353, 0.0178, 1.0167] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 5: - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.3351, 0.2241, -0.1837, -0.2325, -0.6577, 0.3393, -0.0241, 0.5899, 1.3875] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - test_callback_fn.has_been_called = False - - pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "Andromeda galaxy in a bottle" - - generator = np.random.RandomState(0) - pipe( - prompt=prompt, - num_inference_steps=5, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 6 - - def test_stable_diffusion_no_safety_checker(self): - pipe = OnnxStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - assert isinstance(pipe, OnnxStableDiffusionPipeline) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = OnnxStableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py deleted file mode 100644 index e1aa2f6dc0a1641f217f0b20ef93d2f82cf15140..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_img2img.py +++ /dev/null @@ -1,245 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np - -from diffusers import ( - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionImg2ImgPipeline, - PNDMScheduler, -) -from diffusers.utils import floats_tensor -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionImg2ImgPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - hub_checkpoint = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) - generator = np.random.RandomState(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddim(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.69643, 0.58484, 0.50314, 0.58760, 0.55368, 0.59643, 0.51529, 0.41217, 0.49087]) - assert np.abs(image_slice - expected_slice).max() < 1e-1 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.61737, 0.54642, 0.53183, 0.54465, 0.52742, 0.60525, 0.49969, 0.40655, 0.48154]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_lms(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - # warmup pass to apply optimizations - _ = pipe(**self.get_dummy_inputs()) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52761, 0.59977, 0.49033, 0.49619, 0.54282, 0.50311, 0.47600, 0.40918, 0.45203]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = 
EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.52911, 0.60004, 0.49229, 0.49805, 0.54502, 0.50680, 0.47777, 0.41028, 0.45304]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 128, 128, 3) - expected_slice = np.array([0.65331, 0.58277, 0.48204, 0.56059, 0.53665, 0.56235, 0.50969, 0.40009, 0.46552]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 512)) - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 768, 3) - expected_slice = np.array([0.4909, 0.5059, 0.5372, 0.4623, 0.4876, 0.5049, 0.4820, 0.4956, 0.5019]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((768, 
512)) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", subfolder="scheduler", revision="onnx" - ) - pipe = OnnxStableDiffusionImg2ImgPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 768, 3) - expected_slice = np.array([0.8043, 0.926, 0.9581, 0.8119, 0.8954, 0.913, 0.7209, 0.7463, 0.7431]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py deleted file mode 100644 index 16287d64d154872f50b49b822daec79641f11f11..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint.py +++ /dev/null @@ -1,141 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
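Editor's aside — every ONNX integration test in this diff repeats the same provider/session boilerplate exercised above. A minimal, self-contained sketch of that shared pattern (not part of the original diff; the checkpoint, prompt, and step count are illustrative values taken from the tests themselves, and a CUDA-capable onnxruntime-gpu install is assumed):

# Shared ONNX Runtime session setup used by the integration tests in this diff.
import numpy as np
import onnxruntime as ort

from diffusers import OnnxStableDiffusionPipeline

# (name, options) tuple pinning the CUDA execution provider with a ~15 GB arena,
# mirroring the tests' gpu_provider property
provider = (
    "CUDAExecutionProvider",
    {"gpu_mem_limit": "15000000000", "arena_extend_strategy": "kSameAsRequested"},
)

# mirrors the tests' gpu_options property
options = ort.SessionOptions()
options.enable_mem_pattern = False

pipe = OnnxStableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="onnx",
    safety_checker=None,
    feature_extractor=None,
    provider=provider,
    sess_options=options,
)

# the ONNX pipelines take a NumPy RandomState where the torch pipelines take a torch.Generator
generator = np.random.RandomState(0)
image = pipe(
    ["A fantasy landscape, trending on artstation"],
    num_inference_steps=10,
    guidance_scale=7.5,
    generator=generator,
    output_type="np",
).images[0]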
- -import unittest - -import numpy as np - -from diffusers import LMSDiscreteScheduler, OnnxStableDiffusionInpaintPipeline -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionPipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - # FIXME: add fast tests - pass - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_pndm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 255:258, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.2514, 0.3007, 0.3517, 0.1790, 0.2382, 0.3167, 0.1944, 0.2273, 0.2464]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "runwayml/stable-diffusion-inpainting", subfolder="scheduler", revision="onnx" - ) - pipe = OnnxStableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - revision="onnx", - scheduler=lms_scheduler, - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 255:258, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0086, 0.0077, 0.0083, 0.0093, 0.0107, 0.0139, 0.0094, 0.0097, 0.0125]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 diff 
--git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 235aa32f7338579210520c675b3776b830cbe3da..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from diffusers import OnnxStableDiffusionInpaintPipelineLegacy -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - load_numpy, - nightly, - require_onnxruntime, - require_torch_gpu, -) - - -if is_onnx_available(): - import onnxruntime as ort - - -@nightly -@require_onnxruntime -@require_torch_gpu -class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo_mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/red_cat_sitting_on_a_park_bench_onnx.npy" - ) - - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", - revision="onnx", - safety_checker=None, - feature_extractor=None, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A red cat sitting on a park bench" - - generator = np.random.RandomState(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - strength=0.75, - guidance_scale=7.5, - num_inference_steps=15, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py deleted file mode 100644 index d1527a42a1e56b3de663c596a8457fab5006bfb2..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_onnx_stable_diffusion_upscale.py +++ /dev/null @@ -1,232 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import random -import unittest - -import numpy as np -import torch - -from diffusers import ( - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - OnnxStableDiffusionUpscalePipeline, - PNDMScheduler, -) -from diffusers.utils import floats_tensor -from diffusers.utils.testing_utils import ( - is_onnx_available, - load_image, - nightly, - require_onnxruntime, - require_torch_gpu, -) - -from ...test_pipelines_onnx_common import OnnxPipelineTesterMixin - - -if is_onnx_available(): - import onnxruntime as ort - - -class OnnxStableDiffusionUpscalePipelineFastTests(OnnxPipelineTesterMixin, unittest.TestCase): - # TODO: is there an appropriate internal test set? - hub_checkpoint = "ssube/stable-diffusion-x4-upscaler-onnx" - - def get_dummy_inputs(self, seed=0): - image = floats_tensor((1, 3, 128, 128), rng=random.Random(seed)) - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_pipeline_default_ddpm(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - # started as 128, should now be 512 - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] - ) - assert np.abs(image_slice - expected_slice).max() < 1e-1 - - def test_pipeline_pndm(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config, skip_prk_steps=True) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6898892, 0.59240556, 0.52499527, 0.58866215, 0.52258235, 0.52572715, 0.62414473, 0.6174387, 0.6214964] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_dpm_multistep(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.7659278, 0.76437664, 0.75579107, 0.7691116, 0.77666986, 0.7727672, 0.7758664, 0.7812226, 0.76942515] - ) - - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.6974782, 0.68902093, 0.70135885, 0.7583618, 0.7804545, 0.7854912, 0.78667426, 0.78743863, 0.78070223] - ) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - def test_pipeline_euler_ancestral(self): - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained(self.hub_checkpoint, provider="CPUExecutionProvider") - pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.77424496, 0.773601, 0.7645288, 0.7769598, 0.7772739, 0.7738688, 0.78187233, 0.77879584, 0.767043] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - -@nightly -@require_onnxruntime -@require_torch_gpu -class OnnxStableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): - @property - def gpu_provider(self): - return ( - "CUDAExecutionProvider", - { - "gpu_mem_limit": "15000000000", # 15GB - "arena_extend_strategy": "kSameAsRequested", - }, - ) - - @property - def gpu_options(self): - options = ort.SessionOptions() - options.enable_mem_pattern = False - return options - - def test_inference_default_ddpm(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((128, 128)) - # using the PNDM scheduler by default - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - guidance_scale=7.5, - num_inference_steps=10, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4883, 0.4947, 0.4980, 0.4975, 0.4982, 0.4980, 0.5000, 0.5006, 0.4972]) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 - - def test_inference_k_lms(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - init_image = init_image.resize((128, 128)) - lms_scheduler = LMSDiscreteScheduler.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", subfolder="scheduler" - ) - pipe = OnnxStableDiffusionUpscalePipeline.from_pretrained( - "ssube/stable-diffusion-x4-upscaler-onnx", - scheduler=lms_scheduler, - provider=self.gpu_provider, - sess_options=self.gpu_options, - ) - pipe.set_progress_bar_config(disable=None) - - prompt = "A fantasy landscape, 
trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - guidance_scale=7.5, - num_inference_steps=20, - generator=generator, - output_type="np", - ) - images = output.images - image_slice = images[0, 255:258, 383:386, -1] - - assert images.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.50173753, 0.50223356, 0.502039, 0.50233036, 0.5023725, 0.5022601, 0.5018758, 0.50234085, 0.50241566] - ) - # TODO: lower the tolerance after finding the cause of onnxruntime reproducibility issues - - assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py deleted file mode 100644 index 857122782d354cd5fcd5b69daf2f601be799c5d1..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ /dev/null @@ -1,1025 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import gc -import tempfile -import time -import unittest - -import numpy as np -import torch -from huggingface_hub import hf_hub_download -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu - -from ...models.test_models_unet_2d_condition import create_lora_layers -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - 
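# (editor's note, not part of the original sources: this CLIPTextConfig is deliberately tiny — 32-dim hidden states, 5 layers, a 1000-token vocabulary — so the fast tests can run on CPU in seconds)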
hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_lora(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - # forward 1 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - # set lora layers - lora_attn_procs = create_lora_layers(sd_pipe.unet) - sd_pipe.unet.set_attn_processor(lora_attn_procs) - sd_pipe = sd_pipe.to(torch_device) - - # forward 2 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0}) - image = output.images - image_slice_1 = image[0, -3:, -3:, -1] - - # forward 3 - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5}) - image = output.images - image_slice_2 = image[0, -3:, -3:, -1] - - assert np.abs(image_slice - image_slice_1).max() < 1e-2 - assert np.abs(image_slice - image_slice_2).max() > 1e-2 - - def test_stable_diffusion_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - text_inputs = sd_pipe.tokenizer( - prompt, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - prompt_embeds = sd_pipe.text_encoder(text_inputs)[0] - - inputs["prompt_embeds"] = prompt_embeds - - # forward - output = 
sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_negative_prompt_embeds(self): - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - negative_prompt = 3 * ["this is a negative prompt"] - inputs["negative_prompt"] = negative_prompt - inputs["prompt"] = 3 * [inputs["prompt"]] - - # forward - output = sd_pipe(**inputs) - image_slice_1 = output.images[0, -3:, -3:, -1] - - inputs = self.get_dummy_inputs(torch_device) - prompt = 3 * [inputs.pop("prompt")] - - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = sd_pipe.tokenizer( - p, - padding="max_length", - max_length=sd_pipe.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - text_inputs = text_inputs["input_ids"].to(torch_device) - - embeds.append(sd_pipe.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - - # forward - output = sd_pipe(**inputs) - image_slice_2 = output.images[0, -3:, -3:, -1] - - assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4 - - def test_stable_diffusion_ddim_factor_8(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs, height=136, width=136) - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 136, 136, 3) - expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", 
num_inference_steps=2).images[0] - assert image is not None - - def test_stable_diffusion_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082293033599854, - 0.5371589064598083, - 0.4562119245529175, - 0.5220914483070374, - 0.5733777284622192, - 0.4795039892196655, - 0.5465868711471558, - 0.5074326395988464, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.4707113206386566, - 0.5372191071510315, - 0.4563021957874298, - 0.5220003724098206, - 0.5734264850616455, - 0.4794946610927582, - 0.5463782548904419, - 0.5074145197868347, - 0.504422664642334, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.47082313895225525, - 0.5371587872505188, - 0.4562119245529175, - 0.5220913887023926, - 0.5733776688575745, - 0.47950395941734314, - 0.546586811542511, - 0.5074326992034912, - 0.5042197108268738, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_vae_slicing(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - image_count = 4 - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_1 = sd_pipe(**inputs) - - # make sure sliced vae decode yields the same result - sd_pipe.enable_vae_slicing() - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * image_count - output_2 = sd_pipe(**inputs) - - # there is a small discrepancy at image borders vs. 
full batch decode - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3 - - def test_stable_diffusion_vae_tiling(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - - # the safety checker is not needed for this test - components["safety_checker"] = None - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # Test that tiled decode yields the same result as the non-tiled decode - generator = torch.Generator(device=device).manual_seed(0) - output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - # make sure tiled vae decode yields the same result - sd_pipe.enable_vae_tiling() - generator = torch.Generator(device=device).manual_seed(0) - output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1 - - # test that tiled decode works with various shapes - shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)] - for shape in shapes: - zeros = torch.zeros(shape).to(device) - sd_pipe.vae.decode(zeros) - - def test_stable_diffusion_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array( - [ - 0.5108221173286438, - 0.5688379406929016, - 0.4685141146183014, - 0.5098261833190918, - 0.5657756328582764, - 0.4631010890007019, - 0.5226285457611084, - 0.49129390716552734, - 0.4899061322212219, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert 
text_embeddings.shape[1] == 77 - - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - def test_stable_diffusion_height_width_opt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "hey" - - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (64, 64) - - output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (96, 96) - - config = dict(sd_pipe.unet.config) - config["sample_size"] = 96 - sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device) - output = sd_pipe(prompt, num_inference_steps=1, output_type="np") - image_shape = output.images[0].shape[:2] - assert image_shape == (192, 192) - - -@slow -@require_torch_gpu -class StableDiffusionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_1_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) - assert 
np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # enable attention slicing - pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.75 GB is allocated - assert mem_bytes < 3.75 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - - # make sure that more than 3.75 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.75 * 10**9 - assert np.abs(image_sliced - image).max() < 1e-3 - - def test_stable_diffusion_vae_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - # enable vae slicing - pipe.enable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4e9 - - # disable vae slicing - pipe.disable_vae_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - inputs["prompt"] = [inputs["prompt"]] * 4 - inputs["latents"] = torch.cat([inputs["latents"]] * 4) - image = pipe(**inputs).images - - # make sure that more than 4 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 4e9 - # There is a small discrepancy at the image borders vs. a fully batched version. 
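# (editor's note, not part of the original sources: enable_vae_slicing() decodes the latent batch one image at a time instead of all four at once, trading peak memory for tiny numerical drift at image borders — hence the relaxed 1e-2 tolerance below)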
- assert np.abs(image_sliced - image).max() < 1e-2 - - def test_stable_diffusion_vae_tiling(self): - torch.cuda.reset_peak_memory_stats() - model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.unet = pipe.unet.to(memory_format=torch.channels_last) - pipe.vae = pipe.vae.to(memory_format=torch.channels_last) - - prompt = "a photograph of an astronaut riding a horse" - - # enable vae tiling - pipe.enable_vae_tiling() - pipe.enable_model_cpu_offload() - generator = torch.Generator(device="cpu").manual_seed(0) - output_chunked = pipe( - [prompt], - width=1024, - height=1024, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ) - image_chunked = output_chunked.images - - mem_bytes = torch.cuda.max_memory_allocated() - - # disable vae tiling - pipe.disable_vae_tiling() - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe( - [prompt], - width=1024, - height=1024, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ) - image = output.images - - assert mem_bytes < 1e10 - assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2 - - def test_stable_diffusion_fp16_vs_autocast(self): - # this test makes sure that the original model with autocast - # and the new model with fp16 yield the same result - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image_fp16 = pipe(**inputs).images - - with torch.autocast(torch_device): - inputs = self.get_inputs(torch_device) - image_autocast = pipe(**inputs).images - - # Make sure results are close enough - diff = np.abs(image_fp16.flatten() - image_autocast.flatten()) - # They ARE different since ops are not always run at the same precision - # however, they should be extremely close. 
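# (editor's note, not part of the original sources: under torch.autocast some ops, e.g. softmax and normalization, still run in fp32, while a model loaded with torch_dtype=torch.float16 executes everything in half precision, so the test bounds the mean rather than the max elementwise difference)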
- assert diff.mean() < 2e-2 - - def test_stable_diffusion_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_low_cpu_mem_usage(self): - pipeline_id = "CompVis/stable-diffusion-v1-4" - - start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but don't move to cuda - pipe = StableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - 
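# (editor's note, not part of the original sources: the CUDA peak-memory counters are cleared here so that max_memory_allocated() after the offloaded run measures only the model-offloading configuration, not the earlier baseline run)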
torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 - assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3.5 * 10**9 - for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker: - assert module.device == torch.device("cpu") - - # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_attention_slicing() - _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() - - assert mem_bytes_slicing < mem_bytes_offloaded - assert mem_bytes_slicing < 3 * 10**9 - - def test_stable_diffusion_textual_inversion(self): - pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - pipe.load_textual_inversion("sd-concepts-library/low-poly-hd-logos-icons") - - a111_file = hf_hub_download("hf-internal-testing/text_inv_embedding_a1111_format", "winter_style.pt") - a111_file_neg = hf_hub_download( - "hf-internal-testing/text_inv_embedding_a1111_format", "winter_style_negative.pt" - ) - pipe.load_textual_inversion(a111_file) - pipe.load_textual_inversion(a111_file_neg) - pipe.to("cuda") - - generator = torch.Generator(device="cpu").manual_seed(1) - - prompt = "An logo of a turtle in strong Style-Winter with " - neg_prompt = "Style-Winter-neg" - - image = pipe(prompt=prompt, negative_prompt=neg_prompt, generator=generator, output_type="np").images[0] - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_inv/winter_logo_style.npy" - ) - - max_diff = np.abs(expected_image - image).max() - assert max_diff < 5e-2 - - -@nightly -@require_torch_gpu -class StableDiffusionPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_1_4_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_1_5_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - 
"/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py deleted file mode 100644 index d556e6318f430d2761bb2ef02556b5bf0d1fcb88..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py +++ /dev/null @@ -1,594 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - ControlNetModel, - DDIMScheduler, - StableDiffusionControlNetPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel -from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionControlNetPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - torch.manual_seed(0) - controlnet = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - image = randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": image, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - torch_device != "cuda" or 
not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - -class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionControlNetPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - torch.manual_seed(0) - controlnet1 = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - controlnet2 = ControlNetModel( - block_out_channels=(32, 64), - layers_per_block=2, - in_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - cross_attention_dim=32, - conditioning_embedding_out_channels=(16, 32), - ) - torch.manual_seed(0) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - controlnet = MultiControlNetModel([controlnet1, controlnet2]) - - components = { - "unet": unet, - "controlnet": controlnet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - controlnet_embedder_scale_factor = 2 - - images = [ - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - randn_tensor( - (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor), - generator=generator, - device=torch.device(device), - ), - ] - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - "image": images, - } - - return inputs - - def test_attention_slicing_forward_pass(self): - return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) - - @unittest.skipIf( - 
torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical(expected_max_diff=2e-3) - - def test_save_pretrained_raise_not_implemented_exception(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - with tempfile.TemporaryDirectory() as tmpdir: - try: - # save_pretrained is not implemented for Multi-ControlNet - pipe.save_pretrained(tmpdir) - except NotImplementedError: - pass - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_float16(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_local(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_optional_components(self): - ... - - -@slow -@require_torch_gpu -class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_canny(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_depth(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Stormtrooper's lecture" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_hed(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - 
pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "oil painting of handsome old man, masterpiece" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_mlsd(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "room" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (704, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_normal(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "cute toy" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_openpose(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "Chef in the kitchen" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_scribble(self): - controlnet = 
ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(5) - prompt = "bag" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (640, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_seg(self): - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(5) - prompt = "house" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (512, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-3 - - def test_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - prompt = "house" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png" - ) - - _ = pipe( - prompt, - image, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 4 GB is allocated - assert mem_bytes < 4 * 10**9 - - -@slow -@require_torch_gpu -class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_pose_and_canny(self): - controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - controlnet_pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=[controlnet_pose, controlnet_canny] - ) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird and Chef" - image_canny = load_image( -
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - image_pose = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - - output = pipe(prompt, [image_pose, image_canny], generator=generator, output_type="np", num_inference_steps=3) - - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose_canny_out.npy" - ) - - assert np.abs(expected_image - image).max() < 5e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py deleted file mode 100644 index 268c013201775c8c78960960669ace207670fd51..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_flax_controlnet.py +++ /dev/null @@ -1,127 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -from diffusers import FlaxControlNetModel, FlaxStableDiffusionControlNetPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusionControlNetPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_canny(self): - controlnet, controlnet_params = FlaxControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.bfloat16 - ) - pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16 - ) - params["controlnet"] = controlnet_params - - prompts = "bird" - num_samples = jax.device_count() - prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples) - - canny_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - processed_image = pipe.prepare_image_inputs([canny_image] * num_samples) - - rng = jax.random.PRNGKey(0) - rng = jax.random.split(rng, jax.device_count()) - - p_params = replicate(params) - prompt_ids = shard(prompt_ids) - processed_image = shard(processed_image) - - images = pipe( - prompt_ids=prompt_ids, - image=processed_image, - params=p_params, - prng_seed=rng, - num_inference_steps=50, - jit=True, - ).images - assert images.shape == (jax.device_count(), 1, 768, 512, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = 
jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [0.167969, 0.116699, 0.081543, 0.154297, 0.132812, 0.108887, 0.169922, 0.169922, 0.205078] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 - - def test_pose(self): - controlnet, controlnet_params = FlaxControlNetModel.from_pretrained( - "lllyasviel/sd-controlnet-openpose", from_pt=True, dtype=jnp.bfloat16 - ) - pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", controlnet=controlnet, from_pt=True, dtype=jnp.bfloat16 - ) - params["controlnet"] = controlnet_params - - prompts = "Chef in the kitchen" - num_samples = jax.device_count() - prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples) - - pose_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png" - ) - processed_image = pipe.prepare_image_inputs([pose_image] * num_samples) - - rng = jax.random.PRNGKey(0) - rng = jax.random.split(rng, jax.device_count()) - - p_params = replicate(params) - prompt_ids = shard(prompt_ids) - processed_image = shard(processed_image) - - images = pipe( - prompt_ids=prompt_ids, - image=processed_image, - params=p_params, - prng_seed=rng, - num_inference_steps=50, - jit=True, - ).images - assert images.shape == (jax.device_count(), 1, 768, 512, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [[0.271484, 0.261719, 0.275391, 0.277344, 0.279297, 0.291016, 0.294922, 0.302734, 0.302734]] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py deleted file mode 100644 index 01c2e22e48161b125e783273b55e395050646609..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ /dev/null @@ -1,306 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
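- # This file tests StableDiffusionImageVariationPipeline: the fast tests run a - # tiny randomly-initialised pipeline on CPU, while the slow/nightly tests load - # full checkpoints and compare outputs against reference arrays from the Hub. - # A minimal usage sketch (with the lambdalabs/sd-image-variations-diffusers - # checkpoint exercised by the slow tests): - # - # pipe = StableDiffusionImageVariationPipeline.from_pretrained( - # "lambdalabs/sd-image-variations-diffusers" - # ) - # images = pipe(image=init_image, guidance_scale=7.5).images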
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection - -from diffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - PNDMScheduler, - StableDiffusionImageVariationPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - image_encoder_config = CLIPVisionConfig( - hidden_size=32, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - image_size=32, - patch_size=4, - ) - image_encoder = CLIPVisionModelWithProjection(image_encoder_config) - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "image_encoder": image_encoder, - "feature_extractor": feature_extractor, - "safety_checker": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5167, 0.5746, 0.4835, 0.4914, 0.5605, 0.4691, 0.5201, 0.4898, 0.4958]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img_variation_multiple_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - 
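- # the pipeline is assembled directly from the tiny dummy modules above, which - # keeps this batched fast test quick enough to run on CPU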
sd_pipe = StableDiffusionImageVariationPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["image"] = 2 * [inputs["image"]] - output = sd_pipe(**inputs) - - image = output.images - - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 64, 64, 3) - expected_slice = np.array([0.6568, 0.5470, 0.5684, 0.5444, 0.5945, 0.6221, 0.5508, 0.5531, 0.5263]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - -@slow -@require_torch_gpu -class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_img_variation_pipeline_default(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "lambdalabs/sd-image-variations-diffusers", safety_checker=None - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_img_variation_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - "fusing/sd-image-variations-diffusers", - safety_checker=None, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - 
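- # the cache was just emptied; the next two calls reset the peak-memory - # statistics so the assertion at the end measures only this offloaded run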
torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - model_id = "fusing/sd-image-variations-diffusers" - pipe = StableDiffusionImageVariationPipeline.from_pretrained( - model_id, safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.6 GB is allocated - assert mem_bytes < 2.6 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/input_image_vermeer.png" - ) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "image": init_image, - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_img_variation_pndm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img_variation_dpm(self): - sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py deleted file mode 100644 index e27f83fc04feb38edb85755dd9eaa48d528b95f8..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ /dev/null @@ -1,544 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - UNet2DConditionModel, -) -from diffusers.image_processor import VaeImageProcessor -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - if input_image_type == "pt": - input_image = image - elif input_image_type == "np": - input_image = image.cpu().numpy().transpose(0, 2, 3, 1) - elif input_image_type == "pil": - input_image = image.cpu().numpy().transpose(0, 2, 3, 1) - input_image = VaeImageProcessor.numpy_to_pil(input_image) - else: - raise ValueError(f"unsupported input_image_type {input_image_type}.") - - if output_type not in ["pt", "np", "pil"]: - 
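- # fail fast on an unsupported output_type so that a typo in a parametrised - # test surfaces here rather than as an obscure downstream error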
raise ValueError(f"unsupported output_type {output_type}") - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": input_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": output_type, - } - return inputs - - def test_stable_diffusion_img2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_multiple_init_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = inputs["image"].repeat(2, 1, 1, 1) - image = sd_pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 32, 32, 3) - expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203]) - - assert 
np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - @skip_mps - def test_pt_np_pil_outputs_equivalent(self): - device = "cpu" - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0] - - assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4 - assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4 - - @skip_mps - def test_image_types_consistent(self): - device = "cpu" - components = self.get_dummy_components() - sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0] - output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0] - output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0] - - assert np.abs(output_pt - output_np).max() <= 1e-4 - assert np.abs(output_pil - output_np).max() <= 1e-2 - - -@slow -@require_torch_gpu -class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/sketch-mountains-input.png" - ) - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_stable_diffusion_img2img_default(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.4300, 0.4662, 0.4930, 0.3990, 0.4307, 0.4525, 0.3719, 0.4064, 0.3923]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_k_lms(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, 
-3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0389, 0.0346, 0.0415, 0.0290, 0.0218, 0.0210, 0.0408, 0.0567, 0.0271]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_ddim(self): - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_img2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.4958, 0.5107, 1.1045, 2.7539, 4.6680, 3.8320, 1.5049, 1.8633, 2.6523]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.4956, 0.5078, 1.0918, 2.7520, 4.6484, 3.8125, 1.5146, 1.8633, 2.6367]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is allocated - assert mem_bytes < 2.2 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - safety_checker=None, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but 
don't move to cuda - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - safety_checker=None, - torch_dtype=torch.float16, - ) - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - _ = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert mem_bytes_offloaded < mem_bytes - for module in pipe.text_encoder, pipe.unet, pipe.vae: - assert module.device == torch.device("cpu") - - def test_stable_diffusion_img2img_pipeline_multiple_of_8(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/img2img/sketch-mountains-input.jpg" - ) - # resize to resolution that is divisible by 8 but not 16 or 32 - init_image = init_image.resize((760, 504)) - - model_id = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionImg2ImgPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "A fantasy landscape, trending on artstation" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - strength=0.75, - guidance_scale=7.5, - generator=generator, - output_type="np", - ) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 760, 3) - expected_slice = np.array([0.9393, 0.9500, 0.9399, 0.9438, 0.9458, 0.9400, 0.9455, 0.9414, 0.9423]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 - - -@nightly -@require_torch_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/sketch-mountains-input.png" - ) - inputs = { - "prompt": "a fantasy landscape, concept art, high resolution", - "image": init_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - def test_img2img_pndm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_ddim(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff 
< 1e-3 - - def test_img2img_lms(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_dpm(self): - sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_img2img/stable_diffusion_1_5_dpm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py deleted file mode 100644 index 3553679e0ef6db5a088f36f34558b10bf0d638d4..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,538 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
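- # This file tests StableDiffusionInpaintPipeline. The dummy UNet below is - # built with in_channels=9: 4 latent channels for the noisy image, 4 for the - # masked-image latents and 1 for the downsampled mask, mirroring the channel - # layout of the full inpainting checkpoints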
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image -from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images 
- image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4723, 0.5731, 0.3939, 0.5441, 0.5922, 0.4392, 0.5059, 0.4651, 0.4474]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_image_tensor(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - output = sd_pipe(**inputs) - out_pil = output.images - - inputs = self.get_dummy_inputs(device) - inputs["image"] = torch.tensor(np.array(inputs["image"]) / 127.5 - 1).permute(2, 0, 1).unsqueeze(0) - inputs["mask_image"] = torch.tensor(np.array(inputs["mask_image"]) / 255).permute(2, 0, 1)[:1].unsqueeze(0) - output = sd_pipe(**inputs) - out_tensor = output.images - - assert out_pil.shape == (1, 64, 64, 3) - assert np.abs(out_pil.flatten() - out_tensor.flatten()).max() < 5e-2 - - -@slow -@require_torch_gpu -class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): - def setUp(self): - super().setUp() - - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_ddim(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0427, 0.0460, 0.0483, 0.0460, 0.0584, 0.0521, 0.1549, 0.1695, 0.1794]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_fp16(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16, safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1350, 0.1123, 0.1350, 0.1641, 0.1328, 0.1230, 0.1289, 0.1531, 0.1687]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_inpaint_pndm(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = 
PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_k_lms(self): - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.9314, 0.7575, 0.9432, 0.8885, 0.9028, 0.7298, 0.9811, 0.9667, 0.7633]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is allocated - assert mem_bytes < 2.2 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionInpaintPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "Face of a yellow cat, high resolution, sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/stable_diffusion_inpaint_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting") - sd_pipe.scheduler = PNDMScheduler.from_config(sd_pipe.scheduler.config) - 
sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_pndm.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-    def test_inpaint_lms(self):
-        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
-        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_lms.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-    def test_inpaint_dpm(self):
-        sd_pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
-        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
-        sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 30
-        image = sd_pipe(**inputs).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
-            "/stable_diffusion_inpaint/stable_diffusion_inpaint_dpm_multi.npy"
-        )
-        max_diff = np.abs(expected_image - image).max()
-        assert max_diff < 1e-3
-
-class StableDiffusionInpaintingPrepareMaskAndMaskedImageTests(unittest.TestCase):
-    def test_pil_inputs(self):
-        im = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
-        im = Image.fromarray(im)
-        mask = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
-        mask = Image.fromarray((mask * 255).astype(np.uint8))
-
-        t_mask, t_masked = prepare_mask_and_masked_image(im, mask)
-
-        self.assertTrue(isinstance(t_mask, torch.Tensor))
-        self.assertTrue(isinstance(t_masked, torch.Tensor))
-
-        self.assertEqual(t_mask.ndim, 4)
-        self.assertEqual(t_masked.ndim, 4)
-
-        self.assertEqual(t_mask.shape, (1, 1, 32, 32))
-        self.assertEqual(t_masked.shape, (1, 3, 32, 32))
-
-        self.assertTrue(t_mask.dtype == torch.float32)
-        self.assertTrue(t_masked.dtype == torch.float32)
-
-        self.assertTrue(t_mask.min() >= 0.0)
-        self.assertTrue(t_mask.max() <= 1.0)
-        self.assertTrue(t_masked.min() >= -1.0)
-        self.assertTrue(t_masked.max() <= 1.0)
-
-        self.assertTrue(t_mask.sum() > 0.0)
-
-    def test_np_inputs(self):
-        im_np = np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)
-        im_pil = Image.fromarray(im_np)
-        mask_np = np.random.randint(0, 255, (32, 32), dtype=np.uint8) > 127.5
-        mask_pil = Image.fromarray((mask_np * 255).astype(np.uint8))
-
-        t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np)
-        t_mask_pil, t_masked_pil = prepare_mask_and_masked_image(im_pil, mask_pil)
-
-        self.assertTrue((t_mask_np == t_mask_pil).all())
-        self.assertTrue((t_masked_np == t_masked_pil).all())
-
-    def test_torch_3D_2D_inputs(self):
-        im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8)
-        mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5
-        im_np = im_tensor.numpy().transpose(1, 2, 0)
-        mask_np = mask_tensor.numpy()
-
-        t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor)
-
t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_3D_3D_inputs(self): - im_tensor = torch.randint(0, 255, (3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy().transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_2D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy() - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_3D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_4D_4D_inputs(self): - im_tensor = torch.randint(0, 255, (1, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (1, 1, 32, 32), dtype=torch.uint8) > 127.5 - im_np = im_tensor.numpy()[0].transpose(1, 2, 0) - mask_np = mask_tensor.numpy()[0][0] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - t_mask_np, t_masked_np = prepare_mask_and_masked_image(im_np, mask_np) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_batch_4D_3D(self): - im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (2, 32, 32), dtype=torch.uint8) > 127.5 - - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy() for mask in mask_tensor] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - nps = [prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = torch.cat([n[0] for n in nps]) - t_masked_np = torch.cat([n[1] for n in nps]) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_torch_batch_4D_4D(self): - im_tensor = torch.randint(0, 255, (2, 3, 32, 32), dtype=torch.uint8) - mask_tensor = torch.randint(0, 255, (2, 1, 32, 32), dtype=torch.uint8) > 127.5 - - im_nps = [im.numpy().transpose(1, 2, 0) for im in im_tensor] - mask_nps = [mask.numpy()[0] for mask in mask_tensor] - - t_mask_tensor, t_masked_tensor = prepare_mask_and_masked_image(im_tensor / 127.5 - 1, mask_tensor) - nps = 
[prepare_mask_and_masked_image(i, m) for i, m in zip(im_nps, mask_nps)] - t_mask_np = torch.cat([n[0] for n in nps]) - t_masked_np = torch.cat([n[1] for n in nps]) - - self.assertTrue((t_mask_tensor == t_mask_np).all()) - self.assertTrue((t_masked_tensor == t_masked_np).all()) - - def test_shape_mismatch(self): - # test height and width - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(3, 32, 32), torch.randn(64, 64)) - # test batch dim - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 64, 64)) - # test batch dim - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.randn(2, 3, 32, 32), torch.randn(4, 1, 64, 64)) - - def test_type_mismatch(self): - # test tensors-only - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.rand(3, 32, 32).numpy()) - # test tensors-only - with self.assertRaises(TypeError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32).numpy(), torch.rand(3, 32, 32)) - - def test_channels_first(self): - # test channels first for 3D tensors - with self.assertRaises(AssertionError): - prepare_mask_and_masked_image(torch.rand(32, 32, 3), torch.rand(3, 32, 32)) - - def test_tensor_range(self): - # test im <= 1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.ones(3, 32, 32) * 2, torch.rand(32, 32)) - # test im >= -1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.ones(3, 32, 32) * (-2), torch.rand(32, 32)) - # test mask <= 1 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * 2) - # test mask >= 0 - with self.assertRaises(ValueError): - prepare_mask_and_masked_image(torch.rand(3, 32, 32), torch.ones(32, 32) * -1) diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py deleted file mode 100644 index 15d94414ea2fa881a29d4876cb072da56492a0a0..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ /dev/null @@ -1,538 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
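The prepare_mask_and_masked_image checks that close the previous file pin down a simple normalization contract. A sketch of that contract, restated from the assertions above (illustrative only, not part of the removed files):

import numpy as np
import torch
from PIL import Image
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import (
    prepare_mask_and_masked_image,
)

im = Image.fromarray(np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8))
mask = Image.fromarray(((np.random.rand(32, 32) > 0.5) * 255).astype(np.uint8))

mask_t, masked_t = prepare_mask_and_masked_image(im, mask)

# Both outputs are 4D float32 tensors: the mask is (1, 1, H, W) with values in [0, 1],
# the masked image is (1, 3, H, W) rescaled to [-1, 1], and PIL, numpy, and torch
# inputs of matching shapes all normalize to the same tensors.
assert mask_t.shape == (1, 1, 32, 32)
assert masked_t.shape == (1, 3, 32, 32)
assert 0.0 <= float(mask_t.min()) and float(mask_t.max()) <= 1.0
assert -1.0 <= float(masked_t.min()) and float(masked_t.max()) <= 1.0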
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInpaintPipelineLegacy, - UNet2DConditionModel, - UNet2DModel, - VQModel, -) -from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_uncond_unet(self): - torch.manual_seed(0) - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - return model - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_cond_unet_inpaint(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vq_model(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_stable_diffusion_inpaint_legacy(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - negative_prompt = "french fries" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - prompt, - negative_prompt=negative_prompt, - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.4941, 0.5396, 0.4689, 0.6338, 0.5392, 0.4094, 0.5477, 0.5904, 0.5165]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self): - device = "cpu" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - init_image = 
Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - # test num_images_per_prompt=1 (default) - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (1, 32, 32, 3) - - # test num_images_per_prompt=1 (default) for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - - assert images.shape == (batch_size, 32, 32, 3) - - # test num_images_per_prompt for single prompt - num_images_per_prompt = 2 - images = sd_pipe( - prompt, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (num_images_per_prompt, 32, 32, 3) - - # test num_images_per_prompt for batch of prompts - batch_size = 2 - images = sd_pipe( - [prompt] * batch_size, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - num_images_per_prompt=num_images_per_prompt, - ).images - - assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3) - - -@slow -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_inpaint_legacy_pndm(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5665, 0.6117, 0.6430, 0.4057, 0.4594, 0.5658, 0.1596, 0.3106, 0.4305]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_legacy_k_lms(self): - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - 
pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.4534, 0.4467, 0.4329, 0.4329, 0.4339, 0.4220, 0.4244, 0.4332, 0.4426]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_inpaint_legacy_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5977, 1.5449, 1.0586, -0.3250, 0.7383, -0.0862, 0.4631, -0.2571, -1.1289]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.5190, 1.1621, 0.6885, 0.2424, 0.3337, -0.1617, 0.6914, -0.1957, -0.5474]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3 - - callback_fn.has_been_called = False - - pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - -@nightly -@require_torch_gpu -class StableDiffusionInpaintLegacyPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint/input_bench_mask.png" - ) - inputs = { - "prompt": "A red cat sitting on a park bench", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 50, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_inpaint_pndm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_ddim(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - 
image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_lms(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_inpaint_dpm(self): - sd_pipe = StableDiffusionInpaintPipelineLegacy.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 30 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_inpaint_legacy/stable_diffusion_1_5_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py deleted file mode 100644 index 25b0c6ea1432972a6303423ea8517420a6ab9499..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ /dev/null @@ -1,350 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
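The removed suite below covers StableDiffusionInstructPix2PixPipeline, whose distinguishing knob is image_guidance_scale: classifier-free guidance on the source image in addition to the usual text guidance. A minimal sketch mirroring the suite's get_inputs(); a CUDA device is assumed:

import torch
from diffusers import StableDiffusionInstructPix2PixPipeline
from diffusers.utils import load_image

image = load_image(
    "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
)

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
)
pipe.to("cuda")

# guidance_scale pulls the output toward the edit instruction, while
# image_guidance_scale anchors it to the input photo (the tests use 1.0).
edited = pipe(
    prompt="turn him into a cyborg",
    image=image,
    num_inference_steps=10,
    guidance_scale=7.5,
    image_guidance_scale=1.0,
    generator=torch.manual_seed(0),
).images[0]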
- -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionInstructPix2PixPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInstructPix2PixPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=8, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB") - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "image_guidance_scale": 1, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInstructPix2PixPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.7318, 0.3723, 0.4662, 0.623, 0.5770, 0.5014, 0.4281, 0.5550, 0.4813]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "french fries"
-        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
-        image = output.images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.7323, 0.3688, 0.4611, 0.6255, 0.5746, 0.5017, 0.433, 0.5553, 0.4827])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_multiple_init_images(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["prompt"] = [inputs["prompt"]] * 2
-
-        image = np.array(inputs["image"]).astype(np.float32) / 255.0
-        image = torch.from_numpy(image).unsqueeze(0).to(device)
-        image = image.permute(0, 3, 1, 2)
-        inputs["image"] = image.repeat(2, 1, 1, 1)
-
-        image = sd_pipe(**inputs).images
-        image_slice = image[-1, -3:, -3:, -1]
-
-        assert image.shape == (2, 32, 32, 3)
-        expected_slice = np.array([0.606, 0.5712, 0.5099, 0.598, 0.5805, 0.7205, 0.6793, 0.554, 0.5607])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-    def test_stable_diffusion_pix2pix_euler(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        components["scheduler"] = EulerAncestralDiscreteScheduler(
-            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
-        )
-        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.726, 0.3902, 0.4868, 0.585, 0.5672, 0.511, 0.3906, 0.551, 0.4846])
-
-        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
-
-@slow
-@require_torch_gpu
-class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def get_inputs(self, seed=0):
-        generator = torch.manual_seed(seed)
-        image = load_image(
-            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
-        )
-        inputs = {
-            "prompt": "turn him into a cyborg",
-            "image": image,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "guidance_scale": 7.5,
-            "image_guidance_scale": 1.0,
-            "output_type": "numpy",
-        }
-        return inputs
-
-    def test_stable_diffusion_pix2pix_default(self):
-        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
-            "timbrooks/instruct-pix2pix", safety_checker=None
-        )
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing()
-
-        inputs = self.get_inputs()
-        image = pipe(**inputs).images
-        image_slice =
image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5902, 0.6015, 0.6027, 0.5983, 0.6092, 0.6061, 0.5765, 0.5785, 0.5555]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_k_lms(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.6578, 0.6817, 0.6972, 0.6761, 0.6856, 0.6916, 0.6428, 0.6516, 0.6301]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_ddim(self): - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3828, 0.3834, 0.3818, 0.3792, 0.3865, 0.3752, 0.3792, 0.3847, 0.3753]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.2463, -0.4644, -0.9756, 1.5176, 1.4414, 0.7866, 0.9897, 0.8521, 0.7983]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([-0.2644, -0.4626, -0.9653, 1.5176, 1.4551, 0.7686, 0.9805, 0.8452, 0.8115]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.2 GB is 
allocated - assert mem_bytes < 2.2 * 10**9 - - def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self): - inputs = self.get_inputs() - # resize to resolution that is divisible by 8 but not 16 or 32 - inputs["image"] = inputs["image"].resize((504, 504)) - - model_id = "timbrooks/instruct-pix2pix" - pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( - model_id, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - output = pipe(**inputs) - image = output.images[0] - - image_slice = image[255:258, 383:386, -1] - - assert image.shape == (504, 504, 3) - expected_slice = np.array([0.2726, 0.2529, 0.2664, 0.2655, 0.2641, 0.2642, 0.2591, 0.2649, 0.2590]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py deleted file mode 100644 index 546b1d21252c6ef84d9d181839b7976d3d376082..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py +++ /dev/null @@ -1,106 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
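The removed suite below covers StableDiffusionKDiffusionPipeline, which delegates sampling to the external k-diffusion library rather than a diffusers scheduler; samplers are selected by name with set_scheduler(). A minimal sketch built from the calls the tests make (assumes the k-diffusion package is installed and a CUDA device is available):

import torch
from diffusers import StableDiffusionKDiffusionPipeline

pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base")
pipe.to("cuda")

# Sampler names follow k-diffusion's conventions, e.g. "sample_euler" or "sample_dpmpp_2m".
pipe.set_scheduler("sample_dpmpp_2m")

image = pipe(
    ["A painting of a squirrel eating a burger"],
    generator=torch.manual_seed(0),
    guidance_scale=7.5,
    num_inference_steps=15,
    use_karras_sigmas=True,  # Karras noise schedule, as exercised by the sigmas test below
).images[0]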
- -import gc -import unittest - -import numpy as np -import torch - -from diffusers import StableDiffusionKDiffusionPipeline -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@slow -@require_torch_gpu -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_1(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_euler") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0447, 0.0492, 0.0468, 0.0408, 0.0383, 0.0408, 0.0354, 0.0380, 0.0339]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_2(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_euler") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=9.0, num_inference_steps=20, output_type="np") - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1237, 0.1320, 0.1438, 0.1359, 0.1390, 0.1132, 0.1277, 0.1175, 0.1112]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-1 - - def test_stable_diffusion_karras_sigmas(self): - sd_pipe = StableDiffusionKDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - sd_pipe.set_scheduler("sample_dpmpp_2m") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=7.5, - num_inference_steps=15, - output_type="np", - use_karras_sigmas=True, - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array( - [0.11381689, 0.12112921, 0.1389457, 0.12549606, 0.1244964, 0.10831517, 0.11562866, 0.10867816, 0.10499048] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py deleted file mode 100644 index 2d9b1e54ee6ebaddf8d6ca133ef322ed06853980..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ /dev/null @@ -1,252 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - PNDMScheduler, - StableDiffusionModelEditingPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionModelEditingPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.5217179, 0.50658035, 0.5003239, 0.41109088, 0.3595158, 0.46607107, 0.5323504, 0.5335255, 0.49187922] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.546259, 0.5108156, 0.50897664, 0.41931948, 0.3748669, 0.4669299, 0.5427151, 0.54561913, 0.49353] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.47106352, 0.53579676, 0.45798016, 0.514294, 0.56856745, 0.4788605, 0.54380214, 0.5046455, 0.50404465] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_model_editing_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionModelEditingPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - -@slow -@require_torch_gpu -class StableDiffusionModelEditingSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "A field of roses", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_model_editing_default(self): - model_ckpt = "CompVis/stable-diffusion-v1-4" - pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - expected_slice = np.array( - [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 - - # make sure image changes after editing - pipe.edit_model("A pack of roses", "A pack of blue roses") - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(expected_slice - image_slice).max() > 1e-1 - - def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - model_ckpt = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionModelEditingPipeline.from_pretrained( - model_ckpt, scheduler=scheduler, safety_checker=None - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 4.4 GB is allocated - assert mem_bytes < 4.4 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py deleted file mode 100644 index af26e19cca732ee3144bb38929949499d41f64b5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ /dev/null @@ -1,342 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
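-
-# Background for the tests below: StableDiffusionPanoramaPipeline implements
-# MultiDiffusion-style panorama generation. Roughly, every denoising step runs
-# the UNet over overlapping windows of a wide latent and averages the
-# overlapping regions, which is why the slow tests further down expect
-# (1, 512, 2048, 3) outputs from a standard 512-resolution checkpoint.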
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPanoramaPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPanoramaPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - # Setting height and width to None to prevent OOMs on CPU. 
- "height": None, - "width": None, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.5101, 0.5006, 0.4962, 0.3995, 0.3501, 0.4632, 0.5339, 0.525, 0.4878]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.5326, 0.5009, 0.5074, 0.4133, 0.371, 0.464, 0.5432, 0.5429, 0.4896]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [0.48235387, 0.5423796, 0.46016198, 0.5377287, 0.5803722, 0.4876525, 0.5515428, 0.5045897, 0.50709957] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() - sd_pipe = StableDiffusionPanoramaPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images - - -@slow -@require_torch_gpu -class StableDiffusionPanoramaSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - inputs = { - "prompt": "a photo of the dolomites", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_panorama_default(self): - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 2048, 3) - - expected_slice = np.array( - [ - 0.36968392, - 0.27025372, - 0.32446766, - 0.28379387, - 0.36363274, - 0.30733347, - 0.27100027, - 0.27054125, - 0.25536096, - ] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-2 - - def test_stable_diffusion_panorama_k_lms(self): - pipe = StableDiffusionPanoramaPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 2048, 3) - - expected_slice = np.array( - [ - [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - ] - ] - ) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 - - def test_stable_diffusion_panorama_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - - expected_slice = np.array( - [ - 0.18681869, - 0.33907816, - 0.5361276, - 0.14432865, - -0.02856611, - -0.73941123, - 0.23397987, - 0.47322682, - -0.37823164, - ] - ) - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 256) - latents_slice = latents[0, -3:, -3:, -1] - - expected_slice = np.array( - [ - 0.18539645, - 0.33987248, - 0.5378559, - 0.14437142, - -0.02455261, - -0.7338317, - 0.23990755, - 0.47356272, - -0.3786505, - ] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - model_ckpt = "stabilityai/stable-diffusion-2-base" - scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") - pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def 
test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
-        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
-
-        model_ckpt = "stabilityai/stable-diffusion-2-base"
-        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
-        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.enable_attention_slicing(1)
-        pipe.enable_sequential_cpu_offload()
-
-        inputs = self.get_inputs()
-        _ = pipe(**inputs)
-
-        mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 5.5 GB is allocated
-        assert mem_bytes < 5.5 * 10**9
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
deleted file mode 100644
index 46b93a0589ce1775e26921a6cc5dcdcf464c4b29..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
+++ /dev/null
@@ -1,470 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
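-
-# Background for the tests below: StableDiffusionPix2PixZeroPipeline edits an
-# image without fine-tuning. The edit direction is derived from pre-computed
-# text embeddings for a source concept (e.g. "cat") and a target concept
-# (e.g. "dog"), while cross-attention guidance keeps the layout of the original
-# image intact. Real images are first mapped back to latents with
-# DDIMInverseScheduler via pipe.invert(), which the InversionPipelineSlowTests
-# further down exercise directly.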
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMInverseScheduler, - DDIMScheduler, - DDPMScheduler, - EulerAncestralDiscreteScheduler, - LMSDiscreteScheduler, - StableDiffusionPix2PixZeroPipeline, - UNet2DConditionModel, -) -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPix2PixZeroPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - @classmethod - def setUpClass(cls): - cls.source_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt" - ) - - cls.target_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt" - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler() - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - "inverse_scheduler": None, - "caption_generator": None, - "caption_processor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - generator = torch.manual_seed(seed) - - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - assert image.shape == (1, 64, 64, 3) - expected_slice = 
np.array([0.5184, 0.503, 0.4917, 0.4022, 0.3455, 0.464, 0.5324, 0.5323, 0.4894]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = sd_pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5464, 0.5072, 0.5012, 0.4124, 0.3624, 0.466, 0.5413, 0.5468, 0.4927]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" - ) - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5114, 0.5051, 0.5222, 0.5279, 0.5037, 0.5156, 0.4604, 0.4966, 0.504]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_pix2pix_zero_ddpm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = DDPMScheduler() - sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5185, 0.5027, 0.492, 0.401, 0.3445, 0.464, 0.5321, 0.5327, 0.4892]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - # Non-determinism caused by the scheduler optimizing the latent inputs during inference - @unittest.skip("non-deterministic pipeline") - def test_inference_batch_single_identical(self): - return super().test_inference_batch_single_identical() - - -@slow -@require_torch_gpu -class StableDiffusionPix2PixZeroPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - cls.source_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt" - ) - - cls.target_embeds = load_pt( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt" - ) - - def get_inputs(self, seed=0): - generator = torch.manual_seed(seed) - - inputs = { - "prompt": "turn him into a cyborg", - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "cross_attention_guidance_amount": 0.15, - "source_embeds": self.source_embeds, - "target_embeds": self.target_embeds, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_pix2pix_zero_default(self): - pipe = 
StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.5742, 0.5757, 0.5747, 0.5781, 0.5688, 0.5713, 0.5742, 0.5664, 0.5747]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_pix2pix_zero_k_lms(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.6367, 0.5459, 0.5146, 0.5479, 0.4905, 0.4753, 0.4961, 0.4629, 0.4624]) - - assert np.abs(expected_slice - image_slice).max() < 5e-2 - - def test_stable_diffusion_pix2pix_zero_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.1345, 0.268, 0.1539, 0.0726, 0.0959, 0.2261, -0.2673, 0.0277, -0.2062]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.1393, 0.2637, 0.1617, 0.0724, 0.0987, 0.2271, -0.2666, 0.0299, -0.2104]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 3 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs() - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 8.2 GB is allocated - assert mem_bytes < 8.2 * 10**9 - - -@slow -@require_torch_gpu -class 
InversionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @classmethod - def setUpClass(cls): - raw_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png" - ) - - raw_image = raw_image.convert("RGB").resize((512, 512)) - - cls.raw_image = raw_image - - def test_stable_diffusion_pix2pix_inversion(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) - inv_latents = output[0] - - image_slice = inv_latents[0, -3:, -3:, -1].flatten() - - assert inv_latents.shape == (1, 4, 64, 64) - expected_slice = np.array([0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666]) - - assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2 - - def test_stable_diffusion_2_pix2pix_inversion(self): - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10) - inv_latents = output[0] - - image_slice = inv_latents[0, -3:, -3:, -1].flatten() - - assert inv_latents.shape == (1, 4, 64, 64) - expected_slice = np.array([0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050]) - - assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2 - - def test_stable_diffusion_pix2pix_full(self): - # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog.png - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.npy" - ) - - pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16 - ) - pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) - - caption = "a photography of a cat with flowers" - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipe.invert(caption, image=self.raw_image, generator=generator) - inv_latents = output[0] - - source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] - target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] - - source_embeds = pipe.get_embeds(source_prompts) - target_embeds = pipe.get_embeds(target_prompts) - - image = pipe( - caption, - source_embeds=source_embeds, - target_embeds=target_embeds, - 
num_inference_steps=50,
-            cross_attention_guidance_amount=0.15,
-            generator=generator,
-            latents=inv_latents,
-            negative_prompt=caption,
-            output_type="np",
-        ).images
-
-        mean_diff = np.abs(expected_image - image).mean()
-        assert mean_diff < 0.05
-
-    def test_stable_diffusion_2_pix2pix_full(self):
-        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog_2.png
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog_2.npy"
-        )
-
-        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
-
-        caption = "a photography of a cat with flowers"
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.set_progress_bar_config(disable=None)
-
-        generator = torch.manual_seed(0)
-        output = pipe.invert(caption, image=self.raw_image, generator=generator)
-        inv_latents = output[0]
-
-        source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
-        target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]
-
-        source_embeds = pipe.get_embeds(source_prompts)
-        target_embeds = pipe.get_embeds(target_prompts)
-
-        image = pipe(
-            caption,
-            source_embeds=source_embeds,
-            target_embeds=target_embeds,
-            num_inference_steps=125,
-            cross_attention_guidance_amount=0.015,
-            generator=generator,
-            latents=inv_latents,
-            negative_prompt=caption,
-            output_type="np",
-        ).images
-
-        mean_diff = np.abs(expected_image - image).mean()
-        assert mean_diff < 0.25
diff --git a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
deleted file mode 100644
index abaefbcad0118cf494d10e6ba4c44638af9d285d..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
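-
-# Background for the tests below: StableDiffusionSAGPipeline augments
-# classifier-free guidance with self-attention guidance (SAG). In essence, the
-# model's own self-attention maps are used to blur salient regions and steer
-# denoising away from that degraded prediction; sag_scale controls the strength
-# of the correction, roughly analogous to how guidance_scale controls prompt
-# adherence.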
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - StableDiffusionSAGPipeline, - UNet2DConditionModel, -) -from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionSAGPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionSAGPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - test_cpu_offload = False - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": ".", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 1.0, - "sag_scale": 1.0, - "output_type": "numpy", - } - return inputs - - -@slow -@require_torch_gpu -class StableDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_1(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." 
- generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1568, 0.1738, 0.1695, 0.1693, 0.1507, 0.1705, 0.1547, 0.1751, 0.1949]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], generator=generator, guidance_scale=7.5, sag_scale=1.0, num_inference_steps=20, output_type="np" - ) - - image = output.images - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3459, 0.2876, 0.2537, 0.3002, 0.2671, 0.2160, 0.3026, 0.2262, 0.2371]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_2_non_square(self): - sag_pipe = StableDiffusionSAGPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base") - sag_pipe = sag_pipe.to(torch_device) - sag_pipe.set_progress_bar_config(disable=None) - - prompt = "." - generator = torch.manual_seed(0) - output = sag_pipe( - [prompt], - width=768, - height=512, - generator=generator, - guidance_scale=7.5, - sag_scale=1.0, - num_inference_steps=20, - output_type="np", - ) - - image = output.images - - assert image.shape == (1, 512, 768, 3) diff --git a/diffusers/tests/pipelines/stable_diffusion_2/__init__.py b/diffusers/tests/pipelines/stable_diffusion_2/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py deleted file mode 100644 index fa3c3d628e4f1ec74c6729db436e4f20c0e714c5..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ /dev/null @@ -1,563 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
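-
-# Background for the tests below: these cover the plain text-to-image
-# StableDiffusionPipeline against the Stable Diffusion 2.x base checkpoints.
-# Scheduler variants are exercised by swapping the scheduler in place while
-# reusing its config, e.g.:
-#
-#     pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)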
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, - logging, -) -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2PipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5649, 0.6022, 0.4804, 0.5270, 0.5585, 0.4643, 0.5159, 0.4963, 0.4793]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure 
determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler(skip_prk_steps=True) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5099, 0.5677, 0.4671, 0.5128, 0.5697, 0.4676, 0.5277, 0.4964, 0.4946]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_lms(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler_ancestral(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerAncestralDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4715, 0.5376, 0.4569, 0.5224, 0.5734, 0.4797, 0.5465, 0.5074, 0.5046]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - components["scheduler"] = EulerDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4717, 0.5376, 0.4568, 0.5225, 0.5734, 0.4797, 0.5467, 0.5074, 0.5043]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_long_prompt(self): - components = self.get_dummy_components() - components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) - sd_pipe = StableDiffusionPipeline(**components) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - do_classifier_free_guidance = True - negative_prompt = None - num_images_per_prompt = 1 - logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion") - - prompt = 25 * "@" - with CaptureLogger(logger) as cap_logger_3: - text_embeddings_3 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - prompt = 100 * "@" - with 
CaptureLogger(logger) as cap_logger: - text_embeddings = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - negative_prompt = "Hello" - with CaptureLogger(logger) as cap_logger_2: - text_embeddings_2 = sd_pipe._encode_prompt( - prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt - ) - - assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape - assert text_embeddings.shape[1] == 77 - - assert cap_logger.out == cap_logger_2.out - # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25 - assert cap_logger.out.count("@") == 25 - assert cap_logger_3.out == "" - - -@slow -@require_torch_gpu -class StableDiffusion2PipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_default_ddim(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_pndm(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_k_lms(self): - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.10440, 0.13115, 0.11100, 0.10141, 0.11440, 0.07215, 0.11332, 0.09693, 0.10006]) - assert np.abs(image_slice - expected_slice).max() < 1e-4 - - def test_stable_diffusion_attention_slicing(self): - torch.cuda.reset_peak_memory_stats() - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # enable attention slicing - pipe.enable_attention_slicing() - inputs = self.get_inputs(torch_device, 
dtype=torch.float16) - image_sliced = pipe(**inputs).images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 3.3 GB is allocated - assert mem_bytes < 3.3 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - inputs = self.get_inputs(torch_device, dtype=torch.float16) - image = pipe(**inputs).images - - # make sure that more than 3.3 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 3.3 * 10**9 - assert np.abs(image_sliced - image).max() < 1e-3 - - def test_stable_diffusion_text2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.3862, -0.4507, -1.1729, 0.0686, -1.1045, 0.7124, -1.8301, 0.1903, 1.2773] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 64, 64) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [0.2720, -0.1863, -0.7383, -0.5029, -0.7534, 0.3970, -0.7646, 0.4468, 1.2686] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == inputs["num_inference_steps"] - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 - - def test_stable_diffusion_pipeline_with_model_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - inputs = self.get_inputs(torch_device, dtype=torch.float16) - - # Normal inference - - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - outputs = pipe(**inputs) - mem_bytes = torch.cuda.max_memory_allocated() - - # With model offloading - - # Reload but don't move to cuda - pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-base", - torch_dtype=torch.float16, - ) - pipe.unet.set_default_attn_processor() - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - 
torch.cuda.reset_peak_memory_stats() - - pipe.enable_model_cpu_offload() - pipe.set_progress_bar_config(disable=None) - inputs = self.get_inputs(torch_device, dtype=torch.float16) - outputs_offloaded = pipe(**inputs) - mem_bytes_offloaded = torch.cuda.max_memory_allocated() - - assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3 - assert mem_bytes_offloaded < mem_bytes - assert mem_bytes_offloaded < 3 * 10**9 - for module in pipe.text_encoder, pipe.unet, pipe.vae: - assert module.device == torch.device("cpu") - - # With attention slicing - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe.enable_attention_slicing() - _ = pipe(**inputs) - mem_bytes_slicing = torch.cuda.max_memory_allocated() - assert mem_bytes_slicing < mem_bytes_offloaded - - -@nightly -@require_torch_gpu -class StableDiffusion2PipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=generator_device).manual_seed(seed) - latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) - latents = torch.from_numpy(latents).to(device=device, dtype=dtype) - inputs = { - "prompt": "a photograph of an astronaut riding a horse", - "latents": latents, - "generator": generator, - "num_inference_steps": 50, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_2_0_default_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_0_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_2_1_default_pndm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_ddim(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_lms(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - 
image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_euler(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_euler.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_stable_diffusion_dpm(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base").to(torch_device) - sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - inputs["num_inference_steps"] = 25 - image = sd_pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_2_text2img/stable_diffusion_2_1_base_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py deleted file mode 100644 index 780abf304a469ddefbe35d5f5132367fe3c8213d..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ /dev/null @@ -1,178 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
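-
-# Background for the tests below: StableDiffusionAttendAndExcitePipeline
-# optimizes the latents during the early denoising steps so that the
-# cross-attention maps of the tokens listed in token_indices stay active,
-# nudging every named subject to actually appear. max_iter_to_alter bounds how
-# many steps apply this optimization, and thresholds maps a step index to the
-# minimum attention mass required at that step.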
-
-import gc
-import unittest
-
-import numpy as np
-import torch
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKL,
-    DDIMScheduler,
-    StableDiffusionAttendAndExcitePipeline,
-    UNet2DConditionModel,
-)
-from diffusers.utils import load_numpy, skip_mps, slow
-from diffusers.utils.testing_utils import require_torch_gpu
-
-from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ...test_pipelines_common import PipelineTesterMixin
-
-
-@skip_mps
-class StableDiffusionAttendAndExcitePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = StableDiffusionAttendAndExcitePipeline
-    test_attention_slicing = False
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"})
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-            # SD2-specific config below
-            attention_head_dim=(2, 4),
-            use_linear_projection=True,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            clip_sample=False,
-            set_alpha_to_one=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            sample_size=128,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-            # SD2-specific config below
-            hidden_act="gelu",
-            projection_dim=512,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        components = {
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "safety_checker": None,
-            "feature_extractor": None,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "a cat and a frog",
-            "token_indices": [2, 5],
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-            "output_type": "numpy",
-            "max_iter_to_alter": 2,
-            "thresholds": {0: 0.7},
-        }
-        return inputs
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        self.assertEqual(image.shape, (1, 64, 64, 3))
-        expected_slice = np.array(
-            [0.5644937, 0.60543084, 0.48239064, 0.5206757, 0.55623394, 0.46045133, 0.5100435, 0.48919064, 0.4759359]
-        )
-        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
-        self.assertLessEqual(max_diff, 1e-3)
-
-    def test_inference_batch_consistent(self):
-        # NOTE: Larger batch sizes cause this test to time out, so only test on smaller batches.
-        self._test_inference_batch_consistent(batch_sizes=[2, 4])
-
-
-@require_torch_gpu
-@slow
-class StableDiffusionAttendAndExcitePipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_attend_and_excite_fp16(self):
-        generator = torch.manual_seed(51)
-
-        pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
-        )
-        pipe.to("cuda")
-
-        prompt = "a painting of an elephant with glasses"
-        token_indices = [5, 7]
-
-        image = pipe(
-            prompt=prompt,
-            token_indices=token_indices,
-            guidance_scale=7.5,
-            generator=generator,
-            num_inference_steps=5,
-            max_iter_to_alter=5,
-            output_type="numpy",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/attend-and-excite/elephant_glasses.npy"
-        )
-        assert np.abs(expected_image - image).max() < 5e-1
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
deleted file mode 100644
index c2ad239f6888027a0e39c844c826f9482770b754..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
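# In the Attend-and-Excite integration test above, `token_indices=[5, 7]` names the
# positions of "elephant" and "glasses" in the CLIP-tokenized prompt (position 0 is
# the BOS token); the pipeline strengthens cross-attention on those tokens during
# denoising. A hedged sketch of how such indices can be located; the tokenizer
# checkpoint is an assumption, chosen to match the SD v1-4 text encoder:
from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
ids = tokenizer("a painting of an elephant with glasses").input_ids
tokens = [tokenizer.decode([i]) for i in ids]
token_indices = [i for i, tok in enumerate(tokens) if tok.strip() in ("elephant", "glasses")]
# expected: [5, 7] for this prompt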
- -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import ( - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - DPTConfig, - DPTFeatureExtractor, - DPTForDepthEstimation, -) - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionDepth2ImgPipeline, - UNet2DConditionModel, -) -from diffusers.utils import ( - floats_tensor, - is_accelerate_available, - is_accelerate_version, - load_image, - load_numpy, - nightly, - slow, - torch_device, -) -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionDepth2ImgPipeline - test_save_load_optional_components = False - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} - required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=5, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - backbone_config = { - "global_padding": "same", - "layer_type": "bottleneck", - "depths": [3, 4, 9], - "out_features": ["stage1", "stage2", "stage3"], - "embedding_dynamic_padding": True, - "hidden_sizes": [96, 192, 384, 768], - "num_groups": 2, - } - depth_estimator_config = DPTConfig( - image_size=32, - patch_size=16, - num_channels=3, - hidden_size=32, - num_hidden_layers=4, - backbone_out_indices=(0, 1, 2, 3), - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - is_decoder=False, - initializer_range=0.02, - is_hybrid=True, - backbone_config=backbone_config, - backbone_featmap_shape=[1, 384, 24, 24], - ) - depth_estimator = DPTForDepthEstimation(depth_estimator_config) - feature_extractor = DPTFeatureExtractor.from_pretrained( - "hf-internal-testing/tiny-random-DPTForDepthEstimation" - ) - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "depth_estimator": depth_estimator, - "feature_extractor": feature_extractor, - } - return 
components - - def get_dummy_inputs(self, device, seed=0): - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) - image = image.cpu().permute(0, 2, 3, 1)[0] - image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "numpy", - } - return inputs - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 1e-4) - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_save_load_float16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for name, component in pipe_loaded.components.items(): - if hasattr(component, "dtype"): - self.assertTrue( - component.dtype == torch.float16, - f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(output - output_loaded).max() - self.assertLess(max_diff, 2e-2, "The output of the fp16 pipeline changed after saving and loading.") - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_float16_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.half() - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(torch_device) - pipe_fp16.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] - - max_diff = np.abs(output - output_fp16).max() - self.assertLess(max_diff, 1.3e-2, "The outputs of the fp16 and fp32 pipelines are too different.") - - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", - ) - def test_cpu_offload_forward_pass(self): - components = self.get_dummy_components() 
- pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_sequential_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results") - - def test_dict_tuple_outputs_equivalent(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0] - - max_diff = np.abs(output - output_tuple).max() - self.assertLess(max_diff, 1e-4) - - def test_progress_bar(self): - super().test_progress_bar() - - def test_stable_diffusion_depth2img_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - if torch_device == "mps": - expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546]) - else: - expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_depth2img_negative_prompt(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - negative_prompt = "french fries" - output = pipe(**inputs, negative_prompt=negative_prompt) - image = output.images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - if torch_device == "mps": - expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335]) - else: - expected_slice = np.array([0.6296, 0.5125, 0.3890, 0.4456, 0.5955, 0.4621, 0.3810, 0.5310, 0.4626]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def test_stable_diffusion_depth2img_multiple_init_images(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["prompt"] = [inputs["prompt"]] * 2 - inputs["image"] = 2 * [inputs["image"]] - image = pipe(**inputs).images - image_slice = image[-1, -3:, -3:, -1] - - assert image.shape == (2, 32, 32, 3) - - if torch_device == "mps": - expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551]) - else: - expected_slice = np.array([0.6267, 0.5232, 0.6001, 0.6738, 0.5029, 0.6429, 0.5364, 0.4159, 0.4674]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - def 
test_stable_diffusion_depth2img_pil(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - pipe = StableDiffusionDepth2ImgPipeline(**components) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - if torch_device == "mps": - expected_slice = np.array([0.53232, 0.47015, 0.40868, 0.45651, 0.4891, 0.4668, 0.4287, 0.48822, 0.47439]) - else: - expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - @skip_mps - def test_attention_slicing_forward_pass(self): - return super().test_attention_slicing_forward_pass() - - -@slow -@require_torch_gpu -class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_stable_diffusion_depth2img_pipeline_default(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.8260, 0.7747, 0.7421]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_depth2img_pipeline_k_lms(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.6370, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def test_stable_diffusion_depth2img_pipeline_ddim(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None - ) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs() - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 480, 640, 3) - expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.6420, 0.6522, 0.6555, 0.6436]) - - assert np.abs(expected_slice - image_slice).max() < 1e-4 - - def 
test_stable_diffusion_depth2img_intermediate_state(self): - number_of_steps = 0 - - def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 1: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 60, 80) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.7168, -1.5137, -0.1418, -2.9219, -2.7266, -2.4414, -2.1035, -3.0078, -1.7051] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 2: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 60, 80) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array( - [-0.7109, -1.5068, -0.1403, -2.9160, -2.7207, -2.4414, -2.1035, -3.0059, -1.7090] - ) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - callback_fn.has_been_called = False - - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - inputs = self.get_inputs(dtype=torch.float16) - pipe(**inputs, callback=callback_fn, callback_steps=1) - assert callback_fn.has_been_called - assert number_of_steps == 2 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - inputs = self.get_inputs(dtype=torch.float16) - _ = pipe(**inputs) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.9 GB is allocated - assert mem_bytes < 2.9 * 10**9 - - -@nightly -@require_torch_gpu -class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def get_inputs(self, device="cpu", dtype=torch.float32, seed=0): - generator = torch.Generator(device=device).manual_seed(seed) - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png" - ) - inputs = { - "prompt": "two tigers", - "image": init_image, - "generator": generator, - "num_inference_steps": 3, - "strength": 0.75, - "guidance_scale": 7.5, - "output_type": "numpy", - } - return inputs - - def test_depth2img_pndm(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_pndm.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_depth2img_ddim(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = 
self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_ddim.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_lms(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_lms.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 - - def test_img2img_dpm(self): - pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs() - inputs["num_inference_steps"] = 30 - image = pipe(**inputs).images[0] - - expected_image = load_numpy( - "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main" - "/stable_diffusion_depth2img/stable_diffusion_2_0_dpm_multi.npy" - ) - max_diff = np.abs(expected_image - image).max() - assert max_diff < 1e-3 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py deleted file mode 100644 index 8db8ec7810068aab4517fe2066e3fab10a52f6f7..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax.py +++ /dev/null @@ -1,99 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
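# The Flax integration tests in the file below use JAX's standard data-parallel
# recipe: replicate the parameters across devices, shard one prompt per device, and
# split the PRNG key so every device draws different noise. A minimal sketch of that
# setup (the prompt and step count are illustrative):
import jax
import jax.numpy as jnp
from flax.jax_utils import replicate
from flax.training.common_utils import shard

from diffusers import FlaxStableDiffusionPipeline

pipe, params = FlaxStableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2", revision="bf16", dtype=jnp.bfloat16
)
prompt_ids = pipe.prepare_inputs(["a painting of a squirrel"] * jax.device_count())
params = replicate(params)  # one full copy of the weights per device
prompt_ids = shard(prompt_ids)  # leading batch axis split across devices
rng = jax.random.split(jax.random.PRNGKey(0), jax.device_count())
images = pipe(prompt_ids, params, rng, num_inference_steps=25, jit=True)[0]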
- -import gc -import unittest - -from diffusers import FlaxDPMSolverMultistepScheduler, FlaxStableDiffusionPipeline -from diffusers.utils import is_flax_available, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusion2PipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_stable_diffusion_flax(self): - sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2", - revision="bf16", - dtype=jnp.bfloat16, - ) - - prompt = "A painting of a squirrel eating a burger" - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = sd_pipe.prepare_inputs(prompt) - - params = replicate(params) - prompt_ids = shard(prompt_ids) - - prng_seed = jax.random.PRNGKey(0) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - - images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0] - assert images.shape == (jax.device_count(), 1, 768, 768, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array([0.4238, 0.4414, 0.4395, 0.4453, 0.4629, 0.4590, 0.4531, 0.45508, 0.4512]) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 - - def test_stable_diffusion_dpm_flax(self): - model_id = "stabilityai/stable-diffusion-2" - scheduler, scheduler_params = FlaxDPMSolverMultistepScheduler.from_pretrained(model_id, subfolder="scheduler") - sd_pipe, params = FlaxStableDiffusionPipeline.from_pretrained( - model_id, - scheduler=scheduler, - revision="bf16", - dtype=jnp.bfloat16, - ) - params["scheduler"] = scheduler_params - - prompt = "A painting of a squirrel eating a burger" - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = sd_pipe.prepare_inputs(prompt) - - params = replicate(params) - prompt_ids = shard(prompt_ids) - - prng_seed = jax.random.PRNGKey(0) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - - images = sd_pipe(prompt_ids, params, prng_seed, num_inference_steps=25, jit=True)[0] - assert images.shape == (jax.device_count(), 1, 768, 768, 3) - - images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:]) - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array([0.4336, 0.42969, 0.4453, 0.4199, 0.4297, 0.4531, 0.4434, 0.4434, 0.4297]) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py deleted file mode 100644 index 432619a79ddd32d288893e3021a14ab6893b370a..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_flax_inpaint.py +++ /dev/null @@ -1,82 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -from diffusers import FlaxStableDiffusionInpaintPipeline -from diffusers.utils import is_flax_available, load_image, slow -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - - -@slow -@require_flax -class FlaxStableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - - def test_stable_diffusion_inpaint_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - - model_id = "xvjiarui/stable-diffusion-2-inpainting" - pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 50 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - init_image = num_samples * [init_image] - mask_image = num_samples * [mask_image] - prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs(prompt, init_image, mask_image) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, jax.device_count()) - prompt_ids = shard(prompt_ids) - processed_masked_images = shard(processed_masked_images) - processed_masks = shard(processed_masks) - - output = pipeline( - prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True - ) - - images = output.images.reshape(num_samples, 512, 512, 3) - - image_slice = images[0, 253:256, 253:256, -1] - - output_slice = jnp.asarray(jax.device_get(image_slice.flatten())) - expected_slice = jnp.array( - [0.3611307, 0.37649736, 0.3757408, 0.38213953, 0.39295167, 0.3841631, 0.41554978, 0.4137475, 0.4217084] - ) - print(f"output_slice: {output_slice}") - assert jnp.abs(output_slice - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py deleted file mode 100644 index ee059314904fc31e748e99db6674c76190530ef7..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ /dev/null @@ -1,255 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, slow - -from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionInpaintPipeline - params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS - batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=9, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - scheduler = PNDMScheduler(skip_prk_steps=True) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "safety_checker": None, - "feature_extractor": None, - } - return components - - def get_dummy_inputs(self, device, seed=0): - # TODO: use tensor inputs instead of PIL, this is here just to leave the old expected_slices untouched - image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - image = image.cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((64, 64)) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": init_image, - "mask_image": mask_image, - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": 
"numpy", - } - return inputs - - def test_stable_diffusion_inpaint(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = StableDiffusionInpaintPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4727, 0.5735, 0.3941, 0.5446, 0.5926, 0.4394, 0.5062, 0.4654, 0.4476]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class StableDiffusionInpaintPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_inpaint_pipeline(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint" - "/yellow_cat_sitting_on_a_park_bench.npy" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_inpaint_pipeline_fp16(self): - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint" - "/yellow_cat_sitting_on_a_park_bench_fp16.npy" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - safety_checker=None, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 5e-1 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-inpaint/init_image.png" - ) - mask_image = 
load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-inpaint/mask.png" - ) - - model_id = "stabilityai/stable-diffusion-2-inpainting" - pndm = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler") - pipe = StableDiffusionInpaintPipeline.from_pretrained( - model_id, - safety_checker=None, - scheduler=pndm, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - prompt = "Face of a yellow cat, high resolution, sitting on a park bench" - - generator = torch.manual_seed(0) - _ = pipe( - prompt=prompt, - image=init_image, - mask_image=mask_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.65 GB is allocated - assert mem_bytes < 2.65 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py deleted file mode 100644 index 38f4b053714bb048f412883b10cb12bfbb010e93..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ /dev/null @@ -1,229 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
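# The sequential-offload test above bounds peak VRAM by combining
# `enable_sequential_cpu_offload()` (weights are moved onto the GPU one submodule at
# a time) with `enable_attention_slicing(1)` (attention computed in single-head
# slices), then reads the high-water mark from CUDA's allocator. A minimal sketch of
# that measurement, assuming the same inpainting checkpoint as the test; the dummy
# blank image and mask are illustrative stand-ins for real inputs:
import torch
from PIL import Image

from diffusers import StableDiffusionInpaintPipeline

pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
)
pipe.enable_attention_slicing(1)
pipe.enable_sequential_cpu_offload()  # submodules are paged onto the GPU on demand

torch.cuda.reset_peak_memory_stats()
init_image = Image.new("RGB", (512, 512))
mask_image = Image.new("L", (512, 512), 255)  # inpaint everywhere
_ = pipe("Face of a yellow cat", image=init_image, mask_image=mask_image, num_inference_steps=2)
mem_bytes = torch.cuda.max_memory_allocated()  # peak allocation during the call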
- -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - EulerDiscreteScheduler, - StableDiffusionLatentUpscalePipeline, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionLatentUpscalePipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableDiffusionLatentUpscalePipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { - "height", - "width", - "cross_attention_kwargs", - "negative_prompt_embeds", - "prompt_embeds", - } - required_optional_params = PipelineTesterMixin.required_optional_params - {"num_images_per_prompt"} - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - test_cpu_offload = True - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 4 - sizes = (16, 16) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - def get_dummy_components(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - act_fn="gelu", - attention_head_dim=8, - norm_num_groups=None, - block_out_channels=[32, 32, 64, 64], - time_cond_proj_dim=160, - conv_in_kernel=1, - conv_out_kernel=1, - cross_attention_dim=32, - down_block_types=( - "KDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - "KCrossAttnDownBlock2D", - ), - in_channels=8, - mid_block_type=None, - only_cross_attention=False, - out_channels=5, - resnet_time_scale_shift="scale_shift", - time_embedding_type="fourier", - timestep_post_act="gelu", - up_block_types=("KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KCrossAttnUpBlock2D", "KUpBlock2D"), - ) - vae = AutoencoderKL( - block_out_channels=[32, 32, 64, 64], - in_channels=3, - out_channels=3, - down_block_types=[ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - ], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - scheduler = EulerDiscreteScheduler(prediction_type="sample") - text_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="quick_gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": model.eval(), - "vae": vae.eval(), - "scheduler": scheduler, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "image": self.dummy_image.cpu(), - "generator": generator, - "num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_inference(self): - device = "cpu" - - components = 
self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        image = pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        self.assertEqual(image.shape, (1, 256, 256, 3))
-        expected_slice = np.array(
-            [0.47222412, 0.41921633, 0.44717434, 0.46874192, 0.42588258, 0.46150726, 0.4677534, 0.45583832, 0.48579055]
-        )
-        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
-        self.assertLessEqual(max_diff, 1e-3)
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(relax_max_difference=False)
-
-
-@require_torch_gpu
-@slow
-class StableDiffusionLatentUpscalePipelineIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_latent_upscaler_fp16(self):
-        generator = torch.manual_seed(33)
-
-        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
-        pipe.to("cuda")
-
-        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
-            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
-        )
-        upscaler.to("cuda")
-
-        prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic"
-
-        low_res_latents = pipe(prompt, generator=generator, output_type="latent").images
-
-        image = upscaler(
-            prompt=prompt,
-            image=low_res_latents,
-            num_inference_steps=20,
-            guidance_scale=0,
-            generator=generator,
-            output_type="np",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/astronaut_1024.npy"
-        )
-        assert np.abs(expected_image - image).mean() < 5e-2
-
-    def test_latent_upscaler_fp16_image(self):
-        generator = torch.manual_seed(33)
-
-        upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(
-            "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16
-        )
-        upscaler.to("cuda")
-
-        prompt = "the temple of fire by Ross Tran and Gerardo Dottori, oil on canvas"
-
-        low_res_img = load_image(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_512.png"
-        )
-
-        image = upscaler(
-            prompt=prompt,
-            image=low_res_img,
-            num_inference_steps=20,
-            guidance_scale=0,
-            generator=generator,
-            output_type="np",
-        ).images[0]
-
-        expected_image = load_numpy(
-            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/latent-upscaler/fire_temple_1024.npy"
-        )
-        assert np.abs(expected_image - image).max() < 5e-2
diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
deleted file mode 100644
index b8e7b858130bfd7ce9d8189d30a71cdd86e00b7e..0000000000000000000000000000000000000000
--- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py
+++ /dev/null
@@ -1,362 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from PIL import Image -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel -from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet_upscale(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 32, 64), - layers_per_block=2, - sample_size=32, - in_channels=7, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=8, - use_linear_projection=True, - only_cross_attention=(True, True, False), - num_class_embeds=100, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=512, - ) - return CLIPTextModel(config) - - def test_stable_diffusion_upscale(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - - image = output.images - - generator = 
torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - expected_height_width = low_res_image.size[0] * 4 - assert image.shape == (1, expected_height_width, expected_height_width, 3) - expected_slice = np.array([0.2562, 0.3606, 0.4204, 0.4469, 0.4822, 0.4647, 0.5315, 0.5748, 0.5606]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_upscale_batch(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - output = sd_pipe( - 2 * [prompt], - image=2 * [low_res_image], - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - num_images_per_prompt=2, - guidance_scale=6.0, - noise_level=20, - num_inference_steps=2, - output_type="np", - ) - image = output.images - assert image.shape[0] == 2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_upscale_fp16(self): - """Test that stable diffusion upscale works with fp16""" - unet = self.dummy_cond_unet_upscale - low_res_scheduler = DDPMScheduler() - scheduler = DDIMScheduler(prediction_type="v_prediction") - vae = self.dummy_vae - text_encoder = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0] - low_res_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((64, 64)) - - # put models in fp16, except vae as it overflows in fp16 - unet = unet.half() - text_encoder = text_encoder.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionUpscalePipeline( - unet=unet, - low_res_scheduler=low_res_scheduler, - scheduler=scheduler, - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - max_noise_level=350, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = sd_pipe( - [prompt], - image=low_res_image, - generator=generator, - num_inference_steps=2, - output_type="np", - ).images - - expected_height_width = low_res_image.size[0] * 4 - assert 
image.shape == (1, expected_height_width, expected_height_width, 3) - - -@slow -@require_torch_gpu -class StableDiffusionUpscalePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_upscale_pipeline(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale" - "/upsampled_cat.npy" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained(model_id) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 1e-3 - - def test_stable_diffusion_upscale_pipeline_fp16(self): - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale" - "/upsampled_cat_fp16.npy" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - output = pipe( - prompt=prompt, - image=image, - generator=generator, - output_type="np", - ) - image = output.images[0] - - assert image.shape == (512, 512, 3) - assert np.abs(expected_image - image).max() < 5e-1 - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/sd2-upscale/low_res_cat.png" - ) - - model_id = "stabilityai/stable-diffusion-x4-upscaler" - pipe = StableDiffusionUpscalePipeline.from_pretrained( - model_id, - torch_dtype=torch.float16, - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing(1) - pipe.enable_sequential_cpu_offload() - - prompt = "a cat sitting on a park bench" - - generator = torch.manual_seed(0) - _ = pipe( - prompt=prompt, - image=image, - generator=generator, - num_inference_steps=5, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.9 GB is allocated - assert mem_bytes < 2.9 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py deleted file mode 100644 index 8aab5845741c638d2d93a28f1a23616086adbddb..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ /dev/null @@ -1,481 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import time -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - EulerDiscreteScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - # SD2-specific config below - attention_head_dim=(2, 4), - use_linear_projection=True, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - # SD2-specific config below - hidden_act="gelu", - projection_dim=64, - ) - return CLIPTextModel(config) - - def test_stable_diffusion_v_pred_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = 
torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.6424, 0.6109, 0.494, 0.5088, 0.4984, 0.4525, 0.5059, 0.5068, 0.4474]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_k_euler(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = EulerDiscreteScheduler( - beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="v_prediction" - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4616, 0.5184, 0.4887, 0.5111, 0.4839, 0.48, 0.5119, 0.5263, 0.4776]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_v_pred_fp16(self): - """Test that stable diffusion v-prediction works with fp16""" - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - prediction_type="v_prediction", - ) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=None, - requires_safety_checker=False, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@slow -@require_torch_gpu -class 
StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_diffusion_v_pred_default(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.1868, 0.1922, 0.1527, 0.1921, 0.1908, 0.1624, 0.1779, 0.1652, 0.1734]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_upcast_attention(self): - sd_pipe = StableDiffusionPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16 - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=20, output_type="np") - - image = output.images - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.4209, 0.4087, 0.4097, 0.4209, 0.3860, 0.4329, 0.4280, 0.4324, 0.4187]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2 - - def test_stable_diffusion_v_pred_euler(self): - scheduler = EulerDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-2", subfolder="scheduler") - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - - output = sd_pipe([prompt], generator=generator, num_inference_steps=5, output_type="numpy") - image = output.images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.1781, 0.1695, 0.1661, 0.1705, 0.1588, 0.1699, 0.2005, 0.1589, 0.1677]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_v_pred_dpm(self): - """ - TODO: update this test after making DPM compatible with V-prediction! 
- """ - scheduler = DPMSolverMultistepScheduler.from_pretrained( - "stabilityai/stable-diffusion-2", subfolder="scheduler" - ) - sd_pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", scheduler=scheduler) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.enable_attention_slicing() - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "a photograph of an astronaut riding a horse" - generator = torch.manual_seed(0) - image = sd_pipe( - [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=5, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - assert image.shape == (1, 768, 768, 3) - expected_slice = np.array([0.3303, 0.3184, 0.3291, 0.3300, 0.3256, 0.3113, 0.2965, 0.3134, 0.3192]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_attention_slicing_v_pred(self): - torch.cuda.reset_peak_memory_stats() - model_id = "stabilityai/stable-diffusion-2" - pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "a photograph of an astronaut riding a horse" - - # make attention efficient - pipe.enable_attention_slicing() - generator = torch.manual_seed(0) - output_chunked = pipe( - [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy" - ) - image_chunked = output_chunked.images - - mem_bytes = torch.cuda.max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - # make sure that less than 5.5 GB is allocated - assert mem_bytes < 5.5 * 10**9 - - # disable slicing - pipe.disable_attention_slicing() - generator = torch.manual_seed(0) - output = pipe([prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy") - image = output.images - - # make sure that more than 5.5 GB is allocated - mem_bytes = torch.cuda.max_memory_allocated() - assert mem_bytes > 5.5 * 10**9 - assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3 - - def test_stable_diffusion_text2img_pipeline_v_pred_default(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "sd2-text2img/astronaut_riding_a_horse_v_pred.npy" - ) - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2") - pipe.to(torch_device) - pipe.enable_attention_slicing() - pipe.set_progress_bar_config(disable=None) - - prompt = "astronaut riding a horse" - - generator = torch.manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (768, 768, 3) - assert np.abs(expected_image - image).max() < 7.5e-2 - - def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" - "sd2-text2img/astronaut_riding_a_horse_v_pred_fp16.npy" - ) - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "astronaut riding a horse" - - generator = torch.manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (768, 768, 3) - assert np.abs(expected_image - image).max() < 7.5e-1 - - def 
test_stable_diffusion_text2img_intermediate_state_v_pred(self): - number_of_steps = 0 - - def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None: - test_callback_fn.has_been_called = True - nonlocal number_of_steps - number_of_steps += 1 - if step == 0: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([0.7749, 0.0325, 0.5088, 0.1619, 0.3372, 0.3667, -0.5186, 0.6860, 1.4326]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - elif step == 19: - latents = latents.detach().cpu().numpy() - assert latents.shape == (1, 4, 96, 96) - latents_slice = latents[0, -3:, -3:, -1] - expected_slice = np.array([1.3887, 1.0273, 1.7266, 0.0726, 0.6611, 0.1598, -1.0547, 0.1522, 0.0227]) - - assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2 - - test_callback_fn.has_been_called = False - - pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - - prompt = "Andromeda galaxy in a bottle" - - generator = torch.manual_seed(0) - pipe( - prompt=prompt, - num_inference_steps=20, - guidance_scale=7.5, - generator=generator, - callback=test_callback_fn, - callback_steps=1, - ) - assert test_callback_fn.has_been_called - assert number_of_steps == 20 - - def test_stable_diffusion_low_cpu_mem_usage_v_pred(self): - pipeline_id = "stabilityai/stable-diffusion-2" - - start_time = time.time() - pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline_low_cpu_mem_usage.to(torch_device) - low_cpu_mem_usage_time = time.time() - start_time - - start_time = time.time() - _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False) - normal_load_time = time.time() - start_time - - assert 2 * low_cpu_mem_usage_time < normal_load_time - - def test_stable_diffusion_pipeline_with_sequential_cpu_offloading_v_pred(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipeline_id = "stabilityai/stable-diffusion-2" - prompt = "Andromeda galaxy in a bottle" - - pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.enable_attention_slicing(1) - pipeline.enable_sequential_cpu_offload() - - generator = torch.manual_seed(0) - _ = pipeline(prompt, generator=generator, num_inference_steps=5) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 2.8 GB is allocated - assert mem_bytes < 2.8 * 10**9 diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py b/diffusers/tests/pipelines/stable_diffusion_safe/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py deleted file mode 100644 index 2f393a66d166ef80328af8fbb077013e09b1408d..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ /dev/null @@ -1,439 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import tempfile -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel -from diffusers.pipelines.stable_diffusion_safe import StableDiffusionPipelineSafe as StableDiffusionPipeline -from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SafeDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def dummy_image(self): - batch_size = 1 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device) - return image - - @property - def dummy_cond_unet(self): - torch.manual_seed(0) - model = UNet2DConditionModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), - up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), - cross_attention_dim=32, - ) - return model - - @property - def dummy_vae(self): - torch.manual_seed(0) - model = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - ) - return model - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_extractor(self): - def extract(*args, **kwargs): - class Out: - def __init__(self): - self.pixel_values = torch.ones([0]) - - def to(self, device): - self.pixel_values.to(device) - return self - - return Out() - - return extract - - def test_safe_diffusion_ddim(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = 
"A painting of a squirrel eating a burger" - - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_pndm(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - generator = torch.Generator(device=device).manual_seed(0) - output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np") - - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = sd_pipe( - [prompt], - generator=generator, - guidance_scale=6.0, - num_inference_steps=2, - output_type="np", - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_stable_diffusion_no_safety_checker(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None - ) - assert isinstance(pipe, StableDiffusionPipeline) - assert isinstance(pipe.scheduler, LMSDiscreteScheduler) - assert pipe.safety_checker is None - - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - # check that there's no error when saving a pipeline with one of the models being None - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = StableDiffusionPipeline.from_pretrained(tmpdirname) - - # sanity check that the pipeline still works - assert pipe.safety_checker is None - image = pipe("example prompt", num_inference_steps=2).images[0] - assert image is not None - - @unittest.skipIf(torch_device != "cuda", "This test requires a GPU") - def test_stable_diffusion_fp16(self): - """Test that stable diffusion works with fp16""" - unet = self.dummy_cond_unet - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = 
CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - # put models in fp16 - unet = unet.half() - vae = vae.half() - bert = bert.half() - - # make sure here that pndm scheduler skips prk - sd_pipe = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger" - image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images - - assert image.shape == (1, 64, 64, 3) - - -@nightly -@require_torch_gpu -class SafeDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_harm_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = ( - "portrait of girl with smokey eyes makeup in abandoned hotel, grange clothes, redshift, wide high angle" - " coloured polaroid photograph with flash, kodak film, hyper real, stunning moody cinematography, with" - " anamorphic lenses, by maripol, fallen angels by wong kar - wai, style of suspiria and neon demon and" - " children from bahnhof zoo, detailed " - ) - seed = 4003660346 - guidance_scale = 7 - - # without safety guidance (sld_guidance_scale = 0) - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2278, 0.2231, 0.2249, 0.2333, 0.2303, 0.1885, 0.2273, 0.2144, 0.2176] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - # without safety guidance (strong configuration) - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.2383, 0.2276, 0.236, 0.2192, 0.2186, 0.2053, 0.1971, 0.1901, 0.1719] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_nudity_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) - sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = "padme amidala taking a bath artwork, safe for work, no nudity" - seed = 2734971755 - guidance_scale = 7 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, 
-1] - expected_slice = [0.3502, 0.3622, 0.3396, 0.3642, 0.3478, 0.3318, 0.35, 0.3348, 0.3297] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = [0.5531, 0.5206, 0.4895, 0.5156, 0.5182, 0.4751, 0.4802, 0.4803, 0.4443] - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_nudity_safetychecker_safe_stable_diffusion(self): - sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - sd_pipe = sd_pipe.to(torch_device) - sd_pipe.set_progress_bar_config(disable=None) - - prompt = ( - "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c." - " leyendecker" - ) - seed = 1044355234 - guidance_scale = 12 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=0, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) - - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-7 - - generator = torch.manual_seed(seed) - output = sd_pipe( - [prompt], - generator=generator, - guidance_scale=guidance_scale, - num_inference_steps=50, - output_type="np", - width=512, - height=512, - sld_guidance_scale=2000, - sld_warmup_steps=7, - sld_threshold=0.025, - sld_momentum_scale=0.5, - sld_mom_beta=0.7, - ) - - image = output.images - image_slice = image[0, -3:, -3:, -1] - expected_slice = np.array([0.5818, 0.6285, 0.6835, 0.6019, 0.625, 0.6754, 0.6096, 0.6334, 0.6561]) - assert image.shape == (1, 512, 512, 3) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/stable_unclip/__init__.py b/diffusers/tests/pipelines/stable_unclip/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py deleted file mode 100644 index 368ab21f24a91df7ff17ae8bf69a1acdfa949fab..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip.py +++ /dev/null @@ -1,229 +0,0 @@ -import gc -import unittest - -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DDPMScheduler, - PriorTransformer, - StableUnCLIPPipeline, - UNet2DConditionModel, -) -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, 
assert_mean_pixel_difference - - -class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - - # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false - test_xformers_attention = False - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = embedder_hidden_size - - # prior components - - torch.manual_seed(0) - prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - prior_text_encoder = CLIPTextModelWithProjection( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=embedder_projection_dim, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - - torch.manual_seed(0) - prior = PriorTransformer( - num_attention_heads=2, - attention_head_dim=12, - embedding_dim=embedder_projection_dim, - num_layers=1, - ) - - torch.manual_seed(0) - prior_scheduler = DDPMScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample=True, - clip_sample_range=5.0, - beta_schedule="squaredcos_cap_v2", - ) - - # regular denoising components - - torch.manual_seed(0) - image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size) - image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2") - - torch.manual_seed(0) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - torch.manual_seed(0) - text_encoder = CLIPTextModel( - CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=embedder_hidden_size, - projection_dim=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - ) - - torch.manual_seed(0) - unet = UNet2DConditionModel( - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"), - up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"), - block_out_channels=(32, 64), - attention_head_dim=(2, 4), - class_embed_type="projection", - # The class embeddings are the noise augmented image embeddings. - # I.e. 
the image embeddings concatenated with the noised embeddings of the same dimension
-            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
-            cross_attention_dim=embedder_hidden_size,
-            layers_per_block=1,
-            upcast_attention=True,
-            use_linear_projection=True,
-        )
-
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_schedule="scaled_linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            prediction_type="v_prediction",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKL()
-
-        components = {
-            # prior components
-            "prior_tokenizer": prior_tokenizer,
-            "prior_text_encoder": prior_text_encoder,
-            "prior": prior,
-            "prior_scheduler": prior_scheduler,
-            # image noising components
-            "image_normalizer": image_normalizer,
-            "image_noising_scheduler": image_noising_scheduler,
-            # regular denoising components
-            "tokenizer": tokenizer,
-            "text_encoder": text_encoder,
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "prior_num_inference_steps": 2,
-            "output_type": "numpy",
-        }
-        return inputs
-
-    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
-    # because UnCLIP GPU undeterminism requires a looser check.
-    def test_attention_slicing_forward_pass(self):
-        test_max_difference = torch_device == "cpu"
-
-        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)
-
-    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
-    # because UnCLIP undeterminism requires a looser check.
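    # (A "looser check" in these overrides means dropping the strict element-wise
    # max-difference assertion in favor of an average-error comparison, in the
    # spirit of the assert_mean_pixel_difference helper imported above. As a
    # rough sketch, assuming two images as float arrays on a 0-255 scale and a
    # tolerance of 10, which may differ from the helper's actual default:
    #
    #     avg_diff = np.abs(image.astype(np.float32) - expected.astype(np.float32)).mean()
    #     assert avg_diff < 10, f"images deviate by {avg_diff} on average"
    #
    # Averaging over all pixels absorbs the scattered outlier values that GPU
    # non-determinism produces while still catching genuinely wrong outputs.)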
- def test_inference_batch_single_identical(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - -@slow -@require_torch_gpu -class StableUnCLIPPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_unclip(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe("anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - "anime turtle", - prior_num_inference_steps=2, - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py deleted file mode 100644 index f93fa3a59014498238591dbf158c09d319d5ad60..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ /dev/null @@ -1,282 +0,0 @@ -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableUnCLIPImg2ImgPipeline, UNet2DConditionModel -from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import floats_tensor, load_image, load_numpy, require_torch_gpu, slow, torch_device - -from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import ( - PipelineTesterMixin, - assert_mean_pixel_difference, -) - - -class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = StableUnCLIPImg2ImgPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - - def get_dummy_components(self): - embedder_hidden_size = 32 - embedder_projection_dim = 
embedder_hidden_size
-
-        # image encoding components
-
-        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
-
-        image_encoder = CLIPVisionModelWithProjection(
-            CLIPVisionConfig(
-                hidden_size=embedder_hidden_size,
-                projection_dim=embedder_projection_dim,
-                num_hidden_layers=5,
-                num_attention_heads=4,
-                image_size=32,
-                intermediate_size=37,
-                patch_size=1,
-            )
-        )
-
-        # regular denoising components
-
-        torch.manual_seed(0)
-        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
-        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")
-
-        torch.manual_seed(0)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        torch.manual_seed(0)
-        text_encoder = CLIPTextModel(
-            CLIPTextConfig(
-                bos_token_id=0,
-                eos_token_id=2,
-                hidden_size=embedder_hidden_size,
-                projection_dim=32,
-                intermediate_size=37,
-                layer_norm_eps=1e-05,
-                num_attention_heads=4,
-                num_hidden_layers=5,
-                pad_token_id=1,
-                vocab_size=1000,
-            )
-        )
-
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            sample_size=32,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
-            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
-            block_out_channels=(32, 64),
-            attention_head_dim=(2, 4),
-            class_embed_type="projection",
-            # The class embeddings are the noise augmented image embeddings.
-            # I.e. the image embeddings concatenated with the noised embeddings of the same dimension
-            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
-            cross_attention_dim=embedder_hidden_size,
-            layers_per_block=1,
-            upcast_attention=True,
-            use_linear_projection=True,
-        )
-
-        torch.manual_seed(0)
-        scheduler = DDIMScheduler(
-            beta_schedule="scaled_linear",
-            beta_start=0.00085,
-            beta_end=0.012,
-            prediction_type="v_prediction",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKL()
-
-        components = {
-            # image encoding components
-            "feature_extractor": feature_extractor,
-            "image_encoder": image_encoder,
-            # image noising components
-            "image_normalizer": image_normalizer,
-            "image_noising_scheduler": image_noising_scheduler,
-            # regular denoising components
-            "tokenizer": tokenizer,
-            "text_encoder": text_encoder,
-            "unet": unet,
-            "scheduler": scheduler,
-            "vae": vae,
-        }
-
-        return components
-
-    def get_dummy_inputs(self, device, seed=0, pil_image=True):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-
-        if pil_image:
-            input_image = input_image * 0.5 + 0.5
-            input_image = input_image.clamp(0, 1)
-            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
-            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
-
-        return {
-            "prompt": "An anime racoon running a marathon",
-            "image": input_image,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "output_type": "np",
-        }
-
-    def test_image_embeds_none(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        sd_pipe = StableUnCLIPImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs.update({"image_embeds": None})
-        image = sd_pipe(**inputs).images
-        image_slice = image[0, -3:, -3:, -1]
-
-        assert image.shape == (1, 32, 32, 3)
-
expected_slice = np.array( - [0.34588397, 0.7747054, 0.5453714, 0.5227859, 0.57656777, 0.6532228, 0.5177634, 0.49932978, 0.56626225] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because GPU undeterminism requires a looser check. - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because undeterminism requires a looser check. - def test_inference_batch_single_identical(self): - test_max_difference = torch_device in ["cpu", "mps"] - - self._test_inference_batch_single_identical(test_max_difference=test_max_difference) - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False) - - -@slow -@require_torch_gpu -class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_stable_unclip_l_img2img(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_img2img_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe(input_image, "anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_h_img2img(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_h_img2img_anime_turtle_fp16.npy" - ) - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - # stable unclip will oom when integration tests are run on a V100, - # so turn on memory savings - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipe(input_image, "anime turle", generator=generator, output_type="np") - - image = output.images[0] - - assert image.shape == (768, 768, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_stable_unclip_img2img_pipeline_with_sequential_cpu_offloading(self): - input_image = load_image( - 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png" - ) - - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( - "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16 - ) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - input_image, - "anime turtle", - num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/test_pipeline_utils.py b/diffusers/tests/pipelines/test_pipeline_utils.py deleted file mode 100644 index 51d987d8bb1151862f910822eb2c173ce4ff313c..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/test_pipeline_utils.py +++ /dev/null @@ -1,134 +0,0 @@ -import unittest - -from diffusers.pipelines.pipeline_utils import is_safetensors_compatible - - -class IsSafetensorsCompatibleTests(unittest.TestCase): - def test_all_is_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_compatible(self): - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_diffusers_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - "unet/diffusion_pytorch_model.bin", - # Removed: 'unet/diffusion_pytorch_model.safetensors', - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_compatible(self): - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - self.assertTrue(is_safetensors_compatible(filenames)) - - def test_transformer_model_is_not_compatible(self): - filenames = [ - "safety_checker/pytorch_model.bin", - "safety_checker/model.safetensors", - "vae/diffusion_pytorch_model.bin", - "vae/diffusion_pytorch_model.safetensors", - "text_encoder/pytorch_model.bin", - # Removed: 'text_encoder/model.safetensors', - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - self.assertFalse(is_safetensors_compatible(filenames)) - - def test_all_is_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant(self): - filenames = [ - 
"unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "unet/diffusion_pytorch_model.bin", - "unet/diffusion_pytorch_model.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_diffusers_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - "unet/diffusion_pytorch_model.fp16.bin", - # Removed: 'unet/diffusion_pytorch_model.fp16.safetensors', - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant(self): - filenames = [ - "text_encoder/pytorch_model.fp16.bin", - "text_encoder/model.fp16.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_compatible_variant_partial(self): - # pass variant but use the non-variant filenames - filenames = [ - "text_encoder/pytorch_model.bin", - "text_encoder/model.safetensors", - ] - variant = "fp16" - self.assertTrue(is_safetensors_compatible(filenames, variant=variant)) - - def test_transformer_model_is_not_compatible_variant(self): - filenames = [ - "safety_checker/pytorch_model.fp16.bin", - "safety_checker/model.fp16.safetensors", - "vae/diffusion_pytorch_model.fp16.bin", - "vae/diffusion_pytorch_model.fp16.safetensors", - "text_encoder/pytorch_model.fp16.bin", - # 'text_encoder/model.fp16.safetensors', - "unet/diffusion_pytorch_model.fp16.bin", - "unet/diffusion_pytorch_model.fp16.safetensors", - ] - variant = "fp16" - self.assertFalse(is_safetensors_compatible(filenames, variant=variant)) diff --git a/diffusers/tests/pipelines/text_to_video/__init__.py b/diffusers/tests/pipelines/text_to_video/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/text_to_video/test_text_to_video.py b/diffusers/tests/pipelines/text_to_video/test_text_to_video.py deleted file mode 100644 index e4331fda02ff6511a4b0d5cb7a49c1212129bbe2..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/text_to_video/test_text_to_video.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMScheduler, - DPMSolverMultistepScheduler, - TextToVideoSDPipeline, - UNet3DConditionModel, -) -from diffusers.utils import load_numpy, skip_mps, slow - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@skip_mps -class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = TextToVideoSDPipeline - params = TEXT_TO_IMAGE_PARAMS - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - # No `output_type`. - required_optional_params = frozenset( - [ - "num_inference_steps", - "generator", - "latents", - "return_dict", - "callback", - "callback_steps", - ] - ) - - def get_dummy_components(self): - torch.manual_seed(0) - unet = UNet3DConditionModel( - block_out_channels=(32, 64, 64, 64), - layers_per_block=2, - sample_size=32, - in_channels=4, - out_channels=4, - down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"), - up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), - cross_attention_dim=32, - attention_head_dim=4, - ) - scheduler = DDIMScheduler( - beta_start=0.00085, - beta_end=0.012, - beta_schedule="scaled_linear", - clip_sample=False, - set_alpha_to_one=False, - ) - torch.manual_seed(0) - vae = AutoencoderKL( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=4, - sample_size=128, - ) - torch.manual_seed(0) - text_encoder_config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=32, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - hidden_act="gelu", - projection_dim=512, - ) - text_encoder = CLIPTextModel(text_encoder_config) - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - components = { - "unet": unet, - "scheduler": scheduler, - "vae": vae, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - } - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "A painting of a squirrel eating a burger", - "generator": generator, - "num_inference_steps": 2, - "guidance_scale": 6.0, - "output_type": "pt", - } - return inputs - - def test_text_to_video_default_case(self): - device = "cpu" # ensure determinism for the device-dependent torch.Generator - components = self.get_dummy_components() - sd_pipe = TextToVideoSDPipeline(**components) - sd_pipe = sd_pipe.to(device) - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(device) - inputs["output_type"] = "np" - frames = sd_pipe(**inputs).frames - image_slice = frames[0][-3:, -3:, -1] - - assert frames[0].shape == (64, 64, 3) - expected_slice = np.array([166, 184, 167, 118, 102, 123, 108, 93, 114]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False) - - # (todo): 
sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_consistent(self): - pass - - # (todo): sayakpaul - @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") - def test_inference_batch_single_identical(self): - pass - - @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") - def test_num_images_per_prompt(self): - pass - - def test_progress_bar(self): - return super().test_progress_bar() - - -@slow -@skip_mps -class TextToVideoSDPipelineSlowTests(unittest.TestCase): - def test_full_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy" - ) - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) - pipe = pipe.to("cuda") - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pt").frames - video = video_frames.cpu().numpy() - - assert np.abs(expected_video - video).mean() < 5e-2 - - def test_two_step_model(self): - expected_video = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy" - ) - - pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b") - pipe = pipe.to("cuda") - - prompt = "Spiderman is surfing" - generator = torch.Generator(device="cpu").manual_seed(0) - - video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames - video = video_frames.cpu().numpy() - - assert np.abs(expected_video - video).mean() < 5e-2 diff --git a/diffusers/tests/pipelines/unclip/__init__.py b/diffusers/tests/pipelines/unclip/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/unclip/test_unclip.py b/diffusers/tests/pipelines/unclip/test_unclip.py deleted file mode 100644 index c36fb02b190f271d57eca0c54a94a19acad0faf3..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/unclip/test_unclip.py +++ /dev/null @@ -1,498 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
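The two slow text-to-video tests above reduce a whole generated clip to a single scalar, the mean absolute difference against a recorded .npy reference, with a 5e-2 tolerance. Pulled out as a standalone helper, the comparison looks roughly like this (a sketch assuming float frames scaled to [0, 1]; the name and signature are illustrative, not part of the test utilities):

    import numpy as np

    def assert_video_close(video, expected_video, max_mean_abs_diff=5e-2):
        # both inputs are (frames, height, width, channels) arrays
        video = np.asarray(video, dtype=np.float32)
        expected_video = np.asarray(expected_video, dtype=np.float32)
        assert video.shape == expected_video.shape, f"shape mismatch: {video.shape} vs {expected_video.shape}"
        mean_diff = np.abs(expected_video - video).mean()
        assert mean_diff < max_mean_abs_diff, f"mean abs diff {mean_diff} exceeds {max_mean_abs_diff}"

Averaging over every pixel of every frame tolerates per-frame scheduler and GPU noise while still failing on a clip that actually diverges from the reference.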
- -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer - -from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps - -from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPPipeline - params = TEXT_TO_IMAGE_PARAMS - { - "negative_prompt", - "height", - "width", - "negative_prompt_embeds", - "guidance_scale", - "prompt_embeds", - "cross_attention_kwargs", - } - batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - required_optional_params = [ - "generator", - "return_dict", - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - test_xformers_attention = False - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_prior(self): - torch.manual_seed(0) - - model_kwargs = { - "num_attention_heads": 2, - "attention_head_dim": 12, - "embedding_dim": self.text_embedder_hidden_size, - "num_layers": 1, - } - - model = PriorTransformer(**model_kwargs) - return model - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": 
("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - prior = self.dummy_prior - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - prior_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="sample", - num_train_timesteps=1000, - clip_sample_range=5.0, - ) - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - components = { - "prior": prior, - "decoder": decoder, - "text_proj": text_proj, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "prior_scheduler": prior_scheduler, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - return components - - def get_dummy_inputs(self, device, seed=0): - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - inputs = { - "prompt": "horse", - "generator": generator, - "prior_num_inference_steps": 2, - "decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "numpy", - } - return inputs - - def test_unclip(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(device)) - image = output.images - - image_from_tuple = pipe( - **self.get_dummy_inputs(device), - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9988, - 0.0028, - 0.9997, - 0.9984, - 0.9965, - 0.0029, - 0.9986, - 0.0025, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_passed_text_embed(self): - device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - prior = components["prior"] - decoder = components["decoder"] - super_res_first = components["super_res_first"] - tokenizer = components["tokenizer"] - text_encoder = components["text_encoder"] - - generator = torch.Generator(device=device).manual_seed(0) - dtype = prior.dtype - batch_size = 1 - - shape = (batch_size, prior.config.embedding_dim) - prior_latents = 
pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - shape = (batch_size, decoder.in_channels, decoder.sample_size, decoder.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - super_res_first.in_channels // 2, - super_res_first.sample_size, - super_res_first.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipe.set_progress_bar_config(disable=None) - - prompt = "this is a prompt example" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe( - [prompt], - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - output_type="np", - ) - image = output.images - - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - return_tensors="pt", - ) - text_model_output = text_encoder(text_inputs.input_ids) - text_attention_mask = text_inputs.attention_mask - - generator = torch.Generator(device=device).manual_seed(0) - image_from_text = pipe( - generator=generator, - prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - prior_latents=prior_latents, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - text_model_output=text_model_output, - text_attention_mask=text_attention_mask, - output_type="np", - )[0] - - # make sure passing text embeddings manually is identical - assert np.abs(image - image_from_text).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU non-determinism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP non-determinism requires a looser check.
- @skip_mps - def test_inference_batch_single_identical(self): - test_max_difference = torch_device == "cpu" - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "prior_num_inference_steps", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - -@nightly -class UnCLIPPipelineCPUIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_karlo_cpu_fp32(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_cpu.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha") - pipeline.set_progress_bar_config(disable=None) - - generator = torch.manual_seed(0) - output = pipeline( - "horse", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 1e-1 - - -@slow -@require_torch_gpu -class UnCLIPPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_karlo(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/unclip/karlo_v1_alpha_horse_fp16.npy" - ) - - pipeline = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - "horse", - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image) - - def test_unclip_pipeline_with_sequential_cpu_offloading(self): - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - pipe.enable_attention_slicing() - pipe.enable_sequential_cpu_offload() - - _ = pipe( - "horse", - num_images_per_prompt=1, - 
prior_num_inference_steps=2, - decoder_num_inference_steps=2, - super_res_num_inference_steps=2, - output_type="np", - ) - - mem_bytes = torch.cuda.max_memory_allocated() - # make sure that less than 7 GB is allocated - assert mem_bytes < 7 * 10**9 diff --git a/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py b/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py deleted file mode 100644 index ff32ac5f9aafb9140ec5b49dc79711d493b76788..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/unclip/test_unclip_image_variation.py +++ /dev/null @@ -1,508 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import random -import unittest - -import numpy as np -import torch -from transformers import ( - CLIPImageProcessor, - CLIPTextConfig, - CLIPTextModelWithProjection, - CLIPTokenizer, - CLIPVisionConfig, - CLIPVisionModelWithProjection, -) - -from diffusers import ( - DiffusionPipeline, - UnCLIPImageVariationPipeline, - UnCLIPScheduler, - UNet2DConditionModel, - UNet2DModel, -) -from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel -from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps - -from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference - - -class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase): - pipeline_class = UnCLIPImageVariationPipeline - params = IMAGE_VARIATION_PARAMS - {"height", "width", "guidance_scale"} - batch_params = IMAGE_VARIATION_BATCH_PARAMS - - required_optional_params = [ - "generator", - "return_dict", - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def time_input_dim(self): - return 32 - - @property - def block_out_channels_0(self): - return self.time_input_dim - - @property - def time_embed_dim(self): - return self.time_input_dim * 4 - - @property - def cross_attention_dim(self): - return 100 - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModelWithProjection(config) - - @property - def dummy_image_encoder(self): - torch.manual_seed(0) - config = CLIPVisionConfig( - hidden_size=self.text_embedder_hidden_size, - projection_dim=self.text_embedder_hidden_size, - num_hidden_layers=5, - num_attention_heads=4, - image_size=32, - 
intermediate_size=37, - patch_size=1, - ) - return CLIPVisionModelWithProjection(config) - - @property - def dummy_text_proj(self): - torch.manual_seed(0) - - model_kwargs = { - "clip_embeddings_dim": self.text_embedder_hidden_size, - "time_embed_dim": self.time_embed_dim, - "cross_attention_dim": self.cross_attention_dim, - } - - model = UnCLIPTextProjModel(**model_kwargs) - return model - - @property - def dummy_decoder(self): - torch.manual_seed(0) - - model_kwargs = { - "sample_size": 32, - # RGB in channels - "in_channels": 3, - # Out channels is double in channels because predicts mean and variance - "out_channels": 6, - "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), - "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), - "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "layers_per_block": 1, - "cross_attention_dim": self.cross_attention_dim, - "attention_head_dim": 4, - "resnet_time_scale_shift": "scale_shift", - "class_embed_type": "identity", - } - - model = UNet2DConditionModel(**model_kwargs) - return model - - @property - def dummy_super_res_kwargs(self): - return { - "sample_size": 64, - "layers_per_block": 1, - "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"), - "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"), - "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), - "in_channels": 6, - "out_channels": 3, - } - - @property - def dummy_super_res_first(self): - torch.manual_seed(0) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - @property - def dummy_super_res_last(self): - # seeded differently to get different unet than `self.dummy_super_res_first` - torch.manual_seed(1) - - model = UNet2DModel(**self.dummy_super_res_kwargs) - return model - - def get_dummy_components(self): - decoder = self.dummy_decoder - text_proj = self.dummy_text_proj - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - super_res_first = self.dummy_super_res_first - super_res_last = self.dummy_super_res_last - - decoder_scheduler = UnCLIPScheduler( - variance_type="learned_range", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - super_res_scheduler = UnCLIPScheduler( - variance_type="fixed_small_log", - prediction_type="epsilon", - num_train_timesteps=1000, - ) - - feature_extractor = CLIPImageProcessor(crop_size=32, size=32) - - image_encoder = self.dummy_image_encoder - - return { - "decoder": decoder, - "text_encoder": text_encoder, - "tokenizer": tokenizer, - "text_proj": text_proj, - "feature_extractor": feature_extractor, - "image_encoder": image_encoder, - "super_res_first": super_res_first, - "super_res_last": super_res_last, - "decoder_scheduler": decoder_scheduler, - "super_res_scheduler": super_res_scheduler, - } - - def get_dummy_inputs(self, device, seed=0, pil_image=True): - input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) - if str(device).startswith("mps"): - generator = torch.manual_seed(seed) - else: - generator = torch.Generator(device=device).manual_seed(seed) - - if pil_image: - input_image = input_image * 0.5 + 0.5 - input_image = input_image.clamp(0, 1) - input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy() - input_image = DiffusionPipeline.numpy_to_pil(input_image)[0] - - return { - "image": input_image, - "generator": generator, - 
"decoder_num_inference_steps": 2, - "super_res_num_inference_steps": 2, - "output_type": "np", - } - - def test_unclip_image_variation_input_tensor(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.0002, - 0.9997, - 0.9997, - 0.9969, - 0.0023, - 0.9997, - 0.9969, - 0.9970, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_image(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 64, 64, 3) - - expected_slice = np.array([0.9997, 0.0003, 0.9997, 0.9997, 0.9970, 0.0024, 0.9997, 0.9971, 0.9971]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_image_variation_input_list_images(self): - device = "cpu" - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - pipeline_inputs["image"] = [ - pipeline_inputs["image"], - pipeline_inputs["image"], - ] - - output = pipe(**pipeline_inputs) - image = output.images - - tuple_pipeline_inputs = self.get_dummy_inputs(device, pil_image=True) - tuple_pipeline_inputs["image"] = [ - tuple_pipeline_inputs["image"], - tuple_pipeline_inputs["image"], - ] - - image_from_tuple = pipe( - **tuple_pipeline_inputs, - return_dict=False, - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (2, 64, 64, 3) - - expected_slice = np.array( - [ - 0.9997, - 0.9989, - 0.0008, - 0.0021, - 0.9960, - 0.0018, - 0.0014, - 0.0002, - 0.9933, - ] - ) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_unclip_passed_image_embed(self): - device = torch.device("cpu") - - class DummyScheduler: - init_noise_sigma = 1 - - components = self.get_dummy_components() - - pipe = self.pipeline_class(**components) - pipe = pipe.to(device) - - pipe.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=device).manual_seed(0) - dtype = pipe.decoder.dtype - batch_size = 1 - - shape = (batch_size, 
pipe.decoder.in_channels, pipe.decoder.sample_size, pipe.decoder.sample_size) - decoder_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - shape = ( - batch_size, - pipe.super_res_first.in_channels // 2, - pipe.super_res_first.sample_size, - pipe.super_res_first.sample_size, - ) - super_res_latents = pipe.prepare_latents( - shape, dtype=dtype, device=device, generator=generator, latents=None, scheduler=DummyScheduler() - ) - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - - img_out_1 = pipe( - **pipeline_inputs, decoder_latents=decoder_latents, super_res_latents=super_res_latents - ).images - - pipeline_inputs = self.get_dummy_inputs(device, pil_image=False) - # Don't pass image, instead pass embedding - image = pipeline_inputs.pop("image") - image_embeddings = pipe.image_encoder(image).image_embeds - - img_out_2 = pipe( - **pipeline_inputs, - decoder_latents=decoder_latents, - super_res_latents=super_res_latents, - image_embeddings=image_embeddings, - ).images - - # make sure passing image embeddings manually is identical - assert np.abs(img_out_1 - img_out_2).max() < 1e-4 - - # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass - # because UnCLIP GPU non-determinism requires a looser check. - @skip_mps - def test_attention_slicing_forward_pass(self): - test_max_difference = torch_device == "cpu" - - self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference) - - # Overriding PipelineTesterMixin::test_inference_batch_single_identical - # because UnCLIP non-determinism requires a looser check. - @skip_mps - def test_inference_batch_single_identical(self): - test_max_difference = torch_device == "cpu" - relax_max_difference = True - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - self._test_inference_batch_single_identical( - test_max_difference=test_max_difference, - relax_max_difference=relax_max_difference, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - - def test_inference_batch_consistent(self): - additional_params_copy_to_batched_inputs = [ - "decoder_num_inference_steps", - "super_res_num_inference_steps", - ] - - if torch_device == "mps": - # TODO: MPS errors with larger batch sizes - batch_sizes = [2, 3] - self._test_inference_batch_consistent( - batch_sizes=batch_sizes, - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, - ) - else: - self._test_inference_batch_consistent( - additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs - ) - - @skip_mps - def test_dict_tuple_outputs_equivalent(self): - return super().test_dict_tuple_outputs_equivalent() - - @skip_mps - def test_save_load_local(self): - return super().test_save_load_local() - - @skip_mps - def test_save_load_optional_components(self): - return super().test_save_load_optional_components() - - -@slow -@require_torch_gpu -class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_unclip_image_variation_karlo(self): - input_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png" - ) - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" -
"/unclip/karlo_v1_alpha_cat_variation_fp16.npy" - ) - - pipeline = UnCLIPImageVariationPipeline.from_pretrained( - "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16 - ) - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - generator = torch.Generator(device="cpu").manual_seed(0) - output = pipeline( - input_image, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - - assert_mean_pixel_difference(image, expected_image) diff --git a/diffusers/tests/pipelines/versatile_diffusion/__init__.py b/diffusers/tests/pipelines/versatile_diffusion/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py deleted file mode 100644 index 4e2b89982a6aad0fb2f2b7c8735b0e645665359f..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_dual_guided.py +++ /dev/null @@ -1,107 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionDualGuidedPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -@nightly -@require_torch_gpu -class VersatileDiffusionDualGuidedPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained(tmpdirname) - - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt="first prompt", - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided(self): - pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - first_prompt = "cyberpunk 2077" - second_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - prompt=first_prompt, - image=second_prompt, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0787, 0.0849, 0.0826, 0.0812, 0.0807, 0.0795, 0.0818, 0.0798, 0.0779]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py deleted file mode 100644 index b4eabb9e3a0e18dd71a445bb8960b27d8699daac..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionImageVariationPipeline -from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionImageVariationPipelineFastTests(unittest.TestCase): - pass - - -@slow -@require_torch_gpu -class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase): - def test_inference_image_variations(self): - pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion") - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - image_prompt = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe( - image=image_prompt, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0441, 0.0469, 0.0507, 0.0575, 0.0632, 0.0650, 0.0865, 0.0909, 0.0945]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py deleted file mode 100644 index b77c1baf41d5abe4adb17aebb600b80eedda6c39..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py +++ /dev/null @@ -1,129 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
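Unlike the single-task pipelines above, the mega pipeline tested below bundles all three Versatile Diffusion tasks behind named entry points on one object: dual_guided, text_to_image, and image_variation. A condensed usage sketch distilled from the tests that follow (model id, image URL, and argument names are taken from those tests; step counts are shortened for illustration):

import torch
from diffusers import VersatileDiffusionPipeline
from diffusers.utils.testing_utils import load_image

pipe = VersatileDiffusionPipeline.from_pretrained(
    "shi-labs/versatile-diffusion", torch_dtype=torch.float16
).to("cuda")
init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg"
)

# each task is a separate method on the same pipeline object
dual = pipe.dual_guided(
    prompt="cyberpunk 2077", image=init_image, text_to_image_strength=0.75,
    generator=torch.manual_seed(0), num_inference_steps=2, output_type="numpy",
).images
txt = pipe.text_to_image(
    prompt="A painting of a squirrel eating a burger ",
    generator=torch.manual_seed(0), num_inference_steps=2, output_type="numpy",
).images
var = pipe.image_variation(init_image, generator=torch.manual_seed(0), output_type="numpy").images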
- -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionPipeline -from diffusers.utils.testing_utils import load_image, nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionMegaPipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_from_save_pretrained(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe.dual_guided( - prompt="first prompt", - image=prompt_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=2, - output_type="numpy", - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_dual_guided_then_text_to_image(self): - pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "cyberpunk 2077" - init_image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/versatile_diffusion/benz.jpg" - ) - generator = torch.manual_seed(0) - image = pipe.dual_guided( - prompt=prompt, - image=init_image, - text_to_image_strength=0.75, - generator=generator, - guidance_scale=7.5, - num_inference_steps=50, - output_type="numpy", - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.1448, 0.1619, 0.1741, 0.1086, 0.1147, 0.1128, 0.1199, 0.1165, 0.1001]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe.text_to_image( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 - - image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3076, 0.3123, 0.3284, 0.3782, 0.3770, 0.3894, 0.4297, 0.4331, 0.4456]) - - assert np.abs(image_slice.flatten() - 
expected_slice).max() < 1e-1 diff --git a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py b/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py deleted file mode 100644 index 194f660f7055308b41c47c14a35c41f3b2b1014b..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import VersatileDiffusionTextToImagePipeline -from diffusers.utils.testing_utils import nightly, require_torch_gpu, torch_device - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VersatileDiffusionTextToImagePipelineFastTests(unittest.TestCase): - pass - - -@nightly -@require_torch_gpu -class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_remove_unused_weights_save_load(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion") - # remove text_unet - pipe.remove_unused_weights() - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - generator = generator.manual_seed(0) - new_image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy" - ).images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass" - - def test_inference_text2img(self): - pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( - "shi-labs/versatile-diffusion", torch_dtype=torch.float16 - ) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - prompt = "A painting of a squirrel eating a burger " - generator = torch.manual_seed(0) - image = pipe( - prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy" - ).images - - image_slice = image[0, 253:256, 253:256, -1] - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.3367, 0.3169, 0.2656, 0.3870, 0.4790, 0.3796, 0.4009, 0.4878, 0.4778]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/diffusers/tests/pipelines/vq_diffusion/__init__.py b/diffusers/tests/pipelines/vq_diffusion/__init__.py deleted file mode 100644 index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py b/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py deleted file mode 100644 index 6769240db905abc75e2d04af89a1852911868751..0000000000000000000000000000000000000000 --- a/diffusers/tests/pipelines/vq_diffusion/test_vq_diffusion.py +++ /dev/null @@ -1,228 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import gc -import unittest - -import numpy as np -import torch -from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import Transformer2DModel, VQDiffusionPipeline, VQDiffusionScheduler, VQModel -from diffusers.pipelines.vq_diffusion.pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings -from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class VQDiffusionPipelineFastTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - @property - def num_embed(self): - return 12 - - @property - def num_embeds_ada_norm(self): - return 12 - - @property - def text_embedder_hidden_size(self): - return 32 - - @property - def dummy_vqvae(self): - torch.manual_seed(0) - model = VQModel( - block_out_channels=[32, 64], - in_channels=3, - out_channels=3, - down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], - up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], - latent_channels=3, - num_vq_embeddings=self.num_embed, - vq_embed_dim=3, - ) - return model - - @property - def dummy_tokenizer(self): - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - return tokenizer - - @property - def dummy_text_encoder(self): - torch.manual_seed(0) - config = CLIPTextConfig( - bos_token_id=0, - eos_token_id=2, - hidden_size=self.text_embedder_hidden_size, - intermediate_size=37, - layer_norm_eps=1e-05, - num_attention_heads=4, - num_hidden_layers=5, - pad_token_id=1, - vocab_size=1000, - ) - return CLIPTextModel(config) - - @property - def dummy_transformer(self): - torch.manual_seed(0) - - height = 12 - width = 12 - - model_kwargs = { - "attention_bias": True, - "cross_attention_dim": 32, - "attention_head_dim": height * width, - "num_attention_heads": 1, - "num_vector_embeds": self.num_embed, - "num_embeds_ada_norm": self.num_embeds_ada_norm, - "norm_num_groups": 32, - "sample_size": width, - "activation_fn": "geglu-approximate", - } - - model = Transformer2DModel(**model_kwargs) - return model - - def test_vq_diffusion(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = 
LearnedClassifierFreeSamplingEmbeddings(learnable=False) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6583, 0.6410, 0.5325, 0.5635, 0.5563, 0.4234, 0.6008, 0.5491, 0.4880]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - def test_vq_diffusion_classifier_free_sampling(self): - device = "cpu" - - vqvae = self.dummy_vqvae - text_encoder = self.dummy_text_encoder - tokenizer = self.dummy_tokenizer - transformer = self.dummy_transformer - scheduler = VQDiffusionScheduler(self.num_embed) - learned_classifier_free_sampling_embeddings = LearnedClassifierFreeSamplingEmbeddings( - learnable=True, hidden_size=self.text_embedder_hidden_size, length=tokenizer.model_max_length - ) - - pipe = VQDiffusionPipeline( - vqvae=vqvae, - text_encoder=text_encoder, - tokenizer=tokenizer, - transformer=transformer, - scheduler=scheduler, - learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, - ) - pipe = pipe.to(device) - pipe.set_progress_bar_config(disable=None) - - prompt = "teddy bear playing in the pool" - - generator = torch.Generator(device=device).manual_seed(0) - output = pipe([prompt], generator=generator, num_inference_steps=2, output_type="np") - image = output.images - - generator = torch.Generator(device=device).manual_seed(0) - image_from_tuple = pipe( - [prompt], generator=generator, output_type="np", return_dict=False, num_inference_steps=2 - )[0] - - image_slice = image[0, -3:, -3:, -1] - image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] - - assert image.shape == (1, 24, 24, 3) - - expected_slice = np.array([0.6647, 0.6531, 0.5303, 0.5891, 0.5726, 0.4439, 0.6304, 0.5564, 0.4912]) - - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 - - -@slow -@require_torch_gpu -class VQDiffusionPipelineIntegrationTests(unittest.TestCase): - def tearDown(self): - # clean up the VRAM after each test - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_vq_diffusion_classifier_free_sampling(self): - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/vq_diffusion/teddy_bear_pool_classifier_free_sampling.npy" - ) - - pipeline = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq") - pipeline = pipeline.to(torch_device) - pipeline.set_progress_bar_config(disable=None) - - # requires GPU generator for gumbel softmax - # don't use GPU generator in tests though - generator = torch.Generator(device=torch_device).manual_seed(0) - output = 
pipeline( - "teddy bear playing in the pool", - num_images_per_prompt=1, - generator=generator, - output_type="np", - ) - - image = output.images[0] - - assert image.shape == (256, 256, 3) - assert np.abs(expected_image - image).max() < 1e-2 diff --git a/diffusers/tests/repo_utils/test_check_copies.py b/diffusers/tests/repo_utils/test_check_copies.py deleted file mode 100644 index bd0a22da2c3af2bed6f3029e84face108e3cbda3..0000000000000000000000000000000000000000 --- a/diffusers/tests/repo_utils/test_check_copies.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import re -import shutil -import sys -import tempfile -import unittest - -import black - - -git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.join(git_repo_path, "utils")) - -import check_copies # noqa: E402 - - -# This is the reference code that will be used in the tests. -# If DDPMSchedulerOutput is changed in scheduling_ddpm.py, this code needs to be manually updated. -REFERENCE_CODE = """ \""" - Output class for the scheduler's step function output. - - Args: - prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample (x_{0}) based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. 
- \""" - - prev_sample: torch.FloatTensor - pred_original_sample: Optional[torch.FloatTensor] = None -""" - - -class CopyCheckTester(unittest.TestCase): - def setUp(self): - self.diffusers_dir = tempfile.mkdtemp() - os.makedirs(os.path.join(self.diffusers_dir, "schedulers/")) - check_copies.DIFFUSERS_PATH = self.diffusers_dir - shutil.copy( - os.path.join(git_repo_path, "src/diffusers/schedulers/scheduling_ddpm.py"), - os.path.join(self.diffusers_dir, "schedulers/scheduling_ddpm.py"), - ) - - def tearDown(self): - check_copies.DIFFUSERS_PATH = "src/diffusers" - shutil.rmtree(self.diffusers_dir) - - def check_copy_consistency(self, comment, class_name, class_code, overwrite_result=None): - code = comment + f"\nclass {class_name}(nn.Module):\n" + class_code - if overwrite_result is not None: - expected = comment + f"\nclass {class_name}(nn.Module):\n" + overwrite_result - mode = black.Mode(target_versions={black.TargetVersion.PY35}, line_length=119) - code = black.format_str(code, mode=mode) - fname = os.path.join(self.diffusers_dir, "new_code.py") - with open(fname, "w", newline="\n") as f: - f.write(code) - if overwrite_result is None: - self.assertTrue(len(check_copies.is_copy_consistent(fname)) == 0) - else: - check_copies.is_copy_consistent(f.name, overwrite=True) - with open(fname, "r") as f: - self.assertEqual(f.read(), expected) - - def test_find_code_in_diffusers(self): - code = check_copies.find_code_in_diffusers("schedulers.scheduling_ddpm.DDPMSchedulerOutput") - self.assertEqual(code, REFERENCE_CODE) - - def test_is_copy_consistent(self): - # Base copy consistency - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput", - "DDPMSchedulerOutput", - REFERENCE_CODE + "\n", - ) - - # With no empty line at the end - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput", - "DDPMSchedulerOutput", - REFERENCE_CODE, - ) - - # Copy consistency with rename - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test", - "TestSchedulerOutput", - re.sub("DDPM", "Test", REFERENCE_CODE), - ) - - # Copy consistency with a really long name - long_class_name = "TestClassWithAReallyLongNameBecauseSomePeopleLikeThatForSomeReason" - self.check_copy_consistency( - f"# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->{long_class_name}", - f"{long_class_name}SchedulerOutput", - re.sub("DDPM", long_class_name, REFERENCE_CODE), - ) - - # Copy consistency with overwrite - self.check_copy_consistency( - "# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->Test", - "TestSchedulerOutput", - REFERENCE_CODE, - overwrite_result=re.sub("DDPM", "Test", REFERENCE_CODE), - ) diff --git a/diffusers/tests/repo_utils/test_check_dummies.py b/diffusers/tests/repo_utils/test_check_dummies.py deleted file mode 100644 index 52a75d7b02e85f70cb347afb1429ca8beb942d21..0000000000000000000000000000000000000000 --- a/diffusers/tests/repo_utils/test_check_dummies.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import unittest - - -git_repo_path = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) -sys.path.append(os.path.join(git_repo_path, "utils")) - -import check_dummies # noqa: E402 -from check_dummies import create_dummy_files, create_dummy_object, find_backend, read_init # noqa: E402 - - -# Align TRANSFORMERS_PATH in check_dummies with the current path -check_dummies.PATH_TO_DIFFUSERS = os.path.join(git_repo_path, "src", "diffusers") - - -class CheckDummiesTester(unittest.TestCase): - def test_find_backend(self): - simple_backend = find_backend(" if not is_torch_available():") - self.assertEqual(simple_backend, "torch") - - # backend_with_underscore = find_backend(" if not is_tensorflow_text_available():") - # self.assertEqual(backend_with_underscore, "tensorflow_text") - - double_backend = find_backend(" if not (is_torch_available() and is_transformers_available()):") - self.assertEqual(double_backend, "torch_and_transformers") - - # double_backend_with_underscore = find_backend( - # " if not (is_sentencepiece_available() and is_tensorflow_text_available()):" - # ) - # self.assertEqual(double_backend_with_underscore, "sentencepiece_and_tensorflow_text") - - triple_backend = find_backend( - " if not (is_torch_available() and is_transformers_available() and is_onnx_available()):" - ) - self.assertEqual(triple_backend, "torch_and_transformers_and_onnx") - - def test_read_init(self): - objects = read_init() - # We don't assert on the exact list of keys to allow for smooth grow of backend-specific objects - self.assertIn("torch", objects) - self.assertIn("torch_and_transformers", objects) - self.assertIn("flax_and_transformers", objects) - self.assertIn("torch_and_transformers_and_onnx", objects) - - # Likewise, we can't assert on the exact content of a key - self.assertIn("UNet2DModel", objects["torch"]) - self.assertIn("FlaxUNet2DConditionModel", objects["flax"]) - self.assertIn("StableDiffusionPipeline", objects["torch_and_transformers"]) - self.assertIn("FlaxStableDiffusionPipeline", objects["flax_and_transformers"]) - self.assertIn("LMSDiscreteScheduler", objects["torch_and_scipy"]) - self.assertIn("OnnxStableDiffusionPipeline", objects["torch_and_transformers_and_onnx"]) - - def test_create_dummy_object(self): - dummy_constant = create_dummy_object("CONSTANT", "'torch'") - self.assertEqual(dummy_constant, "\nCONSTANT = None\n") - - dummy_function = create_dummy_object("function", "'torch'") - self.assertEqual( - dummy_function, "\ndef function(*args, **kwargs):\n requires_backends(function, 'torch')\n" - ) - - expected_dummy_class = """ -class FakeClass(metaclass=DummyObject): - _backends = 'torch' - - def __init__(self, *args, **kwargs): - requires_backends(self, 'torch') - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, 'torch') - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, 'torch') -""" - dummy_class = create_dummy_object("FakeClass", "'torch'") - self.assertEqual(dummy_class, expected_dummy_class) - - def test_create_dummy_files(self): - 
expected_dummy_pytorch_file = """# This file is autogenerated by the command `make fix-copies`, do not edit. -from ..utils import DummyObject, requires_backends - - -CONSTANT = None - - -def function(*args, **kwargs): - requires_backends(function, ["torch"]) - - -class FakeClass(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - @classmethod - def from_config(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["torch"]) -""" - dummy_files = create_dummy_files({"torch": ["CONSTANT", "function", "FakeClass"]}) - self.assertEqual(dummy_files["torch"], expected_dummy_pytorch_file) diff --git a/diffusers/tests/schedulers/__init__.py b/diffusers/tests/schedulers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/diffusers/tests/schedulers/test_scheduler_ddim.py b/diffusers/tests/schedulers/test_scheduler_ddim.py deleted file mode 100644 index e9c85314d558af74b2ed325df5ed7722e1acd691..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ddim.py +++ /dev/null @@ -1,140 +0,0 @@ -import torch - -from diffusers import DDIMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDIMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDIMScheduler,) - forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps, eta = 10, 0.0 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_timesteps(num_inference_steps) - - for t in scheduler.timesteps: - residual = model(sample, t) - sample = scheduler.step(residual, t, sample, eta).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [100, 500, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(5) - assert torch.equal(scheduler.timesteps, torch.LongTensor([801, 601, 401, 201, 1])) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "v_prediction"]: - 
self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_time_indices(self): - for t in [1, 10, 49]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) - - def test_eta(self): - for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): - self.check_over_forward(time_step=t, eta=eta) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - sample = self.full_loop() - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 172.0067) < 1e-2 - assert abs(result_mean.item() - 0.223967) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 52.5302) < 1e-2 - assert abs(result_mean.item() - 0.0684) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 149.8295) < 1e-2 - assert abs(result_mean.item() - 0.1951) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 149.0784) < 1e-2 - assert abs(result_mean.item() - 0.1941) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_ddpm.py b/diffusers/tests/schedulers/test_scheduler_ddpm.py deleted file mode 100644 index b55a39ee2e79274691f5136b989cbaabb3f00932..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ddpm.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch - -from diffusers import DDPMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class DDPMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DDPMScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "variance_type": "fixed_small", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, 
beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_variance_type(self): - for variance in ["fixed_small", "fixed_large", "other"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 999]: - self.check_over_forward(time_step=t) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 258.9606) < 1e-2 - assert abs(result_mean.item() - 0.3372) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2. 
predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 202.0296) < 1e-2 - assert abs(result_mean.item() - 0.2631) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_deis.py b/diffusers/tests/schedulers/test_scheduler_deis.py deleted file mode 100644 index 8b14601bc98240cca5ea75ae06343be20bc3ca79..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_deis.py +++ /dev/null @@ -1,237 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DEISMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DEISMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) 
- new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_switch(self): - # make sure that iterating over schedulers with the same config names gives the same results - # for the defaults - scheduler = DEISMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["logrho"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon",
"sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="deis", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["deis"]: - for solver_type in ["logrho"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.23916) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.091) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_multi.py b/diffusers/tests/schedulers/test_scheduler_dpm_multi.py deleted file mode 100644 index 295bbe882746793b09b196f054e392e22415d455..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_dpm_multi.py +++ /dev/null @@ -1,245 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - "lower_order_final": False, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, 
residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - 
# copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - def test_full_loop_no_noise_thres(self): - sample = self.full_loop(thresholding=True, dynamic_thresholding_ratio=0.87, sample_max_value=0.5) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.6405) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2251) < 1e-3 - - def test_switch(self): - # make sure that iterating over schedulers with same config names gives same results - # for defaults - scheduler = DPMSolverMultistepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.3301) < 1e-3 - - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 
0.3301) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_dpm_single.py b/diffusers/tests/schedulers/test_scheduler_dpm_single.py deleted file mode 100644 index 9dff04e7c99841f83d9cbbd34dde7ee4525541fe..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_dpm_single.py +++ /dev/null @@ -1,212 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class DPMSolverSinglestepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (DPMSolverSinglestepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "prediction_type": "epsilon", - "thresholding": False, - "sample_max_value": 1.0, - "algorithm_type": "dpmsolver++", - "solver_type": "midpoint", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - 
# copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, scheduler=None, **config): - if scheduler is None: - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_timesteps(self): - for timesteps in [25, 50, 100, 999, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_switch(self): - # make sure that iterating over schedulers with the same config names gives the same results - # for the defaults - scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config()) - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - scheduler = DEISMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) - scheduler = UniPCMultistepScheduler.from_config(scheduler.config) - scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) - - sample = self.full_loop(scheduler=scheduler) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_thresholding(self): - self.check_over_configs(thresholding=False) - for order in [1, 2, 3]: - for solver_type in ["midpoint", "heun"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - algorithm_type="dpmsolver++", - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for algorithm_type in ["dpmsolver", "dpmsolver++"]: - for solver_type in ["midpoint", "heun"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - algorithm_type=algorithm_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" -
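# A minimal, self-contained sketch of the config round-trip that the check_over_* helpers
# above verify: persist a scheduler with save_config, reload it with from_pretrained, and
# confirm both instances take identical steps. The concrete config values and tensor shapes
# below are illustrative assumptions, not taken from the diff.
import tempfile

import torch

from diffusers import DPMSolverSinglestepScheduler


scheduler = DPMSolverSinglestepScheduler(num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02)
with tempfile.TemporaryDirectory() as tmpdirname:
    scheduler.save_config(tmpdirname)  # persists the scheduler config to disk
    reloaded = DPMSolverSinglestepScheduler.from_pretrained(tmpdirname)

scheduler.set_timesteps(10)
reloaded.set_timesteps(10)

sample = torch.ones(4, 3, 8, 8)
residual = 0.1 * sample
t = scheduler.timesteps[0]

# Both instances should take an identical denoising step from identical inputs.
out = scheduler.step(residual, t, sample).prev_sample
new_out = reloaded.step(residual, t, sample).prev_sample
assert torch.sum(torch.abs(out - new_out)) < 1e-5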
- def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2791) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.1453) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_euler.py b/diffusers/tests/schedulers/test_scheduler_euler.py deleted file mode 100644 index 4d521b0075e18710b88ed3efe1f2652bb4718733..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_euler.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch - -from diffusers import EulerDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class EulerDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = 
self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 0.0002) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 10.0807) < 1e-2 - assert abs(result_mean.item() - 0.0131) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py b/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py deleted file mode 100644 index 5fa36be6bc64e5fc6aac72e11e50e455089469cb..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_euler_ancestral.py +++ /dev/null @@ -1,118 +0,0 @@ -import torch - -from diffusers import EulerAncestralDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class EulerAncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (EulerAncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in 
enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 152.3192) < 1e-2 - assert abs(result_mean.item() - 0.1983) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 108.4439) < 1e-2 - assert abs(result_mean.item() - 0.1412) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 152.3192) < 1e-2 - assert abs(result_mean.item() - 0.1983) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_flax.py b/diffusers/tests/schedulers/test_scheduler_flax.py deleted file mode 100644 index 8f7ad59d285eb50a42ab5809ce60dd0bf26e026c..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_flax.py +++ /dev/null @@ -1,919 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
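# Unlike the stateful PyTorch schedulers above, the Flax schedulers tested below are
# functional: mutable quantities live in an explicit state object that create_state builds
# and that set_timesteps/step thread through. A minimal sketch of that calling convention;
# the dummy model and shapes are assumptions mirroring the common-test helpers, not part of
# the diff.
import jax.numpy as jnp

from diffusers import FlaxDDIMScheduler


scheduler = FlaxDDIMScheduler(num_train_timesteps=1000, beta_start=0.0001, beta_end=0.02)
state = scheduler.create_state()
state = scheduler.set_timesteps(state, 10)  # returns a new state; nothing is mutated

sample = jnp.ones((4, 3, 8, 8))
for t in state.timesteps:
    residual = sample * t / (t + 1)  # stand-in for a real denoising model
    output = scheduler.step(state, residual, t, sample)
    sample, state = output.prev_sample, output.state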
-import inspect -import tempfile -import unittest -from typing import Dict, List, Tuple - -from diffusers import FlaxDDIMScheduler, FlaxDDPMScheduler, FlaxPNDMScheduler -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from jax import random - - jax_device = jax.default_backend() - - -@require_flax -class FlaxSchedulerCommonTest(unittest.TestCase): - scheduler_classes = () - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - key1, key2 = random.split(random.PRNGKey(0)) - sample = random.uniform(key1, (batch_size, num_channels, height, width)) - - return sample, key2 - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = jnp.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - return jnp.transpose(sample, (3, 0, 1, 2)) - - def get_scheduler_config(self): - raise NotImplementedError - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, 
key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, key = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, 1, sample, key, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, key = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(state, residual, 0, sample, key, **kwargs).prev_sample - output_1 = scheduler.step(state, residual, 1, sample, key, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, key = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, key, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, key, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def test_deprecated_kwargs(self): - for scheduler_class in self.scheduler_classes: - has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters - has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated" - " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if" - " there are no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated" - " kwargs under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs`" - f" argument to {scheduler_class}.__init__ if there are deprecated arguments or remove the" - " deprecated argument from `_deprecated_kwargs = []`" - ) - - -@require_flax -class FlaxDDPMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxDDPMScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "variance_type": "fixed_small", - "clip_sample": True, - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_variance_type(self): - for variance in ["fixed_small", "fixed_large", "other"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_time_indices(self): - for t in [0, 500, 999]: - self.check_over_forward(time_step=t) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0) - 0.0)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487) - 0.00979)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - num_trained_timesteps = len(scheduler) - - model = self.dummy_model() - sample = self.dummy_sample_deter - key1, key2 = random.split(random.PRNGKey(0)) - - for t in reversed(range(num_trained_timesteps)): - # 1. predict noise residual - residual = model(sample, t) - - # 2.
predict previous mean of sample x_t-1 - output = scheduler.step(state, residual, t, sample, key1) - pred_prev_sample = output.prev_sample - state = output.state - key1, key2 = random.split(key2) - - # if t > 0: - # noise = self.dummy_sample_deter - # variance = scheduler.get_variance(t) ** (0.5) * noise - # - # sample = pred_prev_sample + variance - sample = pred_prev_sample - - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 255.0714) < 1e-2 - assert abs(result_mean - 0.332124) < 1e-3 - else: - assert abs(result_sum - 255.1113) < 1e-2 - assert abs(result_mean - 0.332176) < 1e-3 - - -@require_flax -class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxDDIMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - key1, key2 = random.split(random.PRNGKey(0)) - - num_inference_steps = 10 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - state = scheduler.set_timesteps(state, num_inference_steps) - - for t in state.timesteps: - residual = model(sample, t) - output = scheduler.step(state, residual, t, sample) - sample = output.prev_sample - state = output.state - key1, key2 = random.split(key2) - - return sample - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, 
"set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, 1, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output = scheduler.step(state, residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(new_state, residual, time_step, sample, **kwargs).prev_sample - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(state, residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step(state, residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 500, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, 5) - assert jnp.equal(state.timesteps, jnp.array([801, 601, 401, 201, 1])).all() - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_time_indices(self): - for t in [1, 10, 49]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): - self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) - - def test_variance(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5 - assert 
jnp.sum(jnp.abs(scheduler._get_variance(state, 420, 400) - 0.14771)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 980, 960) - 0.32460)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 0, 0) - 0.0)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 487, 486) - 0.00979)) < 1e-5 - assert jnp.sum(jnp.abs(scheduler._get_variance(state, 999, 998) - 0.02)) < 1e-5 - - def test_full_loop_no_noise(self): - sample = self.full_loop() - - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - assert abs(result_sum - 172.0067) < 1e-2 - assert abs(result_mean - 0.223967) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 149.8409) < 1e-2 - assert abs(result_mean - 0.1951) < 1e-3 - else: - assert abs(result_sum - 149.8295) < 1e-2 - assert abs(result_mean - 0.1951) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - pass - # FIXME: both result_sum and result_mean are nan on TPU - # assert jnp.isnan(result_sum) - # assert jnp.isnan(result_mean) - else: - assert abs(result_sum - 149.0784) < 1e-2 - assert abs(result_mean - 0.1941) < 1e-3 - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - -@require_flax -class FlaxPNDMSchedulerTest(FlaxSchedulerCommonTest): - scheduler_classes = (FlaxPNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample, _ = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - # copy over dummy past residuals - state = state.replace(ets=dummy_past_residuals[:]) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape) - # copy over dummy past residuals - new_state = new_state.replace(ets=dummy_past_residuals[:]) - - (prev_sample, state) = scheduler.step_prk(state, residual, time_step, sample, **kwargs) - (new_prev_sample, new_state) = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(prev_sample - new_prev_sample)) < 1e-5, "Scheduler outputs are not identical" - - output, _ = 
scheduler.step_plms(state, residual, time_step, sample, **kwargs) - new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - return t.at[t != t].set(0) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - jnp.allclose(set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {jnp.max(jnp.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {jnp.isnan(tuple_object).any()} and `inf`: {jnp.isinf(tuple_object)}. Dict has" - f" `nan`: {jnp.isnan(dict_object).any()} and `inf`: {jnp.isinf(dict_object)}." - ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_dict = scheduler.step(state, residual, 0, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - outputs_tuple = scheduler.step(state, residual, 0, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple[0], outputs_dict.prev_sample) - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample, _ = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - # copy over dummy past residuals (must be after setting timesteps) - state = state.replace(ets=dummy_past_residuals[:]) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler, new_state = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_state = new_scheduler.set_timesteps(new_state, num_inference_steps, shape=sample.shape) - - # copy over dummy
past residual (must be after setting timesteps) - new_state = new_state.replace(ets=dummy_past_residuals[:]) - - output, state = scheduler.step_prk(state, residual, time_step, sample, **kwargs) - new_output, new_state = new_scheduler.step_prk(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output, _ = scheduler.step_plms(state, residual, time_step, sample, **kwargs) - new_output, _ = new_scheduler.step_plms(new_state, residual, time_step, sample, **kwargs) - - assert jnp.sum(jnp.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - for i, t in enumerate(state.prk_timesteps): - residual = model(sample, t) - sample, state = scheduler.step_prk(state, residual, t, sample) - - for i, t in enumerate(state.plms_timesteps): - residual = model(sample, t) - sample, state = scheduler.step_plms(state, residual, t, sample) - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - sample, _ = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = jnp.array([residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05]) - state = state.replace(ets=dummy_past_residuals[:]) - - output_0, state = scheduler.step_prk(state, residual, 0, sample, **kwargs) - output_1, state = scheduler.step_prk(state, residual, 1, sample, **kwargs) - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0, state = scheduler.step_plms(state, residual, 0, sample, **kwargs) - output_1, state = scheduler.step_plms(state, residual, 1, sample, **kwargs) - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - state = scheduler.set_timesteps(state, 10, shape=()) - assert jnp.equal( - state.timesteps, - jnp.array([901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1]), - ).all() - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]):
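- # check_over_configs round-trips each config through save_config / from_pretrained and asserts matching step outputs -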
self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_time_indices(self): - for t in [1, 5, 10]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps) - - def test_pow_of_3_inference_steps(self): - # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3 - num_inference_steps = 27 - - for scheduler_class in self.scheduler_classes: - sample, _ = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - state = scheduler.set_timesteps(state, num_inference_steps, shape=sample.shape) - - # before power of 3 fix, would error on first step, so we only need to do two - for i, t in enumerate(state.prk_timesteps[:2]): - sample, state = scheduler.step_prk(state, residual, t, sample) - - def test_inference_plms_no_past_residuals(self): - with self.assertRaises(ValueError): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - state = scheduler.create_state() - - scheduler.step_plms(state, self.dummy_sample, 1, self.dummy_sample).prev_sample - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 198.1275) < 1e-2 - assert abs(result_mean - 0.2580) < 1e-3 - else: - assert abs(result_sum - 198.1318) < 1e-2 - assert abs(result_mean - 0.2580) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 186.83226) < 1e-2 - assert abs(result_mean - 0.24327) < 1e-3 - else: - assert abs(result_sum - 186.9466) < 1e-2 - assert abs(result_mean - 0.24342) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = jnp.sum(jnp.abs(sample)) - result_mean = jnp.mean(jnp.abs(sample)) - - if jax_device == "tpu": - assert abs(result_sum - 186.83226) < 1e-2 - assert abs(result_mean - 0.24327) < 1e-3 - else: - assert abs(result_sum - 186.9482) < 1e-2 - assert abs(result_mean - 0.2434) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_heun.py b/diffusers/tests/schedulers/test_scheduler_heun.py deleted file mode 100644 index 7d38c8e2374c26e49c52f3430a3e595b35771436..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_heun.py +++ /dev/null @@ -1,131 +0,0 @@ -import torch - -from diffusers import HeunDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class HeunDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (HeunDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 
0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 4.6934e-07) < 1e-2 - assert abs(result_mean.item() - 6.1112e-10) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if str(torch_device).startswith("cpu"): - # The following sum varies between 148 and 156 on mps. Why? 
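- # (the variance noted above presumably explains why the mps branch below checks only the mean, with a looser tolerance)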
- assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - elif str(torch_device).startswith("mps"): - # Larger tolerance on mps - assert abs(result_mean.item() - 0.0002) < 1e-2 - else: - # CUDA - assert abs(result_sum.item() - 0.1233) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_ipndm.py b/diffusers/tests/schedulers/test_scheduler_ipndm.py deleted file mode 100644 index 549caed47fe8f100c2bc4164329210209595ba7f..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_ipndm.py +++ /dev/null @@ -1,161 +0,0 @@ -import tempfile - -import torch - -from diffusers import IPNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class IPNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (IPNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = {"num_train_timesteps": 1000} - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - if time_step is None: - time_step = scheduler.timesteps[len(scheduler.timesteps) // 2] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - 
new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - time_step_0 = scheduler.timesteps[5] - time_step_1 = scheduler.timesteps[6] - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps, time_step=None) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=None) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 2540529) < 10 diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py deleted file mode 100644 index 45371121e66b8ffdcecb5cc86a91758e436b2955..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_kdpm2_ancestral.py +++ 
/dev/null @@ -1,123 +0,0 @@ -import torch - -from diffusers import KDPM2AncestralDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2AncestralDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2AncestralDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_full_loop_no_noise(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 13849.3877) < 1e-2 - assert abs(result_mean.item() - 18.0331) < 5e-3 - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - generator = torch.manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 328.9970) < 1e-2 - assert abs(result_mean.item() - 0.4284) < 1e-3 - - def test_full_loop_device(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - generator = torch.manual_seed(0) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample, 
generator=generator) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 13849.3818) < 1e-1 - assert abs(result_mean.item() - 18.0331) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py b/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py deleted file mode 100644 index 4f1bd1f8aeb78a9266a319fe1f097e7c4a5d0e2a..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_kdpm2_discrete.py +++ /dev/null @@ -1,132 +0,0 @@ -import torch - -from diffusers import KDPM2DiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class KDPM2DiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (KDPM2DiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 4.6934e-07) < 1e-2 - assert abs(result_mean.item() - 6.1112e-10) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 4.693428650170972e-07) < 1e-2 - assert abs(result_mean.item() - 0.0002) < 1e-3 - - def test_full_loop_no_noise(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if torch_device in ["cpu", "mps"]: - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - else: - # CUDA - assert 
abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - - def test_full_loop_device(self): - if torch_device == "mps": - return - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter.to(torch_device) * scheduler.init_noise_sigma - - for t in scheduler.timesteps: - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - if str(torch_device).startswith("cpu"): - # The following sum varies between 148 and 156 on mps. Why? - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 - else: - # CUDA - assert abs(result_sum.item() - 20.4125) < 1e-2 - assert abs(result_mean.item() - 0.0266) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_lms.py b/diffusers/tests/schedulers/test_scheduler_lms.py deleted file mode 100644 index ca3574e9ee638546d313e5256feba804522da65b..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_lms.py +++ /dev/null @@ -1,115 +0,0 @@ -import torch - -from diffusers import LMSDiscreteScheduler -from diffusers.utils import torch_device - -from .test_schedulers import SchedulerCommonTest - - -class LMSDiscreteSchedulerTest(SchedulerCommonTest): - scheduler_classes = (LMSDiscreteScheduler,) - num_inference_steps = 10 - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1100, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [10, 50, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_betas(self): - for beta_start, beta_end in zip([0.00001, 0.0001, 0.001], [0.0002, 0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "scaled_linear"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [0, 500, 800]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 - - def test_full_loop_with_v_prediction(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") - scheduler = 
scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 0.0017) < 1e-2 - assert abs(result_mean.item() - 2.2676e-06) < 1e-3 - - def test_full_loop_device(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(self.num_inference_steps, device=torch_device) - - model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma - sample = sample.to(torch_device) - - for i, t in enumerate(scheduler.timesteps): - sample = scheduler.scale_model_input(sample, t) - - model_output = model(sample, t) - - output = scheduler.step(model_output, t, sample) - sample = output.prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 1006.388) < 1e-2 - assert abs(result_mean.item() - 1.31) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_pndm.py b/diffusers/tests/schedulers/test_scheduler_pndm.py deleted file mode 100644 index c1519f7c7e8e113aca61c8749c3a08f6f390309f..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_pndm.py +++ /dev/null @@ -1,242 +0,0 @@ -import tempfile - -import torch - -from diffusers import PNDMScheduler - -from .test_schedulers import SchedulerCommonTest - - -class PNDMSchedulerTest(SchedulerCommonTest): - scheduler_classes = (PNDMScheduler,) - forward_default_kwargs = (("num_inference_steps", 50),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output 
- new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - pass - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.ets = dummy_past_residuals[:] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - # copy over dummy past residuals - new_scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residual (must be after setting timesteps) - new_scheduler.ets = dummy_past_residuals[:] - - output = scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_prk(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - new_output = new_scheduler.step_plms(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def full_loop(self, **config): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.prk_timesteps): - residual = model(sample, t) - sample = scheduler.step_prk(residual, t, sample).prev_sample - - for i, t in enumerate(scheduler.plms_timesteps): - residual = model(sample, t) - sample = scheduler.step_plms(residual, t, sample).prev_sample - - return sample - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # copy over dummy past residuals (must be done after set_timesteps) - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.1, residual + 0.05] - scheduler.ets = dummy_past_residuals[:] - - output_0 = scheduler.step_prk(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_prk(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - output_0 = scheduler.step_plms(residual, 0, sample, **kwargs).prev_sample - output_1 = scheduler.step_plms(residual, 1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) 
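- # consecutive step_plms outputs must also agree in shape, mirroring the step_prk checks above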
- self.assertEqual(output_0.shape, output_1.shape) - - def test_timesteps(self): - for timesteps in [100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_steps_offset(self): - for steps_offset in [0, 1]: - self.check_over_configs(steps_offset=steps_offset) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(steps_offset=1) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(10) - assert torch.equal( - scheduler.timesteps, - torch.LongTensor( - [901, 851, 851, 801, 801, 751, 751, 701, 701, 651, 651, 601, 601, 501, 401, 301, 201, 101, 1] - ), - ) - - def test_betas(self): - for beta_start, beta_end in zip([0.0001, 0.001], [0.002, 0.02]): - self.check_over_configs(beta_start=beta_start, beta_end=beta_end) - - def test_schedules(self): - for schedule in ["linear", "squaredcos_cap_v2"]: - self.check_over_configs(beta_schedule=schedule) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for t in [1, 5, 10]: - self.check_over_forward(time_step=t) - - def test_inference_steps(self): - for t, num_inference_steps in zip([1, 5, 10], [10, 50, 100]): - self.check_over_forward(num_inference_steps=num_inference_steps) - - def test_pow_of_3_inference_steps(self): - # earlier version of set_timesteps() caused an error indexing alpha's with inference steps as power of 3 - num_inference_steps = 27 - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(num_inference_steps) - - # before power of 3 fix, would error on first step, so we only need to do two - for i, t in enumerate(scheduler.prk_timesteps[:2]): - sample = scheduler.step_prk(residual, t, sample).prev_sample - - def test_inference_plms_no_past_residuals(self): - with self.assertRaises(ValueError): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.step_plms(self.dummy_sample, 1, self.dummy_sample).prev_sample - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 198.1318) < 1e-2 - assert abs(result_mean.item() - 0.2580) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 67.3986) < 1e-2 - assert abs(result_mean.item() - 0.0878) < 1e-3 - - def test_full_loop_with_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 230.0399) < 1e-2 - assert abs(result_mean.item() - 0.2995) < 1e-3 - - def test_full_loop_with_no_set_alpha_to_one(self): - # We specify different beta, so that the first alpha is 0.99 - sample = self.full_loop(set_alpha_to_one=False, beta_start=0.01) - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 186.9482) < 1e-2 - 
assert abs(result_mean.item() - 0.2434) < 1e-3 diff --git a/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py b/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py deleted file mode 100644 index 08c30f9b1e0c2ce1f7baab82f5076efabe465a69..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_score_sde_ve.py +++ /dev/null @@ -1,189 +0,0 @@ -import tempfile -import unittest - -import numpy as np -import torch - -from diffusers import ScoreSdeVeScheduler - - -class ScoreSdeVeSchedulerTest(unittest.TestCase): - # TODO adapt with class SchedulerCommonTest (scheduler needs Numpy Integration) - scheduler_classes = (ScoreSdeVeScheduler,) - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = torch.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - sample = sample.permute(3, 0, 1, 2) - - return sample - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 2000, - "snr": 0.15, - "sigma_min": 0.01, - "sigma_max": 1348, - "sampling_eps": 1e-5, - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - for scheduler_class in self.scheduler_classes: - sample = self.dummy_sample - residual = 0.1 * sample - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - output = scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - new_output = new_scheduler.step_pred( - residual, time_step, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler 
outputs are not identical" - - output = scheduler.step_correct(residual, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - new_output = new_scheduler.step_correct( - residual, sample, generator=torch.manual_seed(0), **kwargs - ).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler correction are not identical" - - def test_timesteps(self): - for timesteps in [10, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_sigmas(self): - for sigma_min, sigma_max in zip([0.0001, 0.001, 0.01], [1, 100, 1000]): - self.check_over_configs(sigma_min=sigma_min, sigma_max=sigma_max) - - def test_time_indices(self): - for t in [0.1, 0.5, 0.75]: - self.check_over_forward(time_step=t) - - def test_full_loop_no_noise(self): - kwargs = dict(self.forward_default_kwargs) - - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 3 - - model = self.dummy_model() - sample = self.dummy_sample_deter - - scheduler.set_sigmas(num_inference_steps) - scheduler.set_timesteps(num_inference_steps) - generator = torch.manual_seed(0) - - for i, t in enumerate(scheduler.timesteps): - sigma_t = scheduler.sigmas[i] - - for _ in range(scheduler.config.correct_steps): - with torch.no_grad(): - model_output = model(sample, sigma_t) - sample = scheduler.step_correct(model_output, sample, generator=generator, **kwargs).prev_sample - - with torch.no_grad(): - model_output = model(sample, sigma_t) - - output = scheduler.step_pred(model_output, t, sample, generator=generator, **kwargs) - sample, _ = output.prev_sample, output.prev_sample_mean - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert np.isclose(result_sum.item(), 14372758528.0) - assert np.isclose(result_mean.item(), 18714530.0) - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step_pred(residual, 0, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - output_1 = scheduler.step_pred(residual, 1, sample, generator=torch.manual_seed(0), **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) diff --git a/diffusers/tests/schedulers/test_scheduler_unclip.py b/diffusers/tests/schedulers/test_scheduler_unclip.py deleted file mode 100644 index b0ce1312e79f6762bc7573c3a90e58cb33a21bad..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_unclip.py +++ /dev/null @@ -1,137 +0,0 @@ -import torch - -from diffusers import UnCLIPScheduler - -from .test_schedulers import SchedulerCommonTest - - -# UnCLIPScheduler is a modified DDPMScheduler with a subset of the configuration. 
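- # The tests below exercise both variance types ("fixed_small_log" and "learned_range"), both prediction types ("epsilon" and "sample"), and full denoising loops with and without skipped timesteps.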
-class UnCLIPSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UnCLIPScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "variance_type": "fixed_small_log", - "clip_sample": True, - "clip_sample_range": 1.0, - "prediction_type": "epsilon", - } - - config.update(**kwargs) - return config - - def test_timesteps(self): - for timesteps in [1, 5, 100, 1000]: - self.check_over_configs(num_train_timesteps=timesteps) - - def test_variance_type(self): - for variance in ["fixed_small_log", "learned_range"]: - self.check_over_configs(variance_type=variance) - - def test_clip_sample(self): - for clip_sample in [True, False]: - self.check_over_configs(clip_sample=clip_sample) - - def test_clip_sample_range(self): - for clip_sample_range in [1, 5, 10, 20]: - self.check_over_configs(clip_sample_range=clip_sample_range) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_time_indices(self): - for time_step in [0, 500, 999]: - for prev_timestep in [None, 5, 100, 250, 500, 750]: - if prev_timestep is not None and prev_timestep >= time_step: - continue - - self.check_over_forward(time_step=time_step, prev_timestep=prev_timestep) - - def test_variance_fixed_small_log(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(variance_type="fixed_small_log") - scheduler = scheduler_class(**scheduler_config) - - assert torch.sum(torch.abs(scheduler._get_variance(0) - 1.0000e-10)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.0549625)) < 1e-5 - assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.9994987)) < 1e-5 - - def test_variance_learned_range(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(variance_type="learned_range") - scheduler = scheduler_class(**scheduler_config) - - predicted_variance = 0.5 - - assert abs(scheduler._get_variance(1, predicted_variance=predicted_variance) - -10.1712790) < 1e-5 - assert abs(scheduler._get_variance(487, predicted_variance=predicted_variance) - -5.7998052) < 1e-5 - assert abs(scheduler._get_variance(999, predicted_variance=predicted_variance) - -0.0010011) < 1e-5 - - def test_full_loop(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - timesteps = scheduler.timesteps - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for i, t in enumerate(timesteps): - # 1. predict noise residual - residual = model(sample, t) - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample - - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 252.2682495) < 1e-2 - assert abs(result_mean.item() - 0.3284743) < 1e-3 - - def test_full_loop_skip_timesteps(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - scheduler.set_timesteps(25) - - timesteps = scheduler.timesteps - - model = self.dummy_model() - sample = self.dummy_sample_deter - generator = torch.manual_seed(0) - - for i, t in enumerate(timesteps): - # 1.
predict noise residual - residual = model(sample, t) - - if i + 1 == timesteps.shape[0]: - prev_timestep = None - else: - prev_timestep = timesteps[i + 1] - - # 2. predict previous mean of sample x_t-1 - pred_prev_sample = scheduler.step( - residual, t, sample, prev_timestep=prev_timestep, generator=generator - ).prev_sample - - sample = pred_prev_sample - - result_sum = torch.sum(torch.abs(sample)) - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_sum.item() - 258.2044983) < 1e-2 - assert abs(result_mean.item() - 0.3362038) < 1e-3 - - def test_trained_betas(self): - pass - - def test_add_noise_device(self): - pass diff --git a/diffusers/tests/schedulers/test_scheduler_unipc.py b/diffusers/tests/schedulers/test_scheduler_unipc.py deleted file mode 100644 index 6154c8e2d625506f138c28da7a605e5739e6ffd3..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_unipc.py +++ /dev/null @@ -1,231 +0,0 @@ -import tempfile - -import torch - -from diffusers import ( - DEISMultistepScheduler, - DPMSolverMultistepScheduler, - DPMSolverSinglestepScheduler, - UniPCMultistepScheduler, -) - -from .test_schedulers import SchedulerCommonTest - - -class UniPCMultistepSchedulerTest(SchedulerCommonTest): - scheduler_classes = (UniPCMultistepScheduler,) - forward_default_kwargs = (("num_inference_steps", 25),) - - def get_scheduler_config(self, **kwargs): - config = { - "num_train_timesteps": 1000, - "beta_start": 0.0001, - "beta_end": 0.02, - "beta_schedule": "linear", - "solver_order": 2, - "solver_type": "bh1", - } - - config.update(**kwargs) - return config - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order] - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - new_scheduler.set_timesteps(num_inference_steps) - # copy over dummy past residuals - new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order] - - output, new_output = sample, sample - for t in range(time_step, time_step + scheduler.config.solver_order + 1): - output = scheduler.step(residual, t, output, **kwargs).prev_sample - new_output = new_scheduler.step(residual, t, new_output, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", None) - sample = self.dummy_sample - residual = 0.1 * sample - dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10] - - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(num_inference_steps) - - # copy over dummy past residuals (must be after setting timesteps) - scheduler.model_outputs = dummy_past_residuals[: 
scheduler.config.solver_order]
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_config(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-                # copy over dummy past residuals
-                new_scheduler.set_timesteps(num_inference_steps)
-
-                # copy over dummy past residuals (must be after setting timesteps)
-                new_scheduler.model_outputs = dummy_past_residuals[: new_scheduler.config.solver_order]
-
-            output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-            new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample
-
-            assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical"
-
-    def full_loop(self, scheduler=None, **config):
-        # only build a default scheduler when the caller did not pass one in;
-        # unconditionally rebuilding it here would silently ignore the argument
-        if scheduler is None:
-            scheduler_class = self.scheduler_classes[0]
-            scheduler_config = self.get_scheduler_config(**config)
-            scheduler = scheduler_class(**scheduler_config)
-
-        num_inference_steps = 10
-        model = self.dummy_model()
-        sample = self.dummy_sample_deter
-        scheduler.set_timesteps(num_inference_steps)
-
-        for i, t in enumerate(scheduler.timesteps):
-            residual = model(sample, t)
-            sample = scheduler.step(residual, t, sample).prev_sample
-
-        return sample
-
-    def test_step_shape(self):
-        kwargs = dict(self.forward_default_kwargs)
-
-        num_inference_steps = kwargs.pop("num_inference_steps", None)
-
-        for scheduler_class in self.scheduler_classes:
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config)
-
-            sample = self.dummy_sample
-            residual = 0.1 * sample
-
-            if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"):
-                scheduler.set_timesteps(num_inference_steps)
-            elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"):
-                kwargs["num_inference_steps"] = num_inference_steps
-
-            # copy over dummy past residuals (must be done after set_timesteps)
-            dummy_past_residuals = [residual + 0.2, residual + 0.15, residual + 0.10]
-            scheduler.model_outputs = dummy_past_residuals[: scheduler.config.solver_order]
-
-            time_step_0 = scheduler.timesteps[5]
-            time_step_1 = scheduler.timesteps[6]
-
-            output_0 = scheduler.step(residual, time_step_0, sample, **kwargs).prev_sample
-            output_1 = scheduler.step(residual, time_step_1, sample, **kwargs).prev_sample
-
-            self.assertEqual(output_0.shape, sample.shape)
-            self.assertEqual(output_0.shape, output_1.shape)
-
-    def test_switch(self):
-        # make sure that switching between compatible schedulers created from the
-        # same config gives the same results for the defaults
-        scheduler = UniPCMultistepScheduler(**self.get_scheduler_config())
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = torch.mean(torch.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-        scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config)
-        scheduler = DEISMultistepScheduler.from_config(scheduler.config)
-        scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config)
-        scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
-
-        sample = self.full_loop(scheduler=scheduler)
-        result_mean = torch.mean(torch.abs(sample))
-
-        assert abs(result_mean.item() - 0.2521) < 1e-3
-
-    def test_timesteps(self):
-        for timesteps in [25, 50, 100, 999, 1000]:
-            self.check_over_configs(num_train_timesteps=timesteps)
-
-    def test_thresholding(self):
-        self.check_over_configs(thresholding=False)
-        for order
in [1, 2, 3]: - for solver_type in ["bh1", "bh2"]: - for threshold in [0.5, 1.0, 2.0]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - thresholding=True, - prediction_type=prediction_type, - sample_max_value=threshold, - solver_order=order, - solver_type=solver_type, - ) - - def test_prediction_type(self): - for prediction_type in ["epsilon", "v_prediction"]: - self.check_over_configs(prediction_type=prediction_type) - - def test_solver_order_and_type(self): - for solver_type in ["bh1", "bh2"]: - for order in [1, 2, 3]: - for prediction_type in ["epsilon", "sample"]: - self.check_over_configs( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - ) - sample = self.full_loop( - solver_order=order, - solver_type=solver_type, - prediction_type=prediction_type, - ) - assert not torch.isnan(sample).any(), "Samples have nan numbers" - - def test_lower_order_final(self): - self.check_over_configs(lower_order_final=True) - self.check_over_configs(lower_order_final=False) - - def test_inference_steps(self): - for num_inference_steps in [1, 2, 3, 5, 10, 50, 100, 999, 1000]: - self.check_over_forward(num_inference_steps=num_inference_steps, time_step=0) - - def test_full_loop_no_noise(self): - sample = self.full_loop() - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.2521) < 1e-3 - - def test_full_loop_with_v_prediction(self): - sample = self.full_loop(prediction_type="v_prediction") - result_mean = torch.mean(torch.abs(sample)) - - assert abs(result_mean.item() - 0.1096) < 1e-3 - - def test_fp16_support(self): - scheduler_class = self.scheduler_classes[0] - scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) - scheduler = scheduler_class(**scheduler_config) - - num_inference_steps = 10 - model = self.dummy_model() - sample = self.dummy_sample_deter.half() - scheduler.set_timesteps(num_inference_steps) - - for i, t in enumerate(scheduler.timesteps): - residual = model(sample, t) - sample = scheduler.step(residual, t, sample).prev_sample - - assert sample.dtype == torch.float16 diff --git a/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py b/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py deleted file mode 100644 index 74437ad4548074a488917d3ea9b5eef4f0ac1532..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_scheduler_vq_diffusion.py +++ /dev/null @@ -1,56 +0,0 @@ -import torch -import torch.nn.functional as F - -from diffusers import VQDiffusionScheduler - -from .test_schedulers import SchedulerCommonTest - - -class VQDiffusionSchedulerTest(SchedulerCommonTest): - scheduler_classes = (VQDiffusionScheduler,) - - def get_scheduler_config(self, **kwargs): - config = { - "num_vec_classes": 4097, - "num_train_timesteps": 100, - } - - config.update(**kwargs) - return config - - def dummy_sample(self, num_vec_classes): - batch_size = 4 - height = 8 - width = 8 - - sample = torch.randint(0, num_vec_classes, (batch_size, height * width)) - - return sample - - @property - def dummy_sample_deter(self): - assert False - - def dummy_model(self, num_vec_classes): - def model(sample, t, *args): - batch_size, num_latent_pixels = sample.shape - logits = torch.rand((batch_size, num_vec_classes - 1, num_latent_pixels)) - return_value = F.log_softmax(logits.double(), dim=1).float() - return return_value - - return model - - def test_timesteps(self): - for timesteps in [2, 5, 100, 1000]: - 
self.check_over_configs(num_train_timesteps=timesteps) - - def test_num_vec_classes(self): - for num_vec_classes in [5, 100, 1000, 4000]: - self.check_over_configs(num_vec_classes=num_vec_classes) - - def test_time_indices(self): - for t in [0, 50, 99]: - self.check_over_forward(time_step=t) - - def test_add_noise_device(self): - pass diff --git a/diffusers/tests/schedulers/test_schedulers.py b/diffusers/tests/schedulers/test_schedulers.py deleted file mode 100755 index bfbf5cbc798f52b2eceba8ba17747dc4d8ae8bc3..0000000000000000000000000000000000000000 --- a/diffusers/tests/schedulers/test_schedulers.py +++ /dev/null @@ -1,598 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import inspect -import json -import os -import tempfile -import unittest -from typing import Dict, List, Tuple - -import numpy as np -import torch - -import diffusers -from diffusers import ( - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - IPNDMScheduler, - LMSDiscreteScheduler, - VQDiffusionScheduler, - logging, -) -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.schedulers.scheduling_utils import SchedulerMixin -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import CaptureLogger - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SchedulerObject(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class SchedulerObject2(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], - ): - pass - - -class SchedulerObject3(SchedulerMixin, ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], - ): - pass - - -class SchedulerBaseTests(unittest.TestCase): - def test_save_load_from_different_config(self): - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - logger = logging.get_logger("diffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_1 = SchedulerObject2.from_config(config) - - # now save a config parameter that is not expected - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject.load_config(tmpdirname) - new_obj_2 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = 
SchedulerObject2.load_config(tmpdirname) - new_obj_3 = SchedulerObject2.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject2 - assert new_obj_2.__class__ == SchedulerObject - assert new_obj_3.__class__ == SchedulerObject2 - - assert cap_logger_1.out == "" - assert ( - cap_logger_2.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. Please verify your config.json configuration file.\n" - ) - assert cap_logger_2.out.replace("SchedulerObject", "SchedulerObject2") == cap_logger_3.out - - def test_save_load_compatible_schedulers(self): - SchedulerObject2._compatibles = ["SchedulerObject"] - SchedulerObject._compatibles = ["SchedulerObject2"] - - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - setattr(diffusers, "SchedulerObject2", SchedulerObject2) - logger = logging.get_logger("diffusers.configuration_utils") - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - # now save a config parameter that is expected by another class, but not origin class - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "r") as f: - data = json.load(f) - data["f"] = [0, 0] - data["unexpected"] = True - - with open(os.path.join(tmpdirname, SchedulerObject.config_name), "w") as f: - json.dump(data, f) - - with CaptureLogger(logger) as cap_logger: - config = SchedulerObject.load_config(tmpdirname) - new_obj = SchedulerObject.from_config(config) - - assert new_obj.__class__ == SchedulerObject - - assert ( - cap_logger.out - == "The config attributes {'unexpected': True} were passed to SchedulerObject, but are not expected and" - " will" - " be ignored. Please verify your config.json configuration file.\n" - ) - - def test_save_load_from_different_config_comp_schedulers(self): - SchedulerObject3._compatibles = ["SchedulerObject", "SchedulerObject2"] - SchedulerObject2._compatibles = ["SchedulerObject", "SchedulerObject3"] - SchedulerObject._compatibles = ["SchedulerObject2", "SchedulerObject3"] - - obj = SchedulerObject() - - # mock add obj class to `diffusers` - setattr(diffusers, "SchedulerObject", SchedulerObject) - setattr(diffusers, "SchedulerObject2", SchedulerObject2) - setattr(diffusers, "SchedulerObject3", SchedulerObject3) - logger = logging.get_logger("diffusers.configuration_utils") - logger.setLevel(diffusers.logging.INFO) - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - - with CaptureLogger(logger) as cap_logger_1: - config = SchedulerObject.load_config(tmpdirname) - new_obj_1 = SchedulerObject.from_config(config) - - with CaptureLogger(logger) as cap_logger_2: - config = SchedulerObject2.load_config(tmpdirname) - new_obj_2 = SchedulerObject2.from_config(config) - - with CaptureLogger(logger) as cap_logger_3: - config = SchedulerObject3.load_config(tmpdirname) - new_obj_3 = SchedulerObject3.from_config(config) - - assert new_obj_1.__class__ == SchedulerObject - assert new_obj_2.__class__ == SchedulerObject2 - assert new_obj_3.__class__ == SchedulerObject3 - - assert cap_logger_1.out == "" - assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" - assert cap_logger_3.out == "{'f'} was not found in config. 
Values will be initialized to default values.\n" - - -class SchedulerCommonTest(unittest.TestCase): - scheduler_classes = () - forward_default_kwargs = () - - @property - def dummy_sample(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - @property - def dummy_sample_deter(self): - batch_size = 4 - num_channels = 3 - height = 8 - width = 8 - - num_elems = batch_size * num_channels * height * width - sample = torch.arange(num_elems) - sample = sample.reshape(num_channels, height, width, batch_size) - sample = sample / num_elems - sample = sample.permute(3, 0, 1, 2) - - return sample - - def get_scheduler_config(self): - raise NotImplementedError - - def dummy_model(self): - def model(sample, t, *args): - return sample * t / (t + 1) - - return model - - def check_over_configs(self, time_step=0, **config): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - # TODO(Suraj) - delete the following two lines once DDPM, DDIM, and PNDM have timesteps casted to float by default - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config(**config) - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Make sure `scale_model_input` is invoked to prevent a warning - if scheduler_class != VQDiffusionScheduler: - _ = scheduler.scale_model_input(sample, 0) - _ = new_scheduler.scale_model_input(sample, 0) - - # Set the seed before step() as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def check_over_forward(self, time_step=0, **forward_kwargs): - kwargs = dict(self.forward_default_kwargs) - kwargs.update(forward_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - time_step = float(time_step) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - 
if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, time_step) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, time_step, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_from_save_pretrained(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - for scheduler_class in self.scheduler_classes: - timestep = 1 - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_config(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - new_scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - output = scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - new_output = new_scheduler.step(residual, timestep, sample, **kwargs).prev_sample - - assert torch.sum(torch.abs(output - new_output)) < 1e-5, "Scheduler outputs are not identical" - - def test_compatibles(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - assert all(c is not None for c in scheduler.compatibles) - - for comp_scheduler_cls in scheduler.compatibles: - comp_scheduler = comp_scheduler_cls.from_config(scheduler.config) - assert comp_scheduler is not None - - new_scheduler = scheduler_class.from_config(comp_scheduler.config) - - new_scheduler_config = {k: v for k, v in new_scheduler.config.items() if k in 
scheduler.config} - scheduler_diff = {k: v for k, v in new_scheduler.config.items() if k not in scheduler.config} - - # make sure that configs are essentially identical - assert new_scheduler_config == dict(scheduler.config) - - # make sure that only differences are for configs that are not in init - init_keys = inspect.signature(scheduler_class.__init__).parameters.keys() - assert set(scheduler_diff.keys()).intersection(set(init_keys)) == set() - - def test_from_pretrained(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - - scheduler = scheduler_class(**scheduler_config) - - with tempfile.TemporaryDirectory() as tmpdirname: - scheduler.save_pretrained(tmpdirname) - new_scheduler = scheduler_class.from_pretrained(tmpdirname) - - assert scheduler.config == new_scheduler.config - - def test_step_shape(self): - kwargs = dict(self.forward_default_kwargs) - - num_inference_steps = kwargs.pop("num_inference_steps", None) - - timestep_0 = 0 - timestep_1 = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep_0 = float(timestep_0) - timestep_1 = float(timestep_1) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep_0) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - output_0 = scheduler.step(residual, timestep_0, sample, **kwargs).prev_sample - output_1 = scheduler.step(residual, timestep_1, sample, **kwargs).prev_sample - - self.assertEqual(output_0.shape, sample.shape) - self.assertEqual(output_0.shape, output_1.shape) - - def test_scheduler_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - t[t != t] = 0 - return t - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - torch.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has" - f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}." 
- ), - ) - - kwargs = dict(self.forward_default_kwargs) - num_inference_steps = kwargs.pop("num_inference_steps", 50) - - timestep = 0 - if len(self.scheduler_classes) > 0 and self.scheduler_classes[0] == IPNDMScheduler: - timestep = 1 - - for scheduler_class in self.scheduler_classes: - if scheduler_class in (EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler): - timestep = float(timestep) - - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class == VQDiffusionScheduler: - num_vec_classes = scheduler_config["num_vec_classes"] - sample = self.dummy_sample(num_vec_classes) - model = self.dummy_model(num_vec_classes) - residual = model(sample, timestep) - else: - sample = self.dummy_sample - residual = 0.1 * sample - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - outputs_dict = scheduler.step(residual, timestep, sample, **kwargs) - - if num_inference_steps is not None and hasattr(scheduler, "set_timesteps"): - scheduler.set_timesteps(num_inference_steps) - elif num_inference_steps is not None and not hasattr(scheduler, "set_timesteps"): - kwargs["num_inference_steps"] = num_inference_steps - - # Set the seed before state as some schedulers are stochastic like EulerAncestralDiscreteScheduler, EulerDiscreteScheduler - if "generator" in set(inspect.signature(scheduler.step).parameters.keys()): - kwargs["generator"] = torch.manual_seed(0) - outputs_tuple = scheduler.step(residual, timestep, sample, return_dict=False, **kwargs) - - recursive_check(outputs_tuple, outputs_dict) - - def test_scheduler_public_api(self): - for scheduler_class in self.scheduler_classes: - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - - if scheduler_class != VQDiffusionScheduler: - self.assertTrue( - hasattr(scheduler, "init_noise_sigma"), - f"{scheduler_class} does not implement a required attribute `init_noise_sigma`", - ) - self.assertTrue( - hasattr(scheduler, "scale_model_input"), - ( - f"{scheduler_class} does not implement a required class method `scale_model_input(sample," - " timestep)`" - ), - ) - self.assertTrue( - hasattr(scheduler, "step"), - f"{scheduler_class} does not implement a required class method `step(...)`", - ) - - if scheduler_class != VQDiffusionScheduler: - sample = self.dummy_sample - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - def test_add_noise_device(self): - for scheduler_class in self.scheduler_classes: - if scheduler_class == IPNDMScheduler: - continue - scheduler_config = self.get_scheduler_config() - scheduler = scheduler_class(**scheduler_config) - scheduler.set_timesteps(100) - - sample = self.dummy_sample.to(torch_device) - scaled_sample = scheduler.scale_model_input(sample, 0.0) - self.assertEqual(sample.shape, scaled_sample.shape) - - noise = torch.randn_like(scaled_sample).to(torch_device) - t = scheduler.timesteps[5][None] - noised = scheduler.add_noise(scaled_sample, noise, t) - 
self.assertEqual(noised.shape, scaled_sample.shape)
-
-    def test_deprecated_kwargs(self):
-        for scheduler_class in self.scheduler_classes:
-            has_kwarg_in_model_class = "kwargs" in inspect.signature(scheduler_class.__init__).parameters
-            has_deprecated_kwarg = len(scheduler_class._deprecated_kwargs) > 0
-
-            if has_kwarg_in_model_class and not has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} has `**kwargs` in its __init__ method but has not defined any deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if"
-                    " there are no deprecated arguments or add the deprecated arguments to the `_deprecated_kwargs`"
-                    " class attribute"
-                )
-
-            if not has_kwarg_in_model_class and has_deprecated_kwarg:
-                raise ValueError(
-                    f"{scheduler_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated"
-                    " kwargs under the `_deprecated_kwargs` class attribute. Make sure to either add the `**kwargs`"
-                    f" argument to {scheduler_class}.__init__ if there are deprecated arguments or remove the"
-                    " deprecated arguments from `_deprecated_kwargs`"
-                )
-
-    def test_trained_betas(self):
-        for scheduler_class in self.scheduler_classes:
-            if scheduler_class == VQDiffusionScheduler:
-                continue
-
-            scheduler_config = self.get_scheduler_config()
-            scheduler = scheduler_class(**scheduler_config, trained_betas=np.array([0.1, 0.3]))
-
-            with tempfile.TemporaryDirectory() as tmpdirname:
-                scheduler.save_pretrained(tmpdirname)
-                new_scheduler = scheduler_class.from_pretrained(tmpdirname)
-
-            assert scheduler.betas.tolist() == new_scheduler.betas.tolist()
diff --git a/diffusers/tests/test_config.py b/diffusers/tests/test_config.py
deleted file mode 100644
index 95b0cdf9a597ef8ff26fab3ada4a2deeac156b8e..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_config.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
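The contract that `test_deprecated_kwargs` above enforces is worth spelling out: a scheduler that still accepts a deprecated argument must both take `**kwargs` in `__init__` and list that argument in the `_deprecated_kwargs` class attribute; having one without the other fails the test. A minimal sketch of a conforming class (the `old_beta` argument and the plain `FutureWarning` are illustrative, not the library's actual deprecation helper):

    import warnings

    class ToySchedulerWithDeprecation:
        # Arguments listed here are still accepted via **kwargs but slated for removal.
        _deprecated_kwargs = ["old_beta"]

        def __init__(self, beta=0.1, **kwargs):
            if "old_beta" in kwargs:
                warnings.warn("`old_beta` is deprecated, pass `beta` instead.", FutureWarning)
                beta = kwargs.pop("old_beta")
            self.beta = beta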
- -import tempfile -import unittest - -from diffusers import ( - DDIMScheduler, - DDPMScheduler, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - PNDMScheduler, - logging, -) -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.utils.testing_utils import CaptureLogger - - -class SampleObject(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class SampleObject2(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - f=[1, 3], - ): - pass - - -class SampleObject3(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - f=[1, 3], - ): - pass - - -class ConfigTester(unittest.TestCase): - def test_load_not_from_mixin(self): - with self.assertRaises(ValueError): - ConfigMixin.load_config("dummy_path") - - def test_register_to_config(self): - obj = SampleObject() - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # init ignore private arguments - obj = SampleObject(_name_or_path="lalala") - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can override default - obj = SampleObject(c=6) - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can use positional arguments. 
- obj = SampleObject(1, c=6) - config = obj.config - assert config["a"] == 1 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - def test_save_load(self): - obj = SampleObject() - config = obj.config - - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - new_obj = SampleObject.from_config(SampleObject.load_config(tmpdirname)) - new_config = new_obj.config - - # unfreeze configs - config = dict(config) - new_config = dict(new_config) - - assert config.pop("c") == (2, 5) # instantiated as tuple - assert new_config.pop("c") == [2, 5] # saved & loaded as list because of json - assert config == new_config - - def test_load_ddim_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - ddim = DDIMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert ddim.__class__ == DDIMScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_euler_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - euler = EulerDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert euler.__class__ == EulerDiscreteScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_euler_ancestral_from_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - euler = EulerAncestralDiscreteScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert euler.__class__ == EulerAncestralDiscreteScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_load_pndm(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - pndm = PNDMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert pndm.__class__ == PNDMScheduler - # no warning should be thrown - assert cap_logger.out == "" - - def test_overwrite_config_on_load(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - ddpm = DDPMScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="scheduler", - prediction_type="sample", - beta_end=8, - ) - - with CaptureLogger(logger) as cap_logger_2: - ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88) - - assert ddpm.__class__ == DDPMScheduler - assert ddpm.config.prediction_type == "sample" - assert ddpm.config.beta_end == 8 - assert ddpm_2.config.beta_start == 88 - - # no warning should be thrown - assert cap_logger.out == "" - assert cap_logger_2.out == "" - - def test_load_dpmsolver(self): - logger = logging.get_logger("diffusers.configuration_utils") - - with CaptureLogger(logger) as cap_logger: - dpm = DPMSolverMultistepScheduler.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler" - ) - - assert dpm.__class__ == DPMSolverMultistepScheduler - # no warning should be thrown - assert cap_logger.out == "" diff --git 
a/diffusers/tests/test_ema.py b/diffusers/tests/test_ema.py deleted file mode 100644 index 812d83e2f2418817f4d7e0e1c81d1b1dedfa611d..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_ema.py +++ /dev/null @@ -1,156 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tempfile -import unittest - -import torch - -from diffusers import UNet2DConditionModel -from diffusers.training_utils import EMAModel -from diffusers.utils.testing_utils import skip_mps, torch_device - - -class EMAModelTests(unittest.TestCase): - model_id = "hf-internal-testing/tiny-stable-diffusion-pipe" - batch_size = 1 - prompt_length = 77 - text_encoder_hidden_dim = 32 - num_in_channels = 4 - latent_height = latent_width = 64 - generator = torch.manual_seed(0) - - def get_models(self, decay=0.9999): - unet = UNet2DConditionModel.from_pretrained(self.model_id, subfolder="unet") - unet = unet.to(torch_device) - ema_unet = EMAModel(unet.parameters(), decay=decay, model_cls=UNet2DConditionModel, model_config=unet.config) - return unet, ema_unet - - def get_dummy_inputs(self): - noisy_latents = torch.randn( - self.batch_size, self.num_in_channels, self.latent_height, self.latent_width, generator=self.generator - ).to(torch_device) - timesteps = torch.randint(0, 1000, size=(self.batch_size,), generator=self.generator).to(torch_device) - encoder_hidden_states = torch.randn( - self.batch_size, self.prompt_length, self.text_encoder_hidden_dim, generator=self.generator - ).to(torch_device) - return noisy_latents, timesteps, encoder_hidden_states - - def simulate_backprop(self, unet): - updated_state_dict = {} - for k, param in unet.state_dict().items(): - updated_param = torch.randn_like(param) + (param * torch.randn_like(param)) - updated_state_dict.update({k: updated_param}) - unet.load_state_dict(updated_state_dict) - return unet - - def test_optimization_steps_updated(self): - unet, ema_unet = self.get_models() - # Take the first (hypothetical) EMA step. - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 1 - - # Take two more. - for _ in range(2): - ema_unet.step(unet.parameters()) - assert ema_unet.optimization_step == 3 - - def test_shadow_params_not_updated(self): - unet, ema_unet = self.get_models() - # Since the `unet` is not being updated (i.e., backprop'd) - # there won't be any difference between the `params` of `unet` - # and `ema_unet` even if we call `ema_unet.step(unet.parameters())`. - ema_unet.step(unet.parameters()) - orig_params = list(unet.parameters()) - for s_param, param in zip(ema_unet.shadow_params, orig_params): - assert torch.allclose(s_param, param) - - # The above holds true even if we call `ema.step()` multiple times since - # `unet` params are still not being updated. 
-        for _ in range(4):
-            ema_unet.step(unet.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert torch.allclose(s_param, param)
-
-    def test_shadow_params_updated(self):
-        unet, ema_unet = self.get_models()
-        # Here we simulate the parameter updates for `unet`. Since there might
-        # be some parameters which are initialized to zero we take extra care to
-        # initialize their values to something non-zero before the multiplication.
-        unet_pseudo_updated_step_one = self.simulate_backprop(unet)
-
-        # Take the EMA step.
-        ema_unet.step(unet_pseudo_updated_step_one.parameters())
-
-        # Now the EMA'd parameters won't be equal to the original model parameters.
-        # (`not torch.allclose(...)` rather than `~torch.allclose(...)`: bitwise-negating
-        # a Python bool yields a nonzero int, so the old form could never fail.)
-        orig_params = list(unet_pseudo_updated_step_one.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert not torch.allclose(s_param, param)
-
-        # Ensure this is the case when we take multiple EMA steps.
-        for _ in range(4):
-            ema_unet.step(unet.parameters())
-        for s_param, param in zip(ema_unet.shadow_params, orig_params):
-            assert not torch.allclose(s_param, param)
-
-    def test_consecutive_shadow_params_updated(self):
-        # If we take an EMA step after each of two consecutive backpropagations,
-        # the shadow params from those two steps should be different. The shadow
-        # params are updated in place, so we snapshot clones after the first step.
-        unet, ema_unet = self.get_models()
-
-        # First backprop + EMA
-        unet_step_one = self.simulate_backprop(unet)
-        ema_unet.step(unet_step_one.parameters())
-        step_one_shadow_params = [p.clone() for p in ema_unet.shadow_params]
-
-        # Second backprop + EMA
-        unet_step_two = self.simulate_backprop(unet_step_one)
-        ema_unet.step(unet_step_two.parameters())
-        step_two_shadow_params = ema_unet.shadow_params
-
-        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
-            assert not torch.allclose(step_one, step_two)
-
-    def test_zero_decay(self):
-        # If there's no decay even if there are backprops, EMA steps
-        # won't take any effect i.e., the shadow params would remain the
-        # same.
-        unet, ema_unet = self.get_models(decay=0.0)
-        unet_step_one = self.simulate_backprop(unet)
-        ema_unet.step(unet_step_one.parameters())
-        step_one_shadow_params = ema_unet.shadow_params
-
-        unet_step_two = self.simulate_backprop(unet_step_one)
-        ema_unet.step(unet_step_two.parameters())
-        step_two_shadow_params = ema_unet.shadow_params
-
-        for step_one, step_two in zip(step_one_shadow_params, step_two_shadow_params):
-            assert torch.allclose(step_one, step_two)
-
-    @skip_mps
-    def test_serialization(self):
-        unet, ema_unet = self.get_models()
-        noisy_latents, timesteps, encoder_hidden_states = self.get_dummy_inputs()
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            ema_unet.save_pretrained(tmpdir)
-            loaded_unet = UNet2DConditionModel.from_pretrained(tmpdir, model_cls=UNet2DConditionModel)
-            loaded_unet = loaded_unet.to(unet.device)
-
-        # Since no EMA step has been performed the outputs should match.
-        output = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-        output_loaded = loaded_unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
-        assert torch.allclose(output, output_loaded, atol=1e-4)
diff --git a/diffusers/tests/test_hub_utils.py b/diffusers/tests/test_hub_utils.py
deleted file mode 100644
index e8b8ea3a2fd9b114ff184291e7ec73928ba885d7..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_hub_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from pathlib import Path
-from tempfile import TemporaryDirectory
-from unittest.mock import Mock, patch
-
-import diffusers.utils.hub_utils
-
-
-class CreateModelCardTest(unittest.TestCase):
-    @patch("diffusers.utils.hub_utils.get_full_repo_name")
-    def test_create_model_card(self, repo_name_mock: Mock) -> None:
-        repo_name_mock.return_value = "full_repo_name"
-        with TemporaryDirectory() as tmpdir:
-            # Dummy args values
-            args = Mock()
-            args.output_dir = tmpdir
-            args.local_rank = 0
-            args.hub_token = "hub_token"
-            args.dataset_name = "dataset_name"
-            args.learning_rate = 0.01
-            args.train_batch_size = 100000
-            args.eval_batch_size = 10000
-            args.gradient_accumulation_steps = 0.01
-            args.adam_beta1 = 0.02
-            args.adam_beta2 = 0.03
-            args.adam_weight_decay = 0.0005
-            args.adam_epsilon = 0.000001
-            args.lr_scheduler = 1
-            args.lr_warmup_steps = 10
-            args.ema_inv_gamma = 0.001
-            args.ema_power = 0.1
-            args.ema_max_decay = 0.2
-            args.mixed_precision = True
-
-            # Model card must be rendered and saved
-            diffusers.utils.hub_utils.create_model_card(args, model_name="model_name")
-            self.assertTrue((Path(tmpdir) / "README.md").is_file())
diff --git a/diffusers/tests/test_image_processor.py b/diffusers/tests/test_image_processor.py
deleted file mode 100644
index 4f0e2c5aecfdaa63126e6d64fabf7cae4fba4b57..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_image_processor.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
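The `VaeImageProcessor` tests that follow all probe the same invariant from different input types: with resizing and normalization disabled, `postprocess(preprocess(x))` should reproduce `x` (up to the uint8 rounding the PIL path introduces). A minimal sketch of that roundtrip on the tensor path, using the same constructor arguments as the tests below:

    import torch

    from diffusers.image_processor import VaeImageProcessor

    processor = VaeImageProcessor(do_resize=False, do_normalize=False)

    image = torch.rand(1, 3, 8, 8)  # NCHW, values in [0, 1]
    roundtrip = processor.postprocess(processor.preprocess(image), output_type="pt")

    # With do_resize/do_normalize off, the pt -> pt roundtrip is (near-)lossless.
    assert torch.abs(image - roundtrip).max() < 1e-6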
- -import unittest - -import numpy as np -import PIL -import torch - -from diffusers.image_processor import VaeImageProcessor - - -class ImageProcessorTest(unittest.TestCase): - @property - def dummy_sample(self): - batch_size = 1 - num_channels = 3 - height = 8 - width = 8 - - sample = torch.rand((batch_size, num_channels, height, width)) - - return sample - - def to_np(self, image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt = self.dummy_sample - input_np = self.to_np(input_pt) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess( - image_processor.preprocess(input_pt), - output_type=output_type, - ) - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - - out_np = self.to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_np = self.dummy_sample.cpu().numpy().transpose(0, 2, 3, 1) - input_pil = image_processor.numpy_to_pil(input_np) - - for output_type in ["pt", "np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = self.to_np(out) if output_type == "pil" else (self.to_np(out) * 255).round() - assert ( - np.abs(in_np - out_np).max() < 1e-6 - ), f"decoded output does not match input for output_type {output_type}" - - def test_preprocess_input_3d(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt_4d = self.dummy_sample - input_pt_3d = input_pt_4d.squeeze(0) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) - out_pt_3d = image_processor.postprocess( - image_processor.preprocess(input_pt_3d), - output_type="np", - ) - - input_np_4d = self.to_np(self.dummy_sample) - input_np_3d = input_np_4d.squeeze(0) - - out_np_4d = image_processor.postprocess( - image_processor.preprocess(input_np_4d), - output_type="np", - ) - out_np_3d = image_processor.postprocess( - image_processor.preprocess(input_np_3d), - output_type="np", - ) - - assert np.abs(out_pt_4d - out_pt_3d).max() < 1e-6 - assert np.abs(out_np_4d - out_np_3d).max() < 1e-6 - - def test_preprocess_input_list(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) - - input_pt_4d = self.dummy_sample - input_pt_list = list(input_pt_4d) - - out_pt_4d = image_processor.postprocess( - image_processor.preprocess(input_pt_4d), - output_type="np", - ) - - out_pt_list = image_processor.postprocess( - 
image_processor.preprocess(input_pt_list),
-            output_type="np",
-        )
-
-        # exercise list-of-arrays inputs on the numpy path as well
-        input_np_4d = self.to_np(self.dummy_sample)
-        input_np_list = list(input_np_4d)
-
-        out_np_4d = image_processor.postprocess(
-            image_processor.preprocess(input_np_4d),
-            output_type="np",
-        )
-
-        out_np_list = image_processor.postprocess(
-            image_processor.preprocess(input_np_list),
-            output_type="np",
-        )
-
-        assert np.abs(out_pt_4d - out_pt_list).max() < 1e-6
-        assert np.abs(out_np_4d - out_np_list).max() < 1e-6
diff --git a/diffusers/tests/test_layers_utils.py b/diffusers/tests/test_layers_utils.py
deleted file mode 100644
index d0e2102b539eed99d2a3c0910c1c7d2d9def4c6f..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_layers_utils.py
+++ /dev/null
@@ -1,586 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import unittest
-
-import numpy as np
-import torch
-from torch import nn
-
-from diffusers.models.attention import GEGLU, AdaLayerNorm, ApproximateGELU, AttentionBlock
-from diffusers.models.embeddings import get_timestep_embedding
-from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
-from diffusers.models.transformer_2d import Transformer2DModel
-from diffusers.utils import torch_device
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-class EmbeddingsTests(unittest.TestCase):
-    def test_timestep_embeddings(self):
-        embedding_dim = 256
-        timesteps = torch.arange(16)
-
-        t1 = get_timestep_embedding(timesteps, embedding_dim)
-
-        # first vector should always be composed only of 0's and 1's
-        assert (t1[0, : embedding_dim // 2] - 0).abs().sum() < 1e-5
-        assert (t1[0, embedding_dim // 2 :] - 1).abs().sum() < 1e-5
-
-        # last element of each vector should be one
-        assert (t1[:, -1] - 1).abs().sum() < 1e-5
-
-        # For large embeddings (e.g.
128) the frequency of every vector is higher - # than the previous one which means that the gradients of later vectors are - # ALWAYS higher than the previous ones - grad_mean = np.abs(np.gradient(t1, axis=-1)).mean(axis=1) - - prev_grad = 0.0 - for grad in grad_mean: - assert grad > prev_grad - prev_grad = grad - - def test_timestep_defaults(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim) - t2 = get_timestep_embedding( - timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, max_period=10_000 - ) - - assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3) - - def test_timestep_flip_sin_cos(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=True) - t1 = torch.cat([t1[:, embedding_dim // 2 :], t1[:, : embedding_dim // 2]], dim=-1) - - t2 = get_timestep_embedding(timesteps, embedding_dim, flip_sin_to_cos=False) - - assert torch.allclose(t1.cpu(), t2.cpu(), 1e-3) - - def test_timestep_downscale_freq_shift(self): - embedding_dim = 16 - timesteps = torch.arange(10) - - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0) - t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1) - - # get cosine half (vectors that are wrapped into cosine) - cosine_half = (t1 - t2)[:, embedding_dim // 2 :] - - # cosine needs to be negative - assert (np.abs((cosine_half <= 0).numpy()) - 1).sum() < 1e-5 - - def test_sinoid_embeddings_hardcoded(self): - embedding_dim = 64 - timesteps = torch.arange(128) - - # standard unet, score_vde - t1 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=1, flip_sin_to_cos=False) - # glide, ldm - t2 = get_timestep_embedding(timesteps, embedding_dim, downscale_freq_shift=0, flip_sin_to_cos=True) - # grad-tts - t3 = get_timestep_embedding(timesteps, embedding_dim, scale=1000) - - assert torch.allclose( - t1[23:26, 47:50].flatten().cpu(), - torch.tensor([0.9646, 0.9804, 0.9892, 0.9615, 0.9787, 0.9882, 0.9582, 0.9769, 0.9872]), - 1e-3, - ) - assert torch.allclose( - t2[23:26, 47:50].flatten().cpu(), - torch.tensor([0.3019, 0.2280, 0.1716, 0.3146, 0.2377, 0.1790, 0.3272, 0.2474, 0.1864]), - 1e-3, - ) - assert torch.allclose( - t3[23:26, 47:50].flatten().cpu(), - torch.tensor([-0.9801, -0.9464, -0.9349, -0.3952, 0.8887, -0.9709, 0.5299, -0.2853, -0.9927]), - 1e-3, - ) - - -class Upsample2DBlockTests(unittest.TestCase): - def test_upsample_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=False) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.2173, -1.2079, -1.2079, 0.2952, 1.1254, 1.1254, 0.2952, 1.1254, 1.1254]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_conv(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=True) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.7145, 1.3773, 0.3492, 0.8448, 1.0839, -0.3341, 0.5956, 0.1250, -0.4841]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_conv_out_dim(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 
32) - upsample = Upsample2D(channels=32, use_conv=True, out_channels=64) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 64, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.2703, 0.1656, -0.2538, -0.0553, -0.2984, 0.1044, 0.1155, 0.2579, 0.7755]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_upsample_with_transpose(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 32, 32) - upsample = Upsample2D(channels=32, use_conv=False, use_conv_transpose=True) - with torch.no_grad(): - upsampled = upsample(sample) - - assert upsampled.shape == (1, 32, 64, 64) - output_slice = upsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.3028, -0.1582, 0.0071, 0.0350, -0.4799, -0.1139, 0.1056, -0.1153, -0.1046]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class Downsample2DBlockTests(unittest.TestCase): - def test_downsample_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=False) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.0513, -0.3889, 0.0640, 0.0836, -0.5460, -0.0341, -0.0169, -0.6967, 0.1179]) - max_diff = (output_slice.flatten() - expected_slice).abs().sum().item() - assert max_diff <= 1e-3 - # assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-1) - - def test_downsample_with_conv(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913], - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_downsample_with_conv_pad1(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True, padding=1) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 32, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([0.9267, 0.5878, 0.3337, 1.2321, -0.1191, -0.3984, -0.7532, -0.0715, -0.3913]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_downsample_with_conv_out_dim(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64) - downsample = Downsample2D(channels=32, use_conv=True, out_channels=16) - with torch.no_grad(): - downsampled = downsample(sample) - - assert downsampled.shape == (1, 16, 32, 32) - output_slice = downsampled[0, -1, -3:, -3:] - expected_slice = torch.tensor([-0.6586, 0.5985, 0.0721, 0.1256, -0.1492, 0.4436, -0.2544, 0.5021, 1.1522]) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class ResnetBlock2DTests(unittest.TestCase): - def test_resnet_default(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, 
-3:, -3:] - expected_slice = torch.tensor( - [-1.9010, -0.2974, -0.8245, -1.3533, 0.8742, -0.9645, -2.0584, 1.3387, -0.4746], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_use_in_shortcut(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, use_in_shortcut=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 64, 64) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [0.2226, -1.0791, -0.1629, 0.3659, -0.2889, -1.2376, 0.0582, 0.9206, 0.0044], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_up(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, up=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 128, 128) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [1.2130, -0.8753, -0.9027, 1.5783, -0.5362, -0.5001, 1.0726, -0.7732, -0.4182], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_down(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_kernel_fir(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="fir", down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.0934, -0.5729, 0.0909, -0.2710, -0.5044, 0.0243, -0.0665, -0.5267, -0.3136], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_resnet_with_kernel_sde_vp(self): - torch.manual_seed(0) - sample = torch.randn(1, 32, 64, 64).to(torch_device) - temb = torch.randn(1, 128).to(torch_device) - resnet_block = ResnetBlock2D(in_channels=32, temb_channels=128, kernel="sde_vp", down=True).to(torch_device) - with torch.no_grad(): - output_tensor = resnet_block(sample, temb) - - assert output_tensor.shape == (1, 32, 32, 32) - output_slice = output_tensor[0, -1, -3:, -3:] - expected_slice = torch.tensor( - [-0.3002, -0.7135, 0.1359, 0.0561, -0.7935, 0.0113, -0.1766, -0.6714, -0.0436], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class AttentionBlockTests(unittest.TestCase): - @unittest.skipIf( - torch_device ==
"mps", "Matmul crashes on MPS, see https://github.com/pytorch/pytorch/issues/84039" - ) - def test_attention_block_default(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - attentionBlock = AttentionBlock( - channels=32, - num_head_channels=1, - rescale_output_factor=1.0, - eps=1e-6, - norm_num_groups=32, - ).to(torch_device) - with torch.no_grad(): - attention_scores = attentionBlock(sample) - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.4975, -0.0038, -0.7847, -1.4567, 1.1220, -0.8962, -1.7394, 1.1319, -0.5427], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_attention_block_sd(self): - # This version uses SD params and is compatible with mps - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 512, 64, 64).to(torch_device) - attentionBlock = AttentionBlock( - channels=512, - rescale_output_factor=1.0, - eps=1e-6, - norm_num_groups=32, - ).to(torch_device) - with torch.no_grad(): - attention_scores = attentionBlock(sample) - - assert attention_scores.shape == (1, 512, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-0.6621, -0.0156, -3.2766, 0.8025, -0.8609, 0.2820, 0.0905, -1.1179, -3.2126], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - -class Transformer2DModelTests(unittest.TestCase): - def test_spatial_transformer_default(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - in_channels=32, - num_attention_heads=1, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=None, - ).to(torch_device) - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.9455, -0.0066, -1.3933, -1.5878, 0.5325, -0.6486, -1.8648, 0.7515, -0.9689], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_cross_attention_dim(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 64, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - in_channels=64, - num_attention_heads=2, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=64, - ).to(torch_device) - with torch.no_grad(): - context = torch.randn(1, 4, 64).to(torch_device) - attention_scores = spatial_transformer_block(sample, context).sample - - assert attention_scores.shape == (1, 64, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-0.2555, -0.8877, -2.4739, -2.2251, 1.2714, 0.0807, -0.4161, -1.6408, -0.0471], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_timestep(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_embeds_ada_norm = 5 - - sample = torch.randn(1, 64, 64, 64).to(torch_device) - spatial_transformer_block = Transformer2DModel( - 
in_channels=64, - num_attention_heads=2, - attention_head_dim=32, - dropout=0.0, - cross_attention_dim=64, - num_embeds_ada_norm=num_embeds_ada_norm, - ).to(torch_device) - with torch.no_grad(): - timestep_1 = torch.tensor(1, dtype=torch.long).to(torch_device) - timestep_2 = torch.tensor(2, dtype=torch.long).to(torch_device) - attention_scores_1 = spatial_transformer_block(sample, timestep=timestep_1).sample - attention_scores_2 = spatial_transformer_block(sample, timestep=timestep_2).sample - - assert attention_scores_1.shape == (1, 64, 64, 64) - assert attention_scores_2.shape == (1, 64, 64, 64) - - output_slice_1 = attention_scores_1[0, -1, -3:, -3:] - output_slice_2 = attention_scores_2[0, -1, -3:, -3:] - - expected_slice_1 = torch.tensor( - [-0.1874, -0.9704, -1.4290, -1.3357, 1.5138, 0.3036, -0.0976, -1.1667, 0.1283], device=torch_device - ) - expected_slice_2 = torch.tensor( - [-0.3493, -1.0924, -1.6161, -1.5016, 1.4245, 0.1367, -0.2526, -1.3109, -0.0547], device=torch_device - ) - - assert torch.allclose(output_slice_1.flatten(), expected_slice_1, atol=1e-3) - assert torch.allclose(output_slice_2.flatten(), expected_slice_2, atol=1e-3) - - def test_spatial_transformer_dropout(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - sample = torch.randn(1, 32, 64, 64).to(torch_device) - spatial_transformer_block = ( - Transformer2DModel( - in_channels=32, - num_attention_heads=2, - attention_head_dim=16, - dropout=0.3, - cross_attention_dim=None, - ) - .to(torch_device) - .eval() - ) - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, 32, 64, 64) - output_slice = attention_scores[0, -1, -3:, -3:] - - expected_slice = torch.tensor( - [-1.9380, -0.0083, -1.3771, -1.5819, 0.5209, -0.6441, -1.8545, 0.7563, -0.9615], device=torch_device - ) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - @unittest.skipIf(torch_device == "mps", "MPS does not support float64") - def test_spatial_transformer_discrete(self): - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - num_embed = 5 - - sample = torch.randint(0, num_embed, (1, 32)).to(torch_device) - spatial_transformer_block = ( - Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - num_vector_embeds=num_embed, - sample_size=16, - ) - .to(torch_device) - .eval() - ) - - with torch.no_grad(): - attention_scores = spatial_transformer_block(sample).sample - - assert attention_scores.shape == (1, num_embed - 1, 32) - - output_slice = attention_scores[0, -2:, -3:] - - expected_slice = torch.tensor([-1.7648, -1.0241, -2.0985, -1.8035, -1.6404, -1.2098], device=torch_device) - assert torch.allclose(output_slice.flatten(), expected_slice, atol=1e-3) - - def test_spatial_transformer_default_norm_layers(self): - spatial_transformer_block = Transformer2DModel(num_attention_heads=1, attention_head_dim=32, in_channels=32) - - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == nn.LayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == nn.LayerNorm - - def test_spatial_transformer_ada_norm_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - num_embeds_ada_norm=5, - ) - - assert spatial_transformer_block.transformer_blocks[0].norm1.__class__ == AdaLayerNorm - assert spatial_transformer_block.transformer_blocks[0].norm3.__class__ == 
nn.LayerNorm - - def test_spatial_transformer_default_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - ) - - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == GEGLU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear - - dim = 32 - inner_dim = 128 - - # First dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim - # NOTE: inner_dim * 2 because GEGLU - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim * 2 - - # Second dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim - - def test_spatial_transformer_geglu_approx_ff_layers(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, - attention_head_dim=32, - in_channels=32, - activation_fn="geglu-approximate", - ) - - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].__class__ == ApproximateGELU - assert spatial_transformer_block.transformer_blocks[0].ff.net[1].__class__ == nn.Dropout - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].__class__ == nn.Linear - - dim = 32 - inner_dim = 128 - - # First dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.in_features == dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[0].proj.out_features == inner_dim - - # Second dimension change - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].in_features == inner_dim - assert spatial_transformer_block.transformer_blocks[0].ff.net[2].out_features == dim - - def test_spatial_transformer_attention_bias(self): - spatial_transformer_block = Transformer2DModel( - num_attention_heads=1, attention_head_dim=32, in_channels=32, attention_bias=True - ) - - assert spatial_transformer_block.transformer_blocks[0].attn1.to_q.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_k.bias is not None - assert spatial_transformer_block.transformer_blocks[0].attn1.to_v.bias is not None diff --git a/diffusers/tests/test_modeling_common.py b/diffusers/tests/test_modeling_common.py deleted file mode 100644 index 40aba3b24967683b2e64b53402d9f8bdc93be2a8..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_modeling_common.py +++ /dev/null @@ -1,447 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
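The feed-forward assertions above rest on one detail of GEGLU: a single linear layer projects from `dim` to `inner_dim * 2`, and the result is split into hidden states and a gate, with the GELU of the gate multiplying the hidden states. A minimal sketch of that gating, assuming plain PyTorch (a hypothetical standalone module for illustration, not the library's exact `GEGLU` class):

import torch
import torch.nn.functional as F
from torch import nn


class GEGLUSketch(nn.Module):
    # One matmul produces both halves, which is why the tests expect
    # ff.net[0].proj.out_features == inner_dim * 2.
    def __init__(self, dim: int, inner_dim: int):
        super().__init__()
        self.proj = nn.Linear(dim, inner_dim * 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)  # split the doubled projection
        return hidden * F.gelu(gate)

With `dim = 32` and `inner_dim = 128` as in the tests, `proj.in_features` is 32 and `proj.out_features` is 256, which is exactly what the shape checks on `ff.net[0]` assert.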
- -import inspect -import tempfile -import unittest -import unittest.mock as mock -from typing import Dict, List, Tuple - -import numpy as np -import requests_mock -import torch -from requests.exceptions import HTTPError - -from diffusers.models import UNet2DConditionModel -from diffusers.training_utils import EMAModel -from diffusers.utils import torch_device -from diffusers.utils.testing_utils import require_torch_gpu - - -class ModelUtilsTest(unittest.TestCase): - def tearDown(self): - super().tearDown() - - import diffusers - - diffusers.utils.import_utils._safetensors_available = True - - def test_accelerate_loading_error_message(self): - with self.assertRaises(ValueError) as error_context: - UNet2DConditionModel.from_pretrained("hf-internal-testing/stable-diffusion-broken", subfolder="unet") - - # make sure that error message states what keys are missing - assert "conv_out.bias" in str(error_context.exception) - - def test_cached_files_are_used_when_no_internet(self): - # A mock response for an HTTP head request to emulate server down - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - - # Download this model to make sure it's in the cache. - orig_model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet" - ) - - # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("requests.request", return_value=response_mock): - # Download this model to make sure it's in the cache. - model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", local_files_only=True - ) - - for p1, p2 in zip(orig_model.parameters(), model.parameters()): - if p1.data.ne(p2.data).sum() > 0: - assert False, "Parameters not the same!" - - def test_one_request_upon_cached(self): - # TODO: For some reason this test fails on MPS where no HEAD call is made. 
- if torch_device == "mps": - return - - import diffusers - - diffusers.utils.import_utils._safetensors_available = False - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 2, "2 HEAD requests one for config, one for model" - assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model" - - with requests_mock.mock(real_http=True) as m: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", subfolder="unet", cache_dir=tmpdirname - ) - - cache_requests = [r.method for r in m.request_history] - assert ( - "HEAD" == cache_requests[0] and len(cache_requests) == 1 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - diffusers.utils.import_utils._safetensors_available = True - - def test_weight_overwrite(self): - with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context: - UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - ) - - # make sure that error message states what keys are missing - assert "Cannot load" in str(error_context.exception) - - with tempfile.TemporaryDirectory() as tmpdirname: - model = UNet2DConditionModel.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", - subfolder="unet", - cache_dir=tmpdirname, - in_channels=9, - low_cpu_mem_usage=False, - ignore_mismatched_sizes=True, - ) - - assert model.config.in_channels == 9 - - -class ModelTesterMixin: - def test_from_save_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.to(torch_device) - model.eval() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - new_model.to(torch_device) - - with torch.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - - new_image = new_model(**inputs_dict) - - if isinstance(new_image, dict): - new_image = new_image.sample - - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - - def test_from_save_pretrained_variant(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - if hasattr(model, "set_default_attn_processor"): - model.set_default_attn_processor() - model.to(torch_device) - model.eval() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname, variant="fp16") - new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16") - if hasattr(new_model, "set_default_attn_processor"): - new_model.set_default_attn_processor() - - # non-variant cannot be loaded - with self.assertRaises(OSError) as error_context: - self.model_class.from_pretrained(tmpdirname) - - # make sure that error message states what keys are missing - assert "Error no file named 
diffusion_pytorch_model.bin found in directory" in str(error_context.exception) - - new_model.to(torch_device) - - with torch.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image.sample - - new_image = new_model(**inputs_dict) - - if isinstance(new_image, dict): - new_image = new_image.sample - - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - - @require_torch_gpu - def test_from_save_pretrained_dynamo(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model = torch.compile(model) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - - assert new_model.__class__ == self.model_class - - def test_from_save_pretrained_dtype(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - for dtype in [torch.float32, torch.float16, torch.bfloat16]: - if torch_device == "mps" and dtype == torch.bfloat16: - continue - with tempfile.TemporaryDirectory() as tmpdirname: - model.to(dtype) - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype) - assert new_model.dtype == dtype - new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype) - assert new_model.dtype == dtype - - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - first = model(**inputs_dict) - if isinstance(first, dict): - first = first.sample - - second = model(**inputs_dict) - if isinstance(second, dict): - second = second.sample - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 16 - init_dict["block_out_channels"] = (16, 32) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_signature(self): - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic 
- arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["sample", "timestep"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model_from_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - # test if the model can be loaded from the config - # and has all the expected shape - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - new_model.eval() - - # check if all parameters shape are the same - for param_name in model.state_dict().keys(): - param_1 = model.state_dict()[param_name] - param_2 = new_model.state_dict()[param_name] - self.assertEqual(param_1.shape, param_2.shape) - - with torch.no_grad(): - output_1 = model(**inputs_dict) - - if isinstance(output_1, dict): - output_1 = output_1.sample - - output_2 = new_model(**inputs_dict) - - if isinstance(output_2, dict): - output_2 = output_2.sample - - self.assertEqual(output_1.shape, output_2.shape) - - @unittest.skipIf(torch_device == "mps", "Training is not supported in mps") - def test_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - - @unittest.skipIf(torch_device == "mps", "Training is not supported in mps") - def test_ema_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - ema_model = EMAModel(model.parameters()) - - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - ema_model.step(model.parameters()) - - def test_outputs_equivalence(self): - def set_nan_tensor_to_zero(t): - # Temporary fallback until `aten::_index_put_impl_` is implemented in mps - # Track progress in https://github.com/pytorch/pytorch/issues/77764 - device = t.device - if device.type == "mps": - t = t.to("cpu") - t[t != t] = 0 - return t.to(device) - - def recursive_check(tuple_object, dict_object): - if isinstance(tuple_object, (List, Tuple)): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif isinstance(tuple_object, Dict): - for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()): - recursive_check(tuple_iterable_value, dict_iterable_value) - elif tuple_object is None: - return - else: - self.assertTrue( - torch.allclose( - set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5 - ), - msg=( - "Tuple and dict output are not equal. Difference:" - f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:" - f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has" - f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}."
- ), - ) - - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs_dict = model(**inputs_dict) - outputs_tuple = model(**inputs_dict, return_dict=False) - - recursive_check(outputs_tuple, outputs_dict) - - @unittest.skipIf(torch_device == "mps", "Gradient checkpointing skipped on MPS") - def test_enable_disable_gradient_checkpointing(self): - if not self.model_class._supports_gradient_checkpointing: - return # Skip test if model does not support gradient checkpointing - - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - # at init model should have gradient checkpointing disabled - model = self.model_class(**init_dict) - self.assertFalse(model.is_gradient_checkpointing) - - # check enable works - model.enable_gradient_checkpointing() - self.assertTrue(model.is_gradient_checkpointing) - - # check disable works - model.disable_gradient_checkpointing() - self.assertFalse(model.is_gradient_checkpointing) - - def test_deprecated_kwargs(self): - has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters - has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are" - " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs` argument to" - f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument" - " from `_deprecated_kwargs = []`" - ) diff --git a/diffusers/tests/test_modeling_common_flax.py b/diffusers/tests/test_modeling_common_flax.py deleted file mode 100644 index 8945aed7c93fb1e664c7b6d799f7e0a96525b1a2..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_modeling_common_flax.py +++ /dev/null @@ -1,66 +0,0 @@ -import inspect - -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax - - -if is_flax_available(): - import jax - - -@require_flax -class FlaxModelTesterMixin: - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"]) - jax.lax.stop_gradient(variables) - - output = model.apply(variables, inputs_dict["sample"]) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - init_dict["norm_num_groups"] = 16 - init_dict["block_out_channels"] = (16, 32) - - model = self.model_class(**init_dict) - variables = model.init(inputs_dict["prng_key"], inputs_dict["sample"]) - jax.lax.stop_gradient(variables) - - output = model.apply(variables, inputs_dict["sample"]) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_deprecated_kwargs(self): - has_kwarg_in_model_class = "kwargs" in inspect.signature(self.model_class.__init__).parameters - has_deprecated_kwarg = len(self.model_class._deprecated_kwargs) > 0 - - if has_kwarg_in_model_class and not has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} has `**kwargs` in its __init__ method but has not defined any deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. Make sure to either remove `**kwargs` if there are" - " no deprecated arguments or add the deprecated argument with `_deprecated_kwargs =" - " []`" - ) - - if not has_kwarg_in_model_class and has_deprecated_kwarg: - raise ValueError( - f"{self.model_class} doesn't have `**kwargs` in its __init__ method but has defined deprecated kwargs" - " under the `_deprecated_kwargs` class attribute. 
Make sure to either add the `**kwargs` argument to" - f" {self.model_class}.__init__ if there are deprecated arguments or remove the deprecated argument" - " from `_deprecated_kwargs = []`" - ) diff --git a/diffusers/tests/test_outputs.py b/diffusers/tests/test_outputs.py deleted file mode 100644 index 50cbd1d54ee403f2b8e79c8ada629b6b97b1be66..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_outputs.py +++ /dev/null @@ -1,60 +0,0 @@ -import unittest -from dataclasses import dataclass -from typing import List, Union - -import numpy as np -import PIL.Image - -from diffusers.utils.outputs import BaseOutput - - -@dataclass -class CustomOutput(BaseOutput): - images: Union[List[PIL.Image.Image], np.ndarray] - - -class ConfigTester(unittest.TestCase): - def test_outputs_single_attribute(self): - outputs = CustomOutput(images=np.random.rand(1, 3, 4, 4)) - - # check every way of getting the attribute - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - - # test with a non-tensor attribute - outputs = CustomOutput(images=[PIL.Image.new("RGB", (4, 4))]) - - # check every way of getting the attribute - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) - - def test_outputs_dict_init(self): - # test output reinitialization with a `dict` for compatibility with `accelerate` - outputs = CustomOutput({"images": np.random.rand(1, 3, 4, 4)}) - - # check every way of getting the attribute - assert isinstance(outputs.images, np.ndarray) - assert outputs.images.shape == (1, 3, 4, 4) - assert isinstance(outputs["images"], np.ndarray) - assert outputs["images"].shape == (1, 3, 4, 4) - assert isinstance(outputs[0], np.ndarray) - assert outputs[0].shape == (1, 3, 4, 4) - - # test with a non-tensor attribute - outputs = CustomOutput({"images": [PIL.Image.new("RGB", (4, 4))]}) - - # check every way of getting the attribute - assert isinstance(outputs.images, list) - assert isinstance(outputs.images[0], PIL.Image.Image) - assert isinstance(outputs["images"], list) - assert isinstance(outputs["images"][0], PIL.Image.Image) - assert isinstance(outputs[0], list) - assert isinstance(outputs[0][0], PIL.Image.Image) diff --git a/diffusers/tests/test_pipelines.py b/diffusers/tests/test_pipelines.py deleted file mode 100644 index 0525eaca50daa74b6118e9669d36451d761a42e8..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_pipelines.py +++ /dev/null @@ -1,1300 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
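The output tests above check three equivalent access paths on a `BaseOutput` subclass: attribute (`outputs.images`), key (`outputs["images"]`), and position (`outputs[0]`). A rough sketch of how a dataclass backed by an ordered dict can serve all three (simplified and assumed for illustration; the real `BaseOutput` in `diffusers.utils.outputs` also handles dict re-initialization, as exercised by `test_outputs_dict_init`, plus further edge cases):

from collections import OrderedDict
from dataclasses import dataclass, fields
from typing import Optional


class MiniBaseOutput(OrderedDict):
    # Runs after the generated dataclass __init__: mirror the set fields into the
    # dict so key access and iteration see the same data as attribute access.
    def __post_init__(self):
        for f in fields(self):
            value = getattr(self, f.name)
            if value is not None:
                self[f.name] = value

    def __getitem__(self, key):
        if isinstance(key, str):
            return super().__getitem__(key)  # key access
        return self.to_tuple()[key]  # positional access

    def to_tuple(self):
        return tuple(self[k] for k in self.keys())


@dataclass
class MiniImageOutput(MiniBaseOutput):
    images: Optional[list] = None


out = MiniImageOutput(images=[1, 2, 3])
assert out.images is out["images"] is out[0]  # all three paths return the same object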
- -import gc -import json -import os -import random -import shutil -import sys -import tempfile -import unittest -import unittest.mock as mock - -import numpy as np -import PIL -import requests_mock -import safetensors.torch -import torch -from parameterized import parameterized -from PIL import Image -from requests.exceptions import HTTPError -from transformers import CLIPImageProcessor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer - -from diffusers import ( - AutoencoderKL, - DDIMPipeline, - DDIMScheduler, - DDPMPipeline, - DDPMScheduler, - DiffusionPipeline, - DPMSolverMultistepScheduler, - EulerAncestralDiscreteScheduler, - EulerDiscreteScheduler, - LMSDiscreteScheduler, - PNDMScheduler, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipelineLegacy, - StableDiffusionPipeline, - UNet2DConditionModel, - UNet2DModel, - UniPCMultistepScheduler, - logging, -) -from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import ( - CONFIG_NAME, - WEIGHTS_NAME, - floats_tensor, - is_flax_available, - nightly, - require_torch_2, - slow, - torch_device, -) -from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, load_numpy, require_compel, require_torch_gpu - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class DownloadTests(unittest.TestCase): - def test_one_request_upon_cached(self): - # TODO: For some reason this test fails on MPS where no HEAD call is made. - if torch_device == "mps": - return - - with tempfile.TemporaryDirectory() as tmpdirname: - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - download_requests = [r.method for r in m.request_history] - assert download_requests.count("HEAD") == 15, "15 calls to files" - assert download_requests.count("GET") == 17, "15 calls to files + model_info + model_index.json" - assert ( - len(download_requests) == 32 - ), "2 calls per file (15 files) + send_telemetry, model_info and model_index.json" - - with requests_mock.mock(real_http=True) as m: - DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - cache_requests = [r.method for r in m.request_history] - assert cache_requests.count("HEAD") == 1, "model_index.json is only HEAD" - assert cache_requests.count("GET") == 1, "model info is only GET" - assert ( - len(cache_requests) == 2 - ), "We should call only `model_info` to check for _commit hash and `send_telemetry`" - - def test_download_only_pytorch(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a flax file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f.endswith(".msgpack") for f in files) - # We need to never convert this tiny model to safetensors for this test to pass - assert not any(f.endswith(".safetensors") for f in files) - - def test_force_safetensors_error(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - with 
self.assertRaises(EnvironmentError): - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-no-safetensors", - safety_checker=None, - cache_dir=tmpdirname, - use_safetensors=True, - ) - - def test_returned_cached_folder(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - _, local_path = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, return_cached_folder=True - ) - pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) - - pipe = pipe.to(torch_device) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - generator = torch.manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_download_safetensors(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - tmpdirname = DiffusionPipeline.download( - "hf-internal-testing/tiny-stable-diffusion-pipe-safetensors", - safety_checker=None, - cache_dir=tmpdirname, - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a pytorch file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_flax_model.msgpack - assert not any(f.endswith(".bin") for f in files) - - def test_download_no_safety_checker(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - pipe_2 = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - pipe_2 = pipe_2.to(torch_device) - generator = torch.manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_load_no_safety_checker_explicit_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_load_no_safety_checker_default_locally(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch") - pipe = pipe.to(torch_device) - - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - with tempfile.TemporaryDirectory() as tmpdirname: - pipe.save_pretrained(tmpdirname) - pipe_2 = 
StableDiffusionPipeline.from_pretrained(tmpdirname) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - - def test_cached_files_are_used_when_no_internet(self): - # A mock response for an HTTP head request to emulate server down - response_mock = mock.Mock() - response_mock.status_code = 500 - response_mock.headers = {} - response_mock.raise_for_status.side_effect = HTTPError - response_mock.json.return_value = {} - - # Download this model to make sure it's in the cache. - orig_pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - orig_comps = {k: v for k, v in orig_pipe.components.items() if hasattr(v, "parameters")} - - # Under the mock environment we get a 500 error when trying to reach the model. - with mock.patch("requests.request", return_value=response_mock): - # Download this model to make sure it's in the cache. - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, local_files_only=True - ) - comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} - - for m1, m2 in zip(orig_comps.values(), comps.values()): - for p1, p2 in zip(m1.parameters(), m2.parameters()): - if p1.data.ne(p2.data).sum() > 0: - assert False, "Parameters not the same!" - - def test_download_from_variant_folder(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - assert not any(f.endswith(other_format) for f in files) - # no variants - assert not any(len(f.split(".")) == 3 for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_all(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "fp16" - - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a non-variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # unet, vae, text_encoder, safety_checker - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 4 - # all checkpoints should have variant ending - assert 
not any(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) - assert not any(f.endswith(other_format) for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_variant_partly(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - - other_format = ".bin" if safe_avail else ".safetensors" - this_format = ".safetensors" if safe_avail else ".bin" - variant = "no_ema" - - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-all-variants", cache_dir=tmpdirname, variant=variant - ) - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - unet_files = os.listdir(os.path.join(tmpdirname, "unet")) - - # Some of the downloaded files should be a non-variant file, check: - # https://huggingface.co/hf-internal-testing/stable-diffusion-all-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # only unet has "no_ema" variant - assert f"diffusion_pytorch_model.{variant}{this_format}" in unet_files - assert len([f for f in files if f.endswith(f"{variant}{this_format}")]) == 1 - # vae, safety_checker and text_encoder should have no variant - assert sum(f.endswith(this_format) and not f.endswith(f"{variant}{this_format}") for f in files) == 3 - assert not any(f.endswith(other_format) for f in files) - - diffusers.utils.import_utils._safetensors_available = True - - def test_download_broken_variant(self): - for safe_avail in [False, True]: - import diffusers - - diffusers.utils.import_utils._safetensors_available = safe_avail - # text encoder is missing no variant and "no_ema" variant weights, so the following can't work - for variant in [None, "no_ema"]: - with self.assertRaises(OSError) as error_context: - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/stable-diffusion-broken-variants", - cache_dir=tmpdirname, - variant=variant, - ) - - assert "Error no file name" in str(error_context.exception) - - # text encoder has fp16 variants so we can load it - with tempfile.TemporaryDirectory() as tmpdirname: - tmpdirname = StableDiffusionPipeline.download( - "hf-internal-testing/stable-diffusion-broken-variants", cache_dir=tmpdirname, variant="fp16" - ) - - all_root_files = [t[-1] for t in os.walk(tmpdirname)] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a non-variant file even if we have some here: - # https://huggingface.co/hf-internal-testing/stable-diffusion-broken-variants/tree/main/unet - assert len(files) == 15, f"We should only download 15 files, not {len(files)}" - # only unet has "no_ema" variant - - diffusers.utils.import_utils._safetensors_available = True - - def test_text_inversion_download(self): - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - pipe = pipe.to(torch_device) - - num_tokens = len(pipe.tokenizer) - - # single token load local - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<*>": torch.ones((32,))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname) - - token = pipe.tokenizer.convert_tokens_to_ids("<*>") - assert token == num_tokens, 
"Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 - assert pipe._maybe_convert_prompt("<*>", pipe.tokenizer) == "<*>" - - prompt = "hey <*>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # single token load local with weight name - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<**>": 2 * torch.ones((1, 32))} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="learned_embeds.bin") - - token = pipe.tokenizer.convert_tokens_to_ids("<**>") - assert token == num_tokens + 1, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 - assert pipe._maybe_convert_prompt("<**>", pipe.tokenizer) == "<**>" - - prompt = "hey <**>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load - with tempfile.TemporaryDirectory() as tmpdirname: - ten = {"<***>": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))])} - torch.save(ten, os.path.join(tmpdirname, "learned_embeds.bin")) - - pipe.load_textual_inversion(tmpdirname) - - token = pipe.tokenizer.convert_tokens_to_ids("<***>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<***>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<***>_2") - - assert token == num_tokens + 2, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 3, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 4, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<***>", pipe.tokenizer) == "<***><***>_1<***>_2" - - prompt = "hey <***>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - # multi token load a1111 - with tempfile.TemporaryDirectory() as tmpdirname: - ten = { - "string_to_param": { - "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))]) - }, - "name": "<****>", - } - torch.save(ten, os.path.join(tmpdirname, "a1111.bin")) - - pipe.load_textual_inversion(tmpdirname, weight_name="a1111.bin") - - token = pipe.tokenizer.convert_tokens_to_ids("<****>") - token_1 = pipe.tokenizer.convert_tokens_to_ids("<****>_1") - token_2 = pipe.tokenizer.convert_tokens_to_ids("<****>_2") - - assert token == num_tokens + 5, "Added token must be at spot `num_tokens`" - assert token_1 == num_tokens + 6, "Added token must be at spot `num_tokens`" - assert token_2 == num_tokens + 7, "Added token must be at spot `num_tokens`" - assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 - assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 - assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 - assert pipe._maybe_convert_prompt("<****>", pipe.tokenizer) == "<****><****>_1<****>_2" - - prompt = "hey <****>" - out = pipe(prompt, num_inference_steps=1, output_type="numpy").images - assert out.shape == (1, 128, 128, 3) - - -class CustomPipelineTests(unittest.TestCase): - def test_load_custom_pipeline(self): - pipeline = 
DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline" - ) - pipeline = pipeline.to(torch_device) - # NOTE that `"CustomPipeline"` is not a class that is defined in this library, but solely on the Hub - # under https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L24 - assert pipeline.__class__.__name__ == "CustomPipeline" - - def test_load_custom_github(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="main" - ) - - # make sure that on "main" pipeline gives only ones because of: https://github.com/huggingface/diffusers/pull/1690 - with torch.no_grad(): - output = pipeline() - - assert output.numel() == output.sum() - - # hack since Python doesn't like overwriting modules: https://stackoverflow.com/questions/3105801/unload-a-module-in-python - # Could in the future work with hashes instead. - del sys.modules["diffusers_modules.git.one_step_unet"] - - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="one_step_unet", custom_revision="0.10.2" - ) - with torch.no_grad(): - output = pipeline() - - assert output.numel() != output.sum() - - assert pipeline.__class__.__name__ == "UnetSchedulerOneForwardPipeline" - - def test_run_custom_pipeline(self): - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline" - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert images[0].shape == (1, 32, 32, 3) - - # compare output to https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py#L102 - assert output_str == "This is a test" - - def test_local_custom_pipeline_repo(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102 - assert output_str == "This is a local test" - - def test_local_custom_pipeline_file(self): - local_custom_pipeline_path = get_tests_dir("fixtures/custom_pipeline") - local_custom_pipeline_path = os.path.join(local_custom_pipeline_path, "what_ever.py") - pipeline = DiffusionPipeline.from_pretrained( - "google/ddpm-cifar10-32", custom_pipeline=local_custom_pipeline_path - ) - pipeline = pipeline.to(torch_device) - images, output_str = pipeline(num_inference_steps=2, output_type="np") - - assert pipeline.__class__.__name__ == "CustomLocalPipeline" - assert images[0].shape == (1, 32, 32, 3) - # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102 - assert output_str == "This is a local test" - - @slow - @require_torch_gpu - def test_download_from_git(self): - clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" - - feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id) - clip_model = CLIPModel.from_pretrained(clip_model_id, torch_dtype=torch.float16) - - pipeline = DiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", - 
custom_pipeline="clip_guided_stable_diffusion",
-            clip_model=clip_model,
-            feature_extractor=feature_extractor,
-            torch_dtype=torch.float16,
-        )
-        pipeline.enable_attention_slicing()
-        pipeline = pipeline.to(torch_device)
-
-        # NOTE that `"CLIPGuidedStableDiffusion"` is not a class that is defined in the PyPI package of the library, but solely in the community examples folder on GitHub under:
-        # https://github.com/huggingface/diffusers/blob/main/examples/community/clip_guided_stable_diffusion.py
-        assert pipeline.__class__.__name__ == "CLIPGuidedStableDiffusion"
-
-        image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0]
-        assert image.shape == (512, 512, 3)
-
-
-class PipelineFastTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        import diffusers
-
-        diffusers.utils.import_utils._safetensors_available = True
-
-    def dummy_image(self):
-        batch_size = 1
-        num_channels = 3
-        sizes = (32, 32)
-
-        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
-        return image
-
-    def dummy_uncond_unet(self, sample_size=32):
-        torch.manual_seed(0)
-        model = UNet2DModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=sample_size,
-            in_channels=3,
-            out_channels=3,
-            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
-            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
-        )
-        return model
-
-    def dummy_cond_unet(self, sample_size=32):
-        torch.manual_seed(0)
-        model = UNet2DConditionModel(
-            block_out_channels=(32, 64),
-            layers_per_block=2,
-            sample_size=sample_size,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=32,
-        )
-        return model
-
-    @property
-    def dummy_vae(self):
-        torch.manual_seed(0)
-        model = AutoencoderKL(
-            block_out_channels=[32, 64],
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-        )
-        return model
-
-    @property
-    def dummy_text_encoder(self):
-        torch.manual_seed(0)
-        config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=32,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        return CLIPTextModel(config)
-
-    @property
-    def dummy_extractor(self):
-        def extract(*args, **kwargs):
-            class Out:
-                def __init__(self):
-                    self.pixel_values = torch.ones([0])
-
-                def to(self, device):
-                    self.pixel_values.to(device)
-                    return self
-
-            return Out()
-
-        return extract
-
-    @parameterized.expand(
-        [
-            [DDIMScheduler, DDIMPipeline, 32],
-            [DDPMScheduler, DDPMPipeline, 32],
-            [DDIMScheduler, DDIMPipeline, (32, 64)],
-            [DDPMScheduler, DDPMPipeline, (64, 32)],
-        ]
-    )
-    def test_uncond_unet_components(self, scheduler_fn=DDPMScheduler, pipeline_fn=DDPMPipeline, sample_size=32):
-        unet = self.dummy_uncond_unet(sample_size)
-        scheduler = scheduler_fn()
-        pipeline = pipeline_fn(unet, scheduler).to(torch_device)
-
-        generator = torch.manual_seed(0)
-        out_image = pipeline(
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images
-        sample_size = (sample_size, sample_size) if isinstance(sample_size, int) else sample_size
-        assert out_image.shape == (1, *sample_size, 3)
-
-    def test_stable_diffusion_components(self):
-        """Test that components property
works correctly""" - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - image = self.dummy_image().cpu().permute(0, 2, 3, 1)[0] - init_image = Image.fromarray(np.uint8(image)).convert("RGB") - mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((32, 32)) - - # make sure here that pndm scheduler skips prk - inpaint = StableDiffusionInpaintPipelineLegacy( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ).to(torch_device) - img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device) - text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device) - - prompt = "A painting of a squirrel eating a burger" - - generator = torch.manual_seed(0) - image_inpaint = inpaint( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - mask_image=mask_image, - ).images - image_img2img = img2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - image=init_image, - ).images - image_text2img = text2img( - [prompt], - generator=generator, - num_inference_steps=2, - output_type="np", - ).images - - assert image_inpaint.shape == (1, 32, 32, 3) - assert image_img2img.shape == (1, 32, 32, 3) - assert image_text2img.shape == (1, 64, 64, 3) - - @require_torch_gpu - def test_pipe_false_offload_warn(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - sd.enable_model_cpu_offload() - - logger = logging.get_logger("diffusers.pipelines.pipeline_utils") - with CaptureLogger(logger) as cap_logger: - sd.to("cuda") - - assert "It is strongly recommended against doing so" in str(cap_logger) - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - def test_set_scheduler(self): - unet = self.dummy_cond_unet() - scheduler = PNDMScheduler(skip_prk_steps=True) - vae = self.dummy_vae - bert = self.dummy_text_encoder - tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") - - sd = StableDiffusionPipeline( - unet=unet, - scheduler=scheduler, - vae=vae, - text_encoder=bert, - tokenizer=tokenizer, - safety_checker=None, - feature_extractor=self.dummy_extractor, - ) - - sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDIMScheduler) - sd.scheduler = DDPMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, DDPMScheduler) - sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, PNDMScheduler) - sd.scheduler = LMSDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, LMSDiscreteScheduler) - sd.scheduler = EulerDiscreteScheduler.from_config(sd.scheduler.config) - assert isinstance(sd.scheduler, EulerDiscreteScheduler) - sd.scheduler = 
EulerAncestralDiscreteScheduler.from_config(sd.scheduler.config)
-        assert isinstance(sd.scheduler, EulerAncestralDiscreteScheduler)
-        sd.scheduler = DPMSolverMultistepScheduler.from_config(sd.scheduler.config)
-        assert isinstance(sd.scheduler, DPMSolverMultistepScheduler)
-
-    def test_set_scheduler_consistency(self):
-        unet = self.dummy_cond_unet()
-        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        ddim = DDIMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=pndm,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-
-        pndm_config = sd.scheduler.config
-        sd.scheduler = DDPMScheduler.from_config(pndm_config)
-        sd.scheduler = PNDMScheduler.from_config(sd.scheduler.config)
-        pndm_config_2 = sd.scheduler.config
-        pndm_config_2 = {k: v for k, v in pndm_config_2.items() if k in pndm_config}
-
-        assert dict(pndm_config) == dict(pndm_config_2)
-
-        sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=ddim,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=None,
-            feature_extractor=self.dummy_extractor,
-        )
-
-        ddim_config = sd.scheduler.config
-        sd.scheduler = LMSDiscreteScheduler.from_config(ddim_config)
-        sd.scheduler = DDIMScheduler.from_config(sd.scheduler.config)
-        ddim_config_2 = sd.scheduler.config
-        ddim_config_2 = {k: v for k, v in ddim_config_2.items() if k in ddim_config}
-
-        assert dict(ddim_config) == dict(ddim_config_2)
-
-    def test_save_safe_serialization(self):
-        pipeline = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-torch")
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipeline.save_pretrained(tmpdirname, safe_serialization=True)
-
-            # Validate that the VAE safetensors file exists and is of the correct format
-            vae_path = os.path.join(tmpdirname, "vae", "diffusion_pytorch_model.safetensors")
-            assert os.path.exists(vae_path), f"Could not find {vae_path}"
-            _ = safetensors.torch.load_file(vae_path)
-
-            # Validate that the UNet safetensors file exists and is of the correct format
-            unet_path = os.path.join(tmpdirname, "unet", "diffusion_pytorch_model.safetensors")
-            assert os.path.exists(unet_path), f"Could not find {unet_path}"
-            _ = safetensors.torch.load_file(unet_path)
-
-            # Validate that the text encoder safetensors file exists and is of the correct format
-            text_encoder_path = os.path.join(tmpdirname, "text_encoder", "model.safetensors")
-            assert os.path.exists(text_encoder_path), f"Could not find {text_encoder_path}"
-            _ = safetensors.torch.load_file(text_encoder_path)
-
-            pipeline = StableDiffusionPipeline.from_pretrained(tmpdirname)
-            assert pipeline.unet is not None
-            assert pipeline.vae is not None
-            assert pipeline.text_encoder is not None
-            assert pipeline.scheduler is not None
-            assert pipeline.feature_extractor is not None
-
-    def test_no_pytorch_download_when_doing_safetensors(self):
-        # by default we don't download
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = StableDiffusionPipeline.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname
-            )
-
-            path = os.path.join(
-                tmpdirname,
-                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
-                "snapshots",
-                "07838d72e12f9bcec1375b0482b80c1d399be843",
-                "unet",
-            )
-            # safetensors exists
-            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
-            # pytorch does not
-            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
-
-    def test_no_safetensors_download_when_doing_pytorch(self):
-        # mock diffusers safetensors not available
-        import diffusers
-
-        diffusers.utils.import_utils._safetensors_available = False
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = StableDiffusionPipeline.from_pretrained(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", cache_dir=tmpdirname
-            )
-
-            path = os.path.join(
-                tmpdirname,
-                "models--hf-internal-testing--diffusers-stable-diffusion-tiny-all",
-                "snapshots",
-                "07838d72e12f9bcec1375b0482b80c1d399be843",
-                "unet",
-            )
-            # safetensors does not exist
-            assert not os.path.exists(os.path.join(path, "diffusion_pytorch_model.safetensors"))
-            # pytorch does
-            assert os.path.exists(os.path.join(path, "diffusion_pytorch_model.bin"))
-
-        diffusers.utils.import_utils._safetensors_available = True
-
-    def test_optional_components(self):
-        unet = self.dummy_cond_unet()
-        pndm = PNDMScheduler.from_config("hf-internal-testing/tiny-stable-diffusion-torch", subfolder="scheduler")
-        vae = self.dummy_vae
-        bert = self.dummy_text_encoder
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        orig_sd = StableDiffusionPipeline(
-            unet=unet,
-            scheduler=pndm,
-            vae=vae,
-            text_encoder=bert,
-            tokenizer=tokenizer,
-            safety_checker=unet,
-            feature_extractor=self.dummy_extractor,
-        )
-        sd = orig_sd
-
-        assert sd.config.requires_safety_checker is True
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that passing None works
-            sd = StableDiffusionPipeline.from_pretrained(
-                tmpdirname, feature_extractor=None, safety_checker=None, requires_safety_checker=False
-            )
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that loading previous None works
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-            orig_sd.save_pretrained(tmpdirname)
-
-            # Test that loading without any directory works
-            shutil.rmtree(os.path.join(tmpdirname, "safety_checker"))
-            with open(os.path.join(tmpdirname, sd.config_name)) as f:
-                config = json.load(f)
-            config["safety_checker"] = [None, None]
-            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
-                json.dump(config, f)
-
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, requires_safety_checker=False)
-            sd.save_pretrained(tmpdirname)
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-            # Test that loading from deleted model index works
-            with open(os.path.join(tmpdirname, sd.config_name)) as f:
-                config = json.load(f)
-            del config["safety_checker"]
-            del config["feature_extractor"]
-            with open(os.path.join(tmpdirname, sd.config_name), "w") as f:
-                json.dump(config, f)
-
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor == (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-
-            # Test that partially loading works
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
-
-            assert sd.config.requires_safety_checker is False
-            assert sd.config.safety_checker == (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-            # Test that partially loading works
-            sd = StableDiffusionPipeline.from_pretrained(
-                tmpdirname,
-                feature_extractor=self.dummy_extractor,
-                safety_checker=unet,
-                requires_safety_checker=[True, True],
-            )
-
-            assert sd.config.requires_safety_checker == [True, True]
-            assert sd.config.safety_checker != (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            sd.save_pretrained(tmpdirname)
-            sd = StableDiffusionPipeline.from_pretrained(tmpdirname, feature_extractor=self.dummy_extractor)
-
-            assert sd.config.requires_safety_checker == [True, True]
-            assert sd.config.safety_checker != (None, None)
-            assert sd.config.feature_extractor != (None, None)
-
-
-@slow
-@require_torch_gpu
-class PipelineSlowTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_smart_download(self):
-        model_id = "hf-internal-testing/unet-pipeline-dummy"
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            _ = DiffusionPipeline.from_pretrained(model_id, cache_dir=tmpdirname, force_download=True)
-            local_repo_name = "--".join(["models"] + model_id.split("/"))
-            snapshot_dir = os.path.join(tmpdirname, local_repo_name, "snapshots")
-            snapshot_dir = os.path.join(snapshot_dir, os.listdir(snapshot_dir)[0])
-
-            # inspect all downloaded files to make sure that everything is included
-            assert os.path.isfile(os.path.join(snapshot_dir, DiffusionPipeline.config_name))
-            assert os.path.isfile(os.path.join(snapshot_dir, CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, SCHEDULER_CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, WEIGHTS_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, "scheduler", SCHEDULER_CONFIG_NAME))
-            assert os.path.isfile(os.path.join(snapshot_dir, "unet", WEIGHTS_NAME))
-            # let's make sure the super large numpy file:
-            # https://huggingface.co/hf-internal-testing/unet-pipeline-dummy/blob/main/big_array.npy
-            # is not downloaded, while all the expected files are
-            assert not os.path.isfile(os.path.join(snapshot_dir, "big_array.npy"))
-
-    def test_warning_unused_kwargs(self):
-        model_id = "hf-internal-testing/unet-pipeline-dummy"
-        logger = logging.get_logger("diffusers.pipelines")
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            with CaptureLogger(logger) as cap_logger:
-                DiffusionPipeline.from_pretrained(
-                    model_id,
-                    not_used=True,
-                    cache_dir=tmpdirname,
-                    force_download=True,
-                )
-
-        assert (
-            cap_logger.out.strip().split("\n")[-1]
-            == "Keyword arguments {'not_used': True} are not expected by DDPMPipeline and will be ignored."
-        )
-
-    def test_from_save_pretrained(self):
-        # 1.
Load models - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - new_ddpm.to(torch_device) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - @require_torch_2 - def test_from_save_pretrained_dynamo(self): - # 1. Load models - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - model = torch.compile(model) - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - new_ddpm.to(torch_device) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - def test_from_pretrained_hub(self): - model_path = "google/ddpm-cifar10-32" - - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm = ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - ddpm_from_hub = ddpm_from_hub.to(torch_device) - ddpm_from_hub.set_progress_bar_config(disable=None) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - def test_from_pretrained_hub_pass_model(self): - model_path = "google/ddpm-cifar10-32" - - scheduler = DDPMScheduler(num_train_timesteps=10) - - # pass unet into DiffusionPipeline - unet = UNet2DModel.from_pretrained(model_path) - ddpm_from_hub_custom_model = DiffusionPipeline.from_pretrained(model_path, unet=unet, scheduler=scheduler) - ddpm_from_hub_custom_model = ddpm_from_hub_custom_model.to(torch_device) - ddpm_from_hub_custom_model.set_progress_bar_config(disable=None) - - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler) - 
ddpm_from_hub = ddpm_from_hub.to(torch_device)
-        ddpm_from_hub.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        image = ddpm_from_hub_custom_model(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        generator = torch.Generator(device=torch_device).manual_seed(0)
-        new_image = ddpm_from_hub(generator=generator, num_inference_steps=5, output_type="numpy").images
-
-        assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass"
-
-    def test_output_format(self):
-        model_path = "google/ddpm-cifar10-32"
-
-        scheduler = DDIMScheduler.from_pretrained(model_path)
-        pipe = DDIMPipeline.from_pretrained(model_path, scheduler=scheduler)
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        images = pipe(output_type="numpy").images
-        assert images.shape == (1, 32, 32, 3)
-        assert isinstance(images, np.ndarray)
-
-        images = pipe(output_type="pil", num_inference_steps=4).images
-        assert isinstance(images, list)
-        assert len(images) == 1
-        assert isinstance(images[0], PIL.Image.Image)
-
-        # use PIL by default
-        images = pipe(num_inference_steps=4).images
-        assert isinstance(images, list)
-        assert isinstance(images[0], PIL.Image.Image)
-
-    def test_from_flax_from_pt(self):
-        pipe_pt = StableDiffusionPipeline.from_pretrained(
-            "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
-        )
-        pipe_pt.to(torch_device)
-
-        if not is_flax_available():
-            raise ImportError("Make sure flax is installed.")
-
-        from diffusers import FlaxStableDiffusionPipeline
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipe_pt.save_pretrained(tmpdirname)
-
-            pipe_flax, params = FlaxStableDiffusionPipeline.from_pretrained(
-                tmpdirname, safety_checker=None, from_pt=True
-            )
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            pipe_flax.save_pretrained(tmpdirname, params=params)
-            pipe_pt_2 = StableDiffusionPipeline.from_pretrained(tmpdirname, safety_checker=None, from_flax=True)
-            pipe_pt_2.to(torch_device)
-
-        prompt = "Hello"
-
-        generator = torch.manual_seed(0)
-        image_0 = pipe_pt(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images[0]
-
-        generator = torch.manual_seed(0)
-        image_1 = pipe_pt_2(
-            [prompt],
-            generator=generator,
-            num_inference_steps=2,
-            output_type="np",
-        ).images[0]
-
-        assert np.abs(image_0 - image_1).sum() < 1e-5, "Models don't give the same forward pass"
-
-    @require_compel
-    def test_weighted_prompts_compel(self):
-        from compel import Compel
-
-        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
-        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-        pipe.enable_model_cpu_offload()
-        pipe.enable_attention_slicing()
-
-        compel = Compel(tokenizer=pipe.tokenizer, text_encoder=pipe.text_encoder)
-
-        prompt = "a red cat playing with a ball{}"
-
-        prompts = [prompt.format(s) for s in ["", "++", "--"]]
-
-        prompt_embeds = compel(prompts)
-
-        generator = [torch.Generator(device="cpu").manual_seed(33) for _ in range(prompt_embeds.shape[0])]
-
-        images = pipe(
-            prompt_embeds=prompt_embeds, generator=generator, num_inference_steps=20, output_type="numpy"
-        ).images
-
-        for i, image in enumerate(images):
-            expected_image = load_numpy(
-                "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
-                f"/compel/forest_{i}.npy"
-            )
-
-            assert np.abs(image - expected_image).max() < 1e-2
-
-
-@nightly
-@require_torch_gpu
-class PipelineNightlyTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_ddpm_ddim_equality_batched(self):
-        seed = 0
-        model_id = "google/ddpm-cifar10-32"
-
-        unet = UNet2DModel.from_pretrained(model_id)
-        ddpm_scheduler = DDPMScheduler()
-        ddim_scheduler = DDIMScheduler()
-
-        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
-        ddpm.to(torch_device)
-        ddpm.set_progress_bar_config(disable=None)
-
-        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
-        ddim.to(torch_device)
-        ddim.set_progress_bar_config(disable=None)
-
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
-        ddpm_images = ddpm(batch_size=2, generator=generator, output_type="numpy").images
-
-        generator = torch.Generator(device=torch_device).manual_seed(seed)
-        ddim_images = ddim(
-            batch_size=2,
-            generator=generator,
-            num_inference_steps=1000,
-            eta=1.0,
-            output_type="numpy",
-            use_clipped_model_output=True,  # Need this to make DDIM match DDPM
-        ).images
-
-        # the values aren't exactly equal, but the images look the same visually
-        assert np.abs(ddpm_images - ddim_images).max() < 1e-1
diff --git a/diffusers/tests/test_pipelines_common.py b/diffusers/tests/test_pipelines_common.py
deleted file mode 100644
index 13fbe924c799a6b427d1ed55b4d8f6cb4dd824fb..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_pipelines_common.py
+++ /dev/null
@@ -1,590 +0,0 @@
-import contextlib
-import gc
-import inspect
-import io
-import re
-import tempfile
-import unittest
-from typing import Callable, Union
-
-import numpy as np
-import torch
-
-import diffusers
-from diffusers import DiffusionPipeline
-from diffusers.utils import logging
-from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
-from diffusers.utils.testing_utils import require_torch, torch_device
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
-@require_torch
-class PipelineTesterMixin:
-    """
-    This mixin is designed to be used with unittest.TestCase classes.
-    It provides a set of common tests for each PyTorch pipeline, e.g. saving and loading the pipeline,
-    equivalence of dict and tuple outputs, etc.
-    """
-
-    # Canonical parameters that are passed to `__call__` regardless
-    # of the type of pipeline. They are always optional and have
-    # common-sense default values.
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_images_per_prompt",
-            "generator",
-            "latents",
-            "output_type",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-
-    # set these parameters to False in the child class if the pipeline does not support the corresponding functionality
-    test_attention_slicing = True
-    test_cpu_offload = True
-    test_xformers_attention = True
-
-    def get_generator(self, seed):
-        device = torch_device if torch_device != "mps" else "cpu"
-        generator = torch.Generator(device).manual_seed(seed)
-        return generator
-
-    @property
-    def pipeline_class(self) -> Union[Callable, DiffusionPipeline]:
-        raise NotImplementedError(
-            "You need to set the attribute `pipeline_class = ClassNameOfPipeline` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_components(self):
-        raise NotImplementedError(
-            "You need to implement `get_dummy_components(self)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    def get_dummy_inputs(self, device, seed=0):
-        raise NotImplementedError(
-            "You need to implement `get_dummy_inputs(self, device, seed)` in the child test class. "
-            "See existing pipeline tests for reference."
-        )
-
-    @property
-    def params(self) -> frozenset:
-        raise NotImplementedError(
-            "You need to set the attribute `params` in the child test class. "
-            "`params` is checked to ensure that all its values are present in the signature of `__call__`. "
-            "You can set `params` using one of the common sets of parameters defined in `pipeline_params.py`, "
-            "e.g., `TEXT_TO_IMAGE_PARAMS` defines the common parameters used in text-to-image pipelines, "
-            "including prompts and prompt embedding overrides. "
-            "If your pipeline's set of arguments has minor changes from one of the common sets of arguments, "
-            "do not make modifications to the existing common sets of arguments. E.g., a text-to-image pipeline "
-            "with non-configurable height and width arguments should set the attribute as "
-            "`params = TEXT_TO_IMAGE_PARAMS - {'height', 'width'}`. "
-            "See existing pipeline tests for reference."
-        )
-
-    @property
-    def batch_params(self) -> frozenset:
-        raise NotImplementedError(
-            "You need to set the attribute `batch_params` in the child test class. "
-            "`batch_params` are the parameters required to be batched when passed to the pipeline's "
-            "`__call__` method. `pipeline_params.py` provides some common sets of parameters such as "
-            "`TEXT_TO_IMAGE_BATCH_PARAMS`, `IMAGE_VARIATION_BATCH_PARAMS`, etc... If your pipeline's "
-            "set of batch arguments has minor changes from one of the common sets of batch arguments, "
-            "do not make modifications to the existing common sets of batch arguments. E.g., a text-to-image "
-            "pipeline in which `negative_prompt` is not batched should set the attribute as "
-            "`batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {'negative_prompt'}`. "
-            "See existing pipeline tests for reference."
- ) - - def tearDown(self): - # clean up the VRAM after each test in case of CUDA runtime errors - super().tearDown() - gc.collect() - torch.cuda.empty_cache() - - def test_save_load_local(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) - - def test_pipeline_call_signature(self): - self.assertTrue( - hasattr(self.pipeline_class, "__call__"), f"{self.pipeline_class} should have a `__call__` method" - ) - - parameters = inspect.signature(self.pipeline_class.__call__).parameters - - optional_parameters = set() - - for k, v in parameters.items(): - if v.default != inspect._empty: - optional_parameters.add(k) - - parameters = set(parameters.keys()) - parameters.remove("self") - parameters.discard("kwargs") # kwargs can be added if arguments of pipeline call function are deprecated - - remaining_required_parameters = set() - - for param in self.params: - if param not in parameters: - remaining_required_parameters.add(param) - - self.assertTrue( - len(remaining_required_parameters) == 0, - f"Required parameters not present: {remaining_required_parameters}", - ) - - remaining_required_optional_parameters = set() - - for param in self.required_optional_params: - if param not in optional_parameters: - remaining_required_optional_parameters.add(param) - - self.assertTrue( - len(remaining_required_optional_parameters) == 0, - f"Required optional parameters not present: {remaining_required_optional_parameters}", - ) - - def test_inference_batch_consistent(self): - self._test_inference_batch_consistent() - - def _test_inference_batch_consistent( - self, batch_sizes=[2, 4, 13], additional_params_copy_to_batched_inputs=["num_inference_steps"] - ): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=diffusers.logging.FATAL) - - # batchify inputs - for batch_size in batch_sizes: - batched_inputs = {} - for name, value in inputs.items(): - if name in self.batch_params: - # prompt is string - if name == "prompt": - len_prompt = len(value) - # make unequal batch sizes - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - - # make last batch super long - batched_inputs[name][-1] = 2000 * "very long" - # or else we have images - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - else: - batched_inputs[name] = value - - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - - batched_inputs["output_type"] = None - - if self.pipeline_class.__name__ == "DanceDiffusionPipeline": - batched_inputs.pop("output_type") - - output = pipe(**batched_inputs) - - assert len(output[0]) == batch_size - - batched_inputs["output_type"] = "np" - - if self.pipeline_class.__name__ 
== "DanceDiffusionPipeline": - batched_inputs.pop("output_type") - - output = pipe(**batched_inputs)[0] - - assert output.shape[0] == batch_size - - logger.setLevel(level=diffusers.logging.WARNING) - - def test_inference_batch_single_identical(self): - self._test_inference_batch_single_identical() - - def _test_inference_batch_single_identical( - self, - test_max_difference=None, - test_mean_pixel_difference=None, - relax_max_difference=False, - expected_max_diff=1e-4, - additional_params_copy_to_batched_inputs=["num_inference_steps"], - ): - if test_max_difference is None: - # TODO(Pedro) - not sure why, but not at all reproducible at the moment it seems - # make sure that batched and non-batched is identical - test_max_difference = torch_device != "mps" - - if test_mean_pixel_difference is None: - # TODO same as above - test_mean_pixel_difference = torch_device != "mps" - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - - logger = logging.get_logger(pipe.__module__) - logger.setLevel(level=diffusers.logging.FATAL) - - # batchify inputs - batched_inputs = {} - batch_size = 3 - for name, value in inputs.items(): - if name in self.batch_params: - # prompt is string - if name == "prompt": - len_prompt = len(value) - # make unequal batch sizes - batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)] - - # make last batch super long - batched_inputs[name][-1] = 2000 * "very long" - # or else we have images - else: - batched_inputs[name] = batch_size * [value] - elif name == "batch_size": - batched_inputs[name] = batch_size - elif name == "generator": - batched_inputs[name] = [self.get_generator(i) for i in range(batch_size)] - else: - batched_inputs[name] = value - - for arg in additional_params_copy_to_batched_inputs: - batched_inputs[arg] = inputs[arg] - - if self.pipeline_class.__name__ != "DanceDiffusionPipeline": - batched_inputs["output_type"] = "np" - - output_batch = pipe(**batched_inputs) - assert output_batch[0].shape[0] == batch_size - - inputs["generator"] = self.get_generator(0) - - output = pipe(**inputs) - - logger.setLevel(level=diffusers.logging.WARNING) - if test_max_difference: - if relax_max_difference: - # Taking the median of the largest differences - # is resilient to outliers - diff = np.abs(output_batch[0][0] - output[0][0]) - diff = diff.flatten() - diff.sort() - max_diff = np.median(diff[-5:]) - else: - max_diff = np.abs(output_batch[0][0] - output[0][0]).max() - assert max_diff < expected_max_diff - - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_batch[0][0], output[0][0]) - - def test_dict_tuple_outputs_equivalent(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0] - - max_diff = np.abs(to_np(output) - to_np(output_tuple)).max() - self.assertLess(max_diff, 1e-4) - - def test_components_function(self): - init_components = self.get_dummy_components() - pipe = self.pipeline_class(**init_components) - - self.assertTrue(hasattr(pipe, "components")) - self.assertTrue(set(pipe.components.keys()) == set(init_components.keys())) - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def 
test_float16_inference(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe_fp16 = self.pipeline_class(**components) - pipe_fp16.to(torch_device, torch.float16) - pipe_fp16.set_progress_bar_config(disable=None) - - output = pipe(**self.get_dummy_inputs(torch_device))[0] - output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0] - - max_diff = np.abs(to_np(output) - to_np(output_fp16)).max() - self.assertLess(max_diff, 1e-2, "The outputs of the fp16 and fp32 pipelines are too different.") - - @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA") - def test_save_load_float16(self): - components = self.get_dummy_components() - for name, module in components.items(): - if hasattr(module, "half"): - components[name] = module.to(torch_device).half() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for name, component in pipe_loaded.components.items(): - if hasattr(component, "dtype"): - self.assertTrue( - component.dtype == torch.float16, - f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-2, "The output of the fp16 pipeline changed after saving and loading.") - - def test_save_load_optional_components(self): - if not hasattr(self.pipeline_class, "_optional_components"): - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - # set all optional components to None - for optional_component in pipe._optional_components: - setattr(pipe, optional_component, None) - - inputs = self.get_dummy_inputs(torch_device) - output = pipe(**inputs)[0] - - with tempfile.TemporaryDirectory() as tmpdir: - pipe.save_pretrained(tmpdir) - pipe_loaded = self.pipeline_class.from_pretrained(tmpdir) - pipe_loaded.to(torch_device) - pipe_loaded.set_progress_bar_config(disable=None) - - for optional_component in pipe._optional_components: - self.assertTrue( - getattr(pipe_loaded, optional_component) is None, - f"`{optional_component}` did not stay set to None after loading.", - ) - - inputs = self.get_dummy_inputs(torch_device) - output_loaded = pipe_loaded(**inputs)[0] - - max_diff = np.abs(to_np(output) - to_np(output_loaded)).max() - self.assertLess(max_diff, 1e-4) - - @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices") - def test_to_device(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - pipe.to("cpu") - model_devices = [component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == "cpu" for device in model_devices)) - - output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0] - self.assertTrue(np.isnan(output_cpu).sum() == 0) - - pipe.to("cuda") - model_devices = 
[component.device.type for component in components.values() if hasattr(component, "device")] - self.assertTrue(all(device == "cuda" for device in model_devices)) - - output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0] - self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0) - - def test_to_dtype(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.set_progress_bar_config(disable=None) - - model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes)) - - pipe.to(torch_dtype=torch.float16) - model_dtypes = [component.dtype for component in components.values() if hasattr(component, "dtype")] - self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes)) - - def test_attention_slicing_forward_pass(self): - self._test_attention_slicing_forward_pass() - - def _test_attention_slicing_forward_pass( - self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3 - ): - if not self.test_attention_slicing: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_slicing = pipe(**inputs)[0] - - pipe.enable_attention_slicing(slice_size=1) - inputs = self.get_dummy_inputs(torch_device) - output_with_slicing = pipe(**inputs)[0] - - if test_max_difference: - max_diff = np.abs(to_np(output_with_slicing) - to_np(output_without_slicing)).max() - self.assertLess(max_diff, expected_max_diff, "Attention slicing should not affect the inference results") - - if test_mean_pixel_difference: - assert_mean_pixel_difference(output_with_slicing[0], output_without_slicing[0]) - - @unittest.skipIf( - torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"), - reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher", - ) - def test_cpu_offload_forward_pass(self): - if not self.test_cpu_offload: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_sequential_cpu_offload() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max() - self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results") - - @unittest.skipIf( - torch_device != "cuda" or not is_xformers_available(), - reason="XFormers attention is only available with CUDA and `xformers` installed", - ) - def test_xformers_attention_forwardGenerator_pass(self): - self._test_xformers_attention_forwardGenerator_pass() - - def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=True, expected_max_diff=1e-4): - if not self.test_xformers_attention: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] - - pipe.enable_xformers_memory_efficient_attention() - inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] - - if 
test_max_difference: - max_diff = np.abs(output_with_offload - output_without_offload).max() - self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results") - - assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0]) - - def test_progress_bar(self): - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe.to(torch_device) - - inputs = self.get_dummy_inputs(torch_device) - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - stderr = stderr.getvalue() - # we can't calculate the number of progress steps beforehand e.g. for strength-dependent img2img, - # so we just match "5" in "#####| 1/5 [00:01<00:00]" - max_steps = re.search("/(.*?) ", stderr).group(1) - self.assertTrue(max_steps is not None and len(max_steps) > 0) - self.assertTrue( - f"{max_steps}/{max_steps}" in stderr, "Progress bar should be enabled and stopped at the max step" - ) - - pipe.set_progress_bar_config(disable=True) - with io.StringIO() as stderr, contextlib.redirect_stderr(stderr): - _ = pipe(**inputs) - self.assertTrue(stderr.getvalue() == "", "Progress bar should be disabled") - - def test_num_images_per_prompt(self): - sig = inspect.signature(self.pipeline_class.__call__) - - if "num_images_per_prompt" not in sig.parameters: - return - - components = self.get_dummy_components() - pipe = self.pipeline_class(**components) - pipe = pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - batch_sizes = [1, 2] - num_images_per_prompts = [1, 2] - - for batch_size in batch_sizes: - for num_images_per_prompt in num_images_per_prompts: - inputs = self.get_dummy_inputs(torch_device) - - for key in inputs.keys(): - if key in self.batch_params: - inputs[key] = batch_size * [inputs[key]] - - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images - - assert images.shape[0] == batch_size * num_images_per_prompt - - -# Some models (e.g. unCLIP) are extremely likely to significantly deviate depending on which hardware is used. -# This helper function is used to check that the image doesn't deviate on average more than 10 pixels from a -# reference image. -def assert_mean_pixel_difference(image, expected_image): - image = np.asarray(DiffusionPipeline.numpy_to_pil(image)[0], dtype=np.float32) - expected_image = np.asarray(DiffusionPipeline.numpy_to_pil(expected_image)[0], dtype=np.float32) - avg_diff = np.abs(image - expected_image).mean() - assert avg_diff < 10, f"Error image deviates {avg_diff} pixels on average" diff --git a/diffusers/tests/test_pipelines_flax.py b/diffusers/tests/test_pipelines_flax.py deleted file mode 100644 index a461930f3a83ecfc8134d50ce5978d329d79f5c9..0000000000000000000000000000000000000000 --- a/diffusers/tests/test_pipelines_flax.py +++ /dev/null @@ -1,226 +0,0 @@ -# coding=utf-8 -# Copyright 2023 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
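The Flax pipeline tests below all rely on the same multi-device idiom: `jax.pmap` compiles the pipeline call once per device, `flax.jax_utils.replicate` copies the parameters onto every device, and `shard` splits the global batch across devices, while `static_broadcasted_argnums=(3,)` marks `num_inference_steps` as a compile-time constant rather than a sharded input. Here is a minimal, self-contained sketch of that idiom, with a toy `apply` function standing in for `pipeline.__call__` (the function and its names are illustrative, not part of the test suite):

import jax
import jax.numpy as jnp
from flax.jax_utils import replicate
from flax.training.common_utils import shard


def apply(params, batch):
    # toy stand-in for pipeline.__call__: scale the batch by a replicated parameter
    return params["scale"] * batch


params = {"scale": jnp.float32(2.0)}
inputs = jnp.ones((jax.device_count() * 4, 3))  # global batch across all devices

p_apply = jax.pmap(apply)   # compile once, run on every device
params = replicate(params)  # add a leading device axis to every leaf
inputs = shard(inputs)      # reshape to (num_devices, 4, 3)

out = p_apply(params, inputs)  # executes in parallel, one shard per device
assert out.shape == (jax.device_count(), 4, 3)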
- -import os -import tempfile -import unittest - -import numpy as np - -from diffusers.utils import is_flax_available -from diffusers.utils.testing_utils import require_flax, slow - - -if is_flax_available(): - import jax - import jax.numpy as jnp - from flax.jax_utils import replicate - from flax.training.common_utils import shard - from jax import pmap - - from diffusers import FlaxDDIMScheduler, FlaxDiffusionPipeline, FlaxStableDiffusionPipeline - - -@require_flax -class DownloadTests(unittest.TestCase): - def test_download_only_pytorch(self): - with tempfile.TemporaryDirectory() as tmpdirname: - # pipeline has Flax weights - _ = FlaxDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None, cache_dir=tmpdirname - ) - - all_root_files = [t[-1] for t in os.walk(os.path.join(tmpdirname, os.listdir(tmpdirname)[0], "snapshots"))] - files = [item for sublist in all_root_files for item in sublist] - - # None of the downloaded files should be a PyTorch file even if we have some here: - # https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe/blob/main/unet/diffusion_pytorch_model.bin - assert not any(f.endswith(".bin") for f in files) - - -@slow -@require_flax -class FlaxPipelineTests(unittest.TestCase): - def test_dummy_all_tpus(self): - pipeline, params = FlaxStableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None - ) - - prompt = ( - "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of" - " field, close up, split lighting, cinematic" - ) - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 4 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = pipeline.prepare_inputs(prompt) - - p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,)) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, num_samples) - prompt_ids = shard(prompt_ids) - - images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images - - assert images.shape == (num_samples, 1, 64, 64, 3) - if jax.device_count() == 8: - assert np.abs(np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 3.1111548) < 1e-3 - assert np.abs(np.abs(images, dtype=np.float32).sum() - 199746.95) < 5e-1 - - images_pil = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) - - assert len(images_pil) == num_samples - - def test_stable_diffusion_v1_4(self): - pipeline, params = FlaxStableDiffusionPipeline.from_pretrained( - "CompVis/stable-diffusion-v1-4", revision="flax", safety_checker=None - ) - - prompt = ( - "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of" - " field, close up, split lighting, cinematic" - ) - - prng_seed = jax.random.PRNGKey(0) - num_inference_steps = 50 - - num_samples = jax.device_count() - prompt = num_samples * [prompt] - prompt_ids = pipeline.prepare_inputs(prompt) - - p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,)) - - # shard inputs and rng - params = replicate(params) - prng_seed = jax.random.split(prng_seed, num_samples) - prompt_ids = shard(prompt_ids) - - images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images - - assert images.shape == (num_samples, 1, 512, 512, 3) - if jax.device_count() == 8: - assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.05652401)) < 1e-3 - assert 
np.abs((np.abs(images, dtype=np.float32).sum() - 2383808.2)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16(self):
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="bf16", dtype=jnp.bfloat16, safety_checker=None
-        )
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,))
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16_with_safety(self):
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4", revision="bf16", dtype=jnp.bfloat16
-        )
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.06652832)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2384849.8)) < 5e-1
-
-    def test_stable_diffusion_v1_4_bfloat_16_ddim(self):
-        scheduler = FlaxDDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="scaled_linear",
-            set_alpha_to_one=False,
-            steps_offset=1,
-        )
-
-        pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
-            "CompVis/stable-diffusion-v1-4",
-            revision="bf16",
-            dtype=jnp.bfloat16,
-            scheduler=scheduler,
-            safety_checker=None,
-        )
-        scheduler_state = scheduler.create_state()
-
-        params["scheduler"] = scheduler_state
-
-        prompt = (
-            "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of"
-            " field, close up, split lighting, cinematic"
-        )
-
-        prng_seed = jax.random.PRNGKey(0)
-        num_inference_steps = 50
-
-        num_samples = jax.device_count()
-        prompt = num_samples * [prompt]
-        prompt_ids = pipeline.prepare_inputs(prompt)
-
-        p_sample = pmap(pipeline.__call__, static_broadcasted_argnums=(3,))
-
-        # shard inputs and rng
-        params = replicate(params)
-        prng_seed = jax.random.split(prng_seed, num_samples)
-        prompt_ids = shard(prompt_ids)
-
-        images = p_sample(prompt_ids, params, prng_seed, num_inference_steps).images
-
-        assert images.shape == (num_samples, 1, 512, 512, 3)
-        if jax.device_count() == 8:
-            assert np.abs((np.abs(images[0, 0, :2, :2, -2:], dtype=np.float32).sum() - 0.045043945)) < 1e-3
-            assert np.abs((np.abs(images, dtype=np.float32).sum() - 2347693.5)) < 5e-1
diff --git a/diffusers/tests/test_pipelines_onnx_common.py b/diffusers/tests/test_pipelines_onnx_common.py
deleted file mode 100644
index 575ecd0075318e8ec62ab7cd76bff5b0b1ca82ad..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_pipelines_onnx_common.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from diffusers.utils.testing_utils import require_onnxruntime
-
-
-@require_onnxruntime
-class OnnxPipelineTesterMixin:
-    """
-    This mixin is designed to be used with unittest.TestCase classes.
-    It provides a set of common tests for each ONNXRuntime pipeline, e.g. saving and loading the pipeline,
-    equivalence of dict and tuple outputs, etc.
-    """
-
-    pass
diff --git a/diffusers/tests/test_training.py b/diffusers/tests/test_training.py
deleted file mode 100644
index d540f997622148082874272ff7cebffea4d4450d..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_training.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers import DDIMScheduler, DDPMScheduler, UNet2DModel
-from diffusers.training_utils import set_seed
-from diffusers.utils.testing_utils import slow
-
-
-torch.backends.cuda.matmul.allow_tf32 = False
-
-
-class TrainingTests(unittest.TestCase):
-    def get_model_optimizer(self, resolution=32):
-        set_seed(0)
-        model = UNet2DModel(sample_size=resolution, in_channels=3, out_channels=3)
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
-        return model, optimizer
-
-    @slow
-    def test_training_step_equality(self):
-        device = "cpu"  # ensure full determinism without setting the CUBLAS_WORKSPACE_CONFIG env variable
-        ddpm_scheduler = DDPMScheduler(
-            num_train_timesteps=1000,
-            beta_start=0.0001,
-            beta_end=0.02,
-            beta_schedule="linear",
-            clip_sample=True,
-        )
-        ddim_scheduler = DDIMScheduler(
-            num_train_timesteps=1000,
-            beta_start=0.0001,
-            beta_end=0.02,
-            beta_schedule="linear",
-            clip_sample=True,
-        )
-
-        assert ddpm_scheduler.config.num_train_timesteps == ddim_scheduler.config.num_train_timesteps
-
-        # shared batches for DDPM and DDIM
-        set_seed(0)
-        clean_images = [torch.randn((4, 3, 32, 32)).clip(-1, 1).to(device) for _ in range(4)]
-        noise = [torch.randn((4, 3, 32, 32)).to(device) for _ in range(4)]
-        timesteps = [torch.randint(0, 1000, (4,)).long().to(device) for _ in range(4)]
-
-        # train with a DDPM scheduler
-        model, optimizer = self.get_model_optimizer(resolution=32)
-        model.train().to(device)
-        for i in range(4):
-            optimizer.zero_grad()
-            ddpm_noisy_images = ddpm_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
-            ddpm_noise_pred = model(ddpm_noisy_images, timesteps[i]).sample
-            loss = torch.nn.functional.mse_loss(ddpm_noise_pred, noise[i])
-            loss.backward()
-            optimizer.step()
-        del model, optimizer
-
-        # recreate the model and optimizer, and retry with DDIM
-        model, optimizer = self.get_model_optimizer(resolution=32)
-        model.train().to(device)
-        for i in range(4):
-            optimizer.zero_grad()
-            ddim_noisy_images = ddim_scheduler.add_noise(clean_images[i], noise[i], timesteps[i])
-            ddim_noise_pred = model(ddim_noisy_images, timesteps[i]).sample
-            loss = torch.nn.functional.mse_loss(ddim_noise_pred, noise[i])
-            loss.backward()
-            optimizer.step()
-        del model, optimizer
-
-        self.assertTrue(torch.allclose(ddpm_noisy_images, ddim_noisy_images, atol=1e-5))
-        self.assertTrue(torch.allclose(ddpm_noise_pred, ddim_noise_pred, atol=1e-5))
diff --git a/diffusers/tests/test_unet_2d_blocks.py b/diffusers/tests/test_unet_2d_blocks.py
deleted file mode 100644
index e560240422ace376e8ccca989da9144ee8e8d98d..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_unet_2d_blocks.py
+++ /dev/null
@@ -1,337 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-from diffusers.models.unet_2d_blocks import *  # noqa F403
-from diffusers.utils import torch_device
-
-from .test_unet_blocks_common import UNetBlockTesterMixin
-
-
-class DownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = DownBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [-0.0232, -0.9869, 0.8054, -0.0637, -0.1688, -1.4264, 0.4470, -1.3394, 0.0904]
-        super().test_output(expected_slice)
-
-
-class ResnetDownsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = ResnetDownsampleBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [0.0710, 0.2410, -0.7320, -1.0757, -1.1343, 0.3540, -0.0133, -0.2576, 0.0948]
-        super().test_output(expected_slice)
-
-
-class AttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    def test_output(self):
-        expected_slice = [0.0636, 0.8964, -0.6234, -1.0131, 0.0844, 0.4935, 0.3437, 0.0911, -0.2957]
-        super().test_output(expected_slice)
-
-
-class CrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = CrossAttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.2440, -0.6953, -0.2140, -0.3874, 0.1966, 1.2077, 0.0441, -0.7718, 0.2800]
-        super().test_output(expected_slice)
-
-
-class SimpleCrossAttnDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SimpleCrossAttnDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
-    def test_output(self):
-        expected_slice = [0.7921, -0.0992, -0.1962, -0.7695, -0.4242, 0.7804, 0.4737, 0.2765, 0.3338]
-        super().test_output(expected_slice)
-
-
-class SkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SkipDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_skip_sample=True)
-
-    def test_output(self):
-        expected_slice = [-0.0845, -0.2087, -0.2465, 0.0971, 0.1900, -0.0484, 0.2664, 0.4179, 0.5069]
-        super().test_output(expected_slice)
-
-
-class AttnSkipDownBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnSkipDownBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_skip_sample=True)
-
-    def test_output(self):
-        expected_slice = [0.5539, 0.1609, 0.4924, 0.0537, -0.1995, 0.4050, 0.0979, -0.2721, -0.0642]
-        super().test_output(expected_slice)
-
-
-class DownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = DownEncoderBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [1.1102, 0.5302, 0.4872, -0.0023, -0.8042, 0.0483, -0.3489, -0.5632, 0.7626]
-        super().test_output(expected_slice)
-
-
-class AttnDownEncoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnDownEncoderBlock2D  # noqa F405
-    block_type = "down"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.8966, -0.1486, 0.8568, 0.8141, -0.9046, -0.1342, -0.0972, -0.7417, 0.1538]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2D  # noqa F405
-    block_type = "mid"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "temb_channels": 128,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [-0.1062, 1.7248, 0.3494, 1.4569, -0.0910, -1.2421, -0.9984, 0.6736, 1.0028]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2DCrossAttn  # noqa F405
-    block_type = "mid"
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.1879, 2.2653, 0.5987, 1.1568, -0.8454, -1.6109, -0.8919, 0.8306, 1.6758]
-        super().test_output(expected_slice)
-
-
-class UNetMidBlock2DSimpleCrossAttnTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UNetMidBlock2DSimpleCrossAttn  # noqa F405
-    block_type = "mid"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.7143, 1.9974, 0.5448, 1.3977, 0.1282, -1.1237, -1.4238, 0.5530, 0.8880]
-        super().test_output(expected_slice)
-
-
-class UpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [-0.2041, -0.4165, -0.3022, 0.0041, -0.6628, -0.7053, 0.1928, -0.0325, 0.0523]
-        super().test_output(expected_slice)
-
-
-class ResnetUpsampleBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = ResnetUpsampleBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [0.2287, 0.3549, -0.1346, 0.4797, -0.1715, -0.9649, 0.7305, -0.5864, -0.6244]
-        super().test_output(expected_slice)
-
-
-class CrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = CrossAttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [-0.2796, -0.4364, -0.1067, -0.2693, 0.1894, 0.3869, -0.3470, 0.4584, 0.5091]
-        super().test_output(expected_slice)
-
-
-class SimpleCrossAttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SimpleCrossAttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True, include_encoder_hidden_states=True)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict, inputs_dict = super().prepare_init_args_and_inputs_for_common()
-        init_dict["cross_attention_dim"] = 32
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.2645, 0.1480, 0.0909, 0.8044, -0.9758, -0.9083, 0.0994, -1.1453, -0.7402]
-        super().test_output(expected_slice)
-
-
-class AttnUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    @unittest.skipIf(torch_device == "mps", "MPS result is not consistent")
-    def test_output(self):
-        expected_slice = [0.0979, 0.1326, 0.0021, 0.0659, 0.2249, 0.0059, 0.1132, 0.5952, 0.1033]
-        super().test_output(expected_slice)
-
-
-class SkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = SkipUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [-0.0893, -0.1234, -0.1506, -0.0332, 0.0123, -0.0211, 0.0566, 0.0143, 0.0362]
-        super().test_output(expected_slice)
-
-
-class AttnSkipUpBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnSkipUpBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_res_hidden_states_tuple=True)
-
-    def test_output(self):
-        expected_slice = [0.0361, 0.0617, 0.2787, -0.0350, 0.0342, 0.3421, -0.0843, 0.0913, 0.3015]
-        super().test_output(expected_slice)
-
-
-class UpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = UpDecoderBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {"in_channels": 32, "out_channels": 32}
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.4404, 0.1998, -0.9886, -0.3320, -0.3128, -0.7034, -0.6955, -0.2338, -0.3137]
-        super().test_output(expected_slice)
-
-
-class AttnUpDecoderBlock2DTests(UNetBlockTesterMixin, unittest.TestCase):
-    block_class = AttnUpDecoderBlock2D  # noqa F405
-    block_type = "up"
-
-    @property
-    def dummy_input(self):
-        return super().get_dummy_input(include_temb=False)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {"in_channels": 32, "out_channels": 32}
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self):
-        expected_slice = [0.6738, 0.4491, 0.1055, 1.0710, 0.7316, 0.3339, 0.3352, 0.1023, 0.3568]
-        super().test_output(expected_slice)
diff --git a/diffusers/tests/test_unet_blocks_common.py b/diffusers/tests/test_unet_blocks_common.py
deleted file mode 100644
index 17b7f65d6da31c43f062eaa6bed7284ce85e471f..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_unet_blocks_common.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-from typing import Tuple
-
-import torch
-
-from diffusers.utils import floats_tensor, randn_tensor, torch_all_close, torch_device
-from diffusers.utils.testing_utils import require_torch
-
-
-@require_torch
-class UNetBlockTesterMixin:
-    @property
-    def dummy_input(self):
-        return self.get_dummy_input()
-
-    @property
-    def output_shape(self):
-        if self.block_type == "down":
-            return (4, 32, 16, 16)
-        elif self.block_type == "mid":
-            return (4, 32, 32, 32)
-        elif self.block_type == "up":
-            return (4, 32, 64, 64)
-
-        raise ValueError(f"'{self.block_type}' is not a supported block_type. Set it to 'up', 'mid', or 'down'.")
-
-    def get_dummy_input(
-        self,
-        include_temb=True,
-        include_res_hidden_states_tuple=False,
-        include_encoder_hidden_states=False,
-        include_skip_sample=False,
-    ):
-        batch_size = 4
-        num_channels = 32
-        sizes = (32, 32)
-
-        generator = torch.manual_seed(0)
-        device = torch.device(torch_device)
-        shape = (batch_size, num_channels) + sizes
-        hidden_states = randn_tensor(shape, generator=generator, device=device)
-        dummy_input = {"hidden_states": hidden_states}
-
-        if include_temb:
-            temb_channels = 128
-            dummy_input["temb"] = randn_tensor((batch_size, temb_channels), generator=generator, device=device)
-
-        if include_res_hidden_states_tuple:
-            generator_1 = torch.manual_seed(1)
-            dummy_input["res_hidden_states_tuple"] = (randn_tensor(shape, generator=generator_1, device=device),)
-
-        if include_encoder_hidden_states:
-            dummy_input["encoder_hidden_states"] = floats_tensor((batch_size, 32, 32)).to(torch_device)
-
-        if include_skip_sample:
-            dummy_input["skip_sample"] = randn_tensor(((batch_size, 3) + sizes), generator=generator, device=device)
-
-        return dummy_input
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 32,
-            "out_channels": 32,
-            "temb_channels": 128,
-        }
-        if self.block_type == "up":
-            init_dict["prev_output_channel"] = 32
-
-        if self.block_type == "mid":
-            init_dict.pop("out_channels")
-
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_output(self, expected_slice):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        unet_block = self.block_class(**init_dict)
-        unet_block.to(torch_device)
-        unet_block.eval()
-
-        with torch.no_grad():
-            output = unet_block(**inputs_dict)
-
-        if isinstance(output, Tuple):
-            output = output[0]
-
-        self.assertEqual(output.shape, self.output_shape)
-
-        output_slice = output[0, -1, -3:, -3:]
-        expected_slice = torch.tensor(expected_slice).to(torch_device)
-        assert torch_all_close(output_slice.flatten(), expected_slice, atol=5e-3)
-
-    @unittest.skipIf(torch_device == "mps", "Training is not supported in mps")
-    def test_training(self):
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.block_class(**init_dict)
-        model.to(torch_device)
-        model.train()
-        output = model(**inputs_dict)
-
-        if isinstance(output, Tuple):
-            output = output[0]
-
-        device = torch.device(torch_device)
-        noise = randn_tensor(output.shape, device=device)
-        loss = torch.nn.functional.mse_loss(output, noise)
-        loss.backward()
diff --git a/diffusers/tests/test_utils.py b/diffusers/tests/test_utils.py
deleted file mode 100755
index 4fc4e1a06638ae14848424db24f212ae24afbf34..0000000000000000000000000000000000000000
--- a/diffusers/tests/test_utils.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from diffusers import __version__
-from diffusers.utils import deprecate
-
-
-class DeprecateTester(unittest.TestCase):
-    higher_version = ".".join([str(int(__version__.split(".")[0]) + 1)] + __version__.split(".")[1:])
-    lower_version = "0.0.1"
-
-    def test_deprecate_function_arg(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertWarns(FutureWarning) as warning:
-            output = deprecate("deprecated_arg", self.higher_version, "message", take_from=kwargs)
-
-        assert output == 4
-        assert (
-            str(warning.warning)
-            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
-            " message"
-        )
-
-    def test_deprecate_function_arg_tuple(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertWarns(FutureWarning) as warning:
-            output = deprecate(("deprecated_arg", self.higher_version, "message"), take_from=kwargs)
-
-        assert output == 4
-        assert (
-            str(warning.warning)
-            == f"The `deprecated_arg` argument is deprecated and will be removed in version {self.higher_version}."
-            " message"
-        )
-
-    def test_deprecate_function_args(self):
-        kwargs = {"deprecated_arg_1": 4, "deprecated_arg_2": 8}
-        with self.assertWarns(FutureWarning) as warning:
-            output_1, output_2 = deprecate(
-                ("deprecated_arg_1", self.higher_version, "Hey"),
-                ("deprecated_arg_2", self.higher_version, "Hey"),
-                take_from=kwargs,
-            )
-        assert output_1 == 4
-        assert output_2 == 8
-        assert (
-            str(warning.warnings[0].message)
-            == "The `deprecated_arg_1` argument is deprecated and will be removed in version"
-            f" {self.higher_version}. Hey"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == "The `deprecated_arg_2` argument is deprecated and will be removed in version"
-            f" {self.higher_version}. Hey"
-        )
-
-    def test_deprecate_function_incorrect_arg(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertRaises(TypeError) as error:
-            deprecate(("wrong_arg", self.higher_version, "message"), take_from=kwargs)
-
-        assert "test_deprecate_function_incorrect_arg in" in str(error.exception)
-        assert "line" in str(error.exception)
-        assert "got an unexpected keyword argument `deprecated_arg`" in str(error.exception)
-
-    def test_deprecate_arg_no_kwarg(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "message"))
-
-        assert (
-            str(warning.warning)
-            == f"`deprecated_arg` is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_args_no_kwarg(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(
-                ("deprecated_arg_1", self.higher_version, "Hey"),
-                ("deprecated_arg_2", self.higher_version, "Hey"),
-            )
-        assert (
-            str(warning.warnings[0].message)
-            == f"`deprecated_arg_1` is deprecated and will be removed in version {self.higher_version}. Hey"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == f"`deprecated_arg_2` is deprecated and will be removed in version {self.higher_version}. Hey"
-        )
-
-    def test_deprecate_class_obj(self):
-        class Args:
-            arg = 5
-
-        with self.assertWarns(FutureWarning) as warning:
-            arg = deprecate(("arg", self.higher_version, "message"), take_from=Args())
-
-        assert arg == 5
-        assert (
-            str(warning.warning)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_class_objs(self):
-        class Args:
-            arg = 5
-            foo = 7
-
-        with self.assertWarns(FutureWarning) as warning:
-            arg_1, arg_2 = deprecate(
-                ("arg", self.higher_version, "message"),
-                ("foo", self.higher_version, "message"),
-                ("does not exist", self.higher_version, "message"),
-                take_from=Args(),
-            )
-
-        assert arg_1 == 5
-        assert arg_2 == 7
-        assert (
-            str(warning.warning)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-        assert (
-            str(warning.warnings[0].message)
-            == f"The `arg` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-        assert (
-            str(warning.warnings[1].message)
-            == f"The `foo` attribute is deprecated and will be removed in version {self.higher_version}. message"
-        )
-
-    def test_deprecate_incorrect_version(self):
-        kwargs = {"deprecated_arg": 4}
-
-        with self.assertRaises(ValueError) as error:
-            deprecate(("wrong_arg", self.lower_version, "message"), take_from=kwargs)
-
-        assert (
-            str(error.exception)
-            == "The deprecation tuple ('wrong_arg', '0.0.1', 'message') should be removed since diffusers' version"
-            f" {__version__} is >= {self.lower_version}"
-        )
-
-    def test_deprecate_incorrect_no_standard_warn(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
-
-        assert str(warning.warning) == "This message is better!!!"
-
-    def test_deprecate_stacklevel(self):
-        with self.assertWarns(FutureWarning) as warning:
-            deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
-        assert str(warning.warning) == "This message is better!!!"
-        assert "diffusers/tests/test_utils.py" in warning.filename