Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes.
- .gitattributes +9 -0
- .gitignore +1 -0
- LICENSE +201 -0
- README.md +133 -12
- __pycache__/utils.cpython-312.pyc +0 -0
- animated_images/aircraft.jpg +3 -0
- animated_images/car.jpg +3 -0
- animated_images/fireworks.jpg +0 -0
- animated_images/flowers.jpg +0 -0
- animated_images/forest.jpg +3 -0
- animated_images/shark_unwater.jpg +0 -0
- configs/sample.yaml +38 -0
- datasets/__pycache__/video_transforms.cpython-312.pyc +0 -0
- datasets/video_transforms.py +748 -0
- demo.py +311 -0
- environment.yml +21 -0
- example/aircrafts_flying/0.jpg +0 -0
- example/aircrafts_flying/aircrafts_flying.mp4 +0 -0
- example/car_moving/0.jpg +0 -0
- example/car_moving/car_moving.mp4 +0 -0
- example/fireworks/0.jpg +0 -0
- example/fireworks/fireworks.mp4 +0 -0
- example/flowers_swaying/0.jpg +0 -0
- example/flowers_swaying/flowers_swaying.mp4 +0 -0
- example/girl_walking_on_the_beach/0.jpg +0 -0
- example/girl_walking_on_the_beach/girl_walking_on_the_beach.mp4 +0 -0
- example/house_rotating/0.jpg +0 -0
- example/house_rotating/house_rotating.mp4 +0 -0
- example/people_runing/0.jpg +0 -0
- example/people_runing/people_runing.mp4 +0 -0
- example/shark_swimming/0.jpg +0 -0
- example/shark_swimming/shark_swimming.mp4 +0 -0
- example/windmill_turning/0.jpg +0 -0
- example/windmill_turning/windmill_turning.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/5e69f32e801f7ae77024/temp.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/98ce26b896864325a1dd/temp.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/b12875c4b9b633b752c4/.nfs6a1237621cfe7a8800009149 +0 -0
- gradio_cached_examples/39/Generated Animation/b12875c4b9b633b752c4/temp.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/b54545fbdd15c944208e/temp.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/cf8ea2ef6e0b7eeb7fe6/.nfs88c2a0e49709591000009148 +0 -0
- gradio_cached_examples/39/Generated Animation/cf8ea2ef6e0b7eeb7fe6/temp.mp4 +0 -0
- gradio_cached_examples/39/Generated Animation/de8039b347e55995b4cb/temp.mp4 +0 -0
- gradio_cached_examples/39/indices.csv +6 -0
- gradio_cached_examples/39/log.csv +7 -0
- models/__init__.py +33 -0
- models/__pycache__/__init__.cpython-312.pyc +0 -0
- models/__pycache__/attention.cpython-312.pyc +0 -0
- models/__pycache__/resnet.cpython-312.pyc +0 -0
- models/__pycache__/rotary_embedding_torch_mx.cpython-312.pyc +0 -0
- models/__pycache__/temporal_attention.cpython-312.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+animated_images/aircraft.jpg filter=lfs diff=lfs merge=lfs -text
+animated_images/car.jpg filter=lfs diff=lfs merge=lfs -text
+animated_images/forest.jpg filter=lfs diff=lfs merge=lfs -text
+visuals/animations/dragon_glowing_eyes/dragon_glowing_eyes.gif filter=lfs diff=lfs merge=lfs -text
+visuals/animations/girl_dancing_under_the_stars/girl_dancing_under_the_stars.gif filter=lfs diff=lfs merge=lfs -text
+visuals/animations/people_walking/people_walking.gif filter=lfs diff=lfs merge=lfs -text
+visuals/animations/sea_swell/sea_swell.gif filter=lfs diff=lfs merge=lfs -text
+visuals/video_editing/edit/editing_a_corgi_walking_in_the_park_at_sunrise_oil_painting_style.gif filter=lfs diff=lfs merge=lfs -text
+visuals/video_editing/origin/a_corgi_walking_in_the_park_at_sunrise_oil_painting_style.gif filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
.vscode
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

[Standard Apache License 2.0 terms and conditions, Sections 1-9, and the appendix
 instructions for applying the license, reproduced verbatim from the official text.]

Copyright [XIN MA] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.md
CHANGED
@@ -1,12 +1,133 @@
(removed: the original 12-line YAML front matter; only "---" and "title: Cinemo" are legible in this view)

(added: the new Space front matter and project README)

---
title: Cinemo
app_file: demo.py
sdk: gradio
sdk_version: 4.37.2
---
## Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models<br><sub>Official PyTorch Implementation</sub>

[![Arxiv](https://img.shields.io/badge/Arxiv-b31b1b.svg)](https://arxiv.org/abs/2407.15642)
[![Project Page](https://img.shields.io/badge/Project-Website-blue)](https://maxin-cn.github.io/cinemo_project/)

This repo contains pre-trained weights and sampling code for our paper on image animation with motion diffusion models (Cinemo). You can find more visualizations on our [project page](https://maxin-cn.github.io/cinemo_project/).

In this project, we propose a novel method called Cinemo, which performs motion-controllable image animation with strong consistency and smoothness. To improve motion smoothness, Cinemo learns the distribution of motion residuals rather than directly generating subsequent frames. In addition, a structural-similarity-index-based method is proposed to control the motion intensity, and a noise refinement technique based on the discrete cosine transform ensures temporal consistency. These three components let Cinemo generate highly consistent, smooth, and motion-controllable animation results. Compared to previous methods, Cinemo offers simpler and more precise user control and better generative performance.
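To make the noise-refinement idea concrete: the discrete cosine transform separates a latent into low-frequency layout and high-frequency detail, so the initial noise can inherit the input image's low frequencies while keeping fresh noise in the high frequencies. The sketch below is a minimal illustration of that mixing using `scipy.fft`; it is not the repository's `dct_low_pass_filter` / `exchanged_mixed_dct_freq` implementation (those are imported in `demo.py`), and the cutoff fraction and `(C, H, W)` layout are illustrative assumptions.

```python
import numpy as np
from scipy.fft import dctn, idctn

def mix_dct_low_freq(noised_image_latent: np.ndarray, noise: np.ndarray, keep: float = 0.25) -> np.ndarray:
    """Sketch of DCT-based noise refinement: keep the low-frequency DCT
    coefficients of the noised input-image latent and take the high-frequency
    coefficients from fresh Gaussian noise. Inputs are (C, H, W) arrays."""
    c, h, w = noised_image_latent.shape
    low_pass = np.zeros((h, w), dtype=bool)
    low_pass[: int(h * keep), : int(w * keep)] = True  # top-left DCT block = low frequencies

    mixed = np.empty_like(noised_image_latent)
    for i in range(c):
        image_dct = dctn(noised_image_latent[i], norm="ortho")
        noise_dct = dctn(noise[i], norm="ortho")
        mixed[i] = idctn(np.where(low_pass, image_dct, noise_dct), norm="ortho")
    return mixed
```

In the pipeline, such mixing would be applied to the initial sampling noise derived from the input image rather than at every denoising step.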
<div align="center">
<img src="visuals/pipeline.svg">
</div>

## News

- (🔥 New) Jul. 23, 2024. 💥 Our paper is released on [arXiv](https://arxiv.org/abs/2407.15642).

- (🔥 New) Jun. 2, 2024. 💥 The inference code is released. The checkpoint can be found [here](https://huggingface.co/maxin-cn/Cinemo/tree/main).


## Setup

First, download and set up the repo:

```bash
git clone https://github.com/maxin-cn/Cinemo
cd Cinemo
```

We provide an [`environment.yml`](environment.yml) file that can be used to create a Conda environment. If you only want to run pre-trained models locally on CPU, you can remove the `cudatoolkit` and `pytorch-cuda` requirements from the file.

```bash
conda env create -f environment.yml
conda activate cinemo
```


## Animation

You can sample from our **pre-trained Cinemo models** with [`animation.py`](pipelines/animation.py). Weights for our pre-trained Cinemo model can be found [here](https://huggingface.co/maxin-cn/Cinemo/tree/main). The script has various arguments for adjusting the number of sampling steps, changing the classifier-free guidance scale, and so on:

```bash
bash pipelines/animation.sh
```
+
|
55 |
+
All related checkpoints will download automatically and then you will get the following results,
|
56 |
+
|
57 |
+
<table style="width:100%; text-align:center;">
|
58 |
+
<tr>
|
59 |
+
<td align="center">Input image</td>
|
60 |
+
<td align="center">Output video</td>
|
61 |
+
<td align="center">Input image</td>
|
62 |
+
<td align="center">Output video</td>
|
63 |
+
</tr>
|
64 |
+
<tr>
|
65 |
+
<td align="center"><img src="visuals/animations/people_walking/0.jpg" width="100%"></td>
|
66 |
+
<td align="center"><img src="visuals/animations/people_walking/people_walking.gif" width="100%"></td>
|
67 |
+
<td align="center"><img src="visuals/animations/sea_swell/0.jpg" width="100%"></td>
|
68 |
+
<td align="center"><img src="visuals/animations/sea_swell/sea_swell.gif" width="100%"></td>
|
69 |
+
</tr>
|
70 |
+
<tr>
|
71 |
+
<td align="center" colspan="2">"People Walking"</td>
|
72 |
+
<td align="center" colspan="2">"Sea Swell"</td>
|
73 |
+
</tr>
|
74 |
+
<tr>
|
75 |
+
<td align="center"><img src="visuals/animations/girl_dancing_under_the_stars/0.jpg" width="100%"></td>
|
76 |
+
<td align="center"><img src="visuals/animations/girl_dancing_under_the_stars/girl_dancing_under_the_stars.gif" width="100%"></td>
|
77 |
+
<td align="center"><img src="visuals/animations/dragon_glowing_eyes/0.jpg" width="100%"></td>
|
78 |
+
<td align="center"><img src="visuals/animations/dragon_glowing_eyes/dragon_glowing_eyes.gif" width="100%"></td>
|
79 |
+
</tr>
|
80 |
+
<tr>
|
81 |
+
<td align="center" colspan="2">"Girl Dancing under the Stars"</td>
|
82 |
+
<td align="center" colspan="2">"Dragon Glowing Eyes"</td>
|
83 |
+
</tr>
|
84 |
+
|
85 |
+
</table>
|
86 |
+
|
87 |
+
|
88 |
+
## Other Applications
|
89 |
+
|
90 |
+
You can also utilize Cinemo for other applications, such as motion transfer and video editing:
|
91 |
+
|
92 |
+
```bash
|
93 |
+
bash pipelines/video_editing.sh
|
94 |
+
```
|
95 |
+
|
96 |
+
All related checkpoints will download automatically and you will get the following results,
|
97 |
+
|
98 |
+
<table style="width:100%; text-align:center;">
|
99 |
+
<tr>
|
100 |
+
<td align="center">Input video</td>
|
101 |
+
<td align="center">First frame</td>
|
102 |
+
<td align="center">Edited first frame</td>
|
103 |
+
<td align="center">Output video</td>
|
104 |
+
</tr>
|
105 |
+
<tr>
|
106 |
+
<td align="center"><img src="visuals/video_editing/origin/a_corgi_walking_in_the_park_at_sunrise_oil_painting_style.gif" width="100%"></td>
|
107 |
+
<td align="center"><img src="visuals/video_editing/origin/0.jpg" width="100%"></td>
|
108 |
+
<td align="center"><img src="visuals/video_editing/edit/0.jpg" width="100%"></td>
|
109 |
+
<td align="center"><img src="visuals/video_editing/edit/editing_a_corgi_walking_in_the_park_at_sunrise_oil_painting_style.gif" width="100%"></td>
|
110 |
+
</tr>
|
111 |
+
|
112 |
+
</table>
|
113 |
+
|
114 |
+
|
115 |
+
|
116 |
+
## Citation
|
117 |
+
If you find this work useful for your research, please consider citing it.
|
118 |
+
```bibtex
|
119 |
+
@article{ma2024cinemo,
|
120 |
+
title={Cinemo: Latent Diffusion Transformer for Video Generation},
|
121 |
+
author={Ma, Xin and Wang, Yaohui and Jia, Gengyun and Chen, Xinyuan and Li, Yuan-Fang and Chen, Cunjian and Qiao, Yu},
|
122 |
+
journal={arXiv preprint arXiv:2407.15642},
|
123 |
+
year={2024}
|
124 |
+
}
|
125 |
+
```
|
126 |
+
|
127 |
+
|
128 |
+
## Acknowledgments
|
129 |
+
Cinemo has been greatly inspired by the following amazing works and teams: [LaVie](https://github.com/Vchitect/LaVie) and [SEINE](https://github.com/Vchitect/SEINE), we thank all the contributors for open-sourcing.
|
130 |
+
|
131 |
+
|
132 |
+
## License
|
133 |
+
The code and model weights are licensed under [LICENSE](LICENSE).
|
__pycache__/utils.cpython-312.pyc
ADDED
Binary file (6.62 kB)

animated_images/aircraft.jpg
ADDED
Git LFS Details

animated_images/car.jpg
ADDED
Git LFS Details

animated_images/fireworks.jpg
ADDED

animated_images/flowers.jpg
ADDED

animated_images/forest.jpg
ADDED
Git LFS Details

animated_images/shark_unwater.jpg
ADDED
configs/sample.yaml
ADDED
@@ -0,0 +1,38 @@
# ckpt
ckpt: /mnt/lustre/maxin/work/animation/animation-v6/results_qt_003-UNet-pixabaylaion-Xfor-Gc-320_512_0303000.pt
save_img_path: "./sample_videos/"

# pretrained_model_path: "/mnt/hwfile/gcc/maxin/work/pretrained/stable-diffusion-v1-4/"
# pretrained_model_path: "maxin-cn/Cinemo"
pretrained_model_path: "./pretrained/Cinemo"

# model config:
model: UNet
video_length: 15
image_size: [320, 512]
# beta schedule
beta_start: 0.0001
beta_end: 0.02
beta_schedule: "linear"

# model speedup
use_compile: False
use_fp16: True

# sample config:
seed:
run_time: 0
use_dct: True
guidance_scale: 7.5
motion_bucket_id: 95 # [0-19] The larger the value, the stronger the motion intensity
sample_method: 'DDIM'
num_sampling_steps: 50
enable_vae_temporal_decoder: True
image_prompts: [
  ['aircraft.jpg', 'aircrafts flying'],
  ['car.jpg', 'car moving'],
  ['fireworks.jpg', 'fireworks'],
  ['flowers.jpg', 'flowers swaying'],
  ['forest.jpg', 'people walking'],
  ['shark_unwater.jpg', 'shark falling into the sea'],
]
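For orientation, a config like this is typically consumed with OmegaConf, which `demo.py` imports. The snippet below is illustrative rather than code from the repository; the field names come from the file above.

```python
from omegaconf import OmegaConf

args = OmegaConf.load("configs/sample.yaml")

# Core model and sampling settings read from the config.
print(args.model, args.video_length, list(args.image_size))              # UNet 15 [320, 512]
print(args.sample_method, args.num_sampling_steps, args.guidance_scale)  # DDIM 50 7.5
print(args.motion_bucket_id, args.use_dct)  # motion intensity and DCT noise-refinement switches

# Each entry pairs an image in animated_images/ with a text prompt.
for image_name, prompt in args.image_prompts:
    print(image_name, "->", prompt)
```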
datasets/__pycache__/video_transforms.cpython-312.pyc
ADDED
Binary file (32.3 kB)
datasets/video_transforms.py
ADDED
@@ -0,0 +1,748 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import random
|
3 |
+
import numbers
|
4 |
+
from torchvision.transforms import RandomCrop, RandomResizedCrop
|
5 |
+
from PIL import Image
|
6 |
+
from torchvision.utils import _log_api_usage_once
|
7 |
+
|
8 |
+
def _is_tensor_video_clip(clip):
|
9 |
+
if not torch.is_tensor(clip):
|
10 |
+
raise TypeError("clip should be Tensor. Got %s" % type(clip))
|
11 |
+
|
12 |
+
if not clip.ndimension() == 4:
|
13 |
+
raise ValueError("clip should be 4D. Got %dD" % clip.dim())
|
14 |
+
|
15 |
+
return True
|
16 |
+
|
17 |
+
|
18 |
+
def center_crop_arr(pil_image, image_size):
|
19 |
+
"""
|
20 |
+
Center cropping implementation from ADM.
|
21 |
+
https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
|
22 |
+
"""
|
23 |
+
while min(*pil_image.size) >= 2 * image_size:
|
24 |
+
pil_image = pil_image.resize(
|
25 |
+
tuple(x // 2 for x in pil_image.size), resample=Image.BOX
|
26 |
+
)
|
27 |
+
|
28 |
+
scale = image_size / min(*pil_image.size)
|
29 |
+
pil_image = pil_image.resize(
|
30 |
+
tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
|
31 |
+
)
|
32 |
+
|
33 |
+
arr = np.array(pil_image)
|
34 |
+
crop_y = (arr.shape[0] - image_size) // 2
|
35 |
+
crop_x = (arr.shape[1] - image_size) // 2
|
36 |
+
return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])
|
37 |
+
|
38 |
+
|
39 |
+
def crop(clip, i, j, h, w):
|
40 |
+
"""
|
41 |
+
Args:
|
42 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
43 |
+
"""
|
44 |
+
if len(clip.size()) != 4:
|
45 |
+
raise ValueError("clip should be a 4D tensor")
|
46 |
+
return clip[..., i : i + h, j : j + w]
|
47 |
+
|
48 |
+
|
49 |
+
def resize(clip, target_size, interpolation_mode):
|
50 |
+
if len(target_size) != 2:
|
51 |
+
raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
|
52 |
+
return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
|
53 |
+
|
54 |
+
def resize_scale(clip, target_size, interpolation_mode):
|
55 |
+
if len(target_size) != 2:
|
56 |
+
raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
|
57 |
+
H, W = clip.size(-2), clip.size(-1)
|
58 |
+
scale_ = target_size[0] / min(H, W)
|
59 |
+
return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
|
60 |
+
|
61 |
+
def resize_with_scale_factor(clip, scale_factor, interpolation_mode):
|
62 |
+
return torch.nn.functional.interpolate(clip, scale_factor=scale_factor, mode=interpolation_mode, align_corners=False)
|
63 |
+
|
64 |
+
def resize_scale_with_height(clip, target_size, interpolation_mode):
|
65 |
+
H, W = clip.size(-2), clip.size(-1)
|
66 |
+
scale_ = target_size / H
|
67 |
+
return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
|
68 |
+
|
69 |
+
def resize_scale_with_weight(clip, target_size, interpolation_mode):
|
70 |
+
H, W = clip.size(-2), clip.size(-1)
|
71 |
+
scale_ = target_size / W
|
72 |
+
return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
|
73 |
+
|
74 |
+
|
75 |
+
def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
|
76 |
+
"""
|
77 |
+
Do spatial cropping and resizing to the video clip
|
78 |
+
Args:
|
79 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
80 |
+
i (int): i in (i,j) i.e coordinates of the upper left corner.
|
81 |
+
j (int): j in (i,j) i.e coordinates of the upper left corner.
|
82 |
+
h (int): Height of the cropped region.
|
83 |
+
w (int): Width of the cropped region.
|
84 |
+
size (tuple(int, int)): height and width of resized clip
|
85 |
+
Returns:
|
86 |
+
clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
|
87 |
+
"""
|
88 |
+
if not _is_tensor_video_clip(clip):
|
89 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
90 |
+
clip = crop(clip, i, j, h, w)
|
91 |
+
clip = resize(clip, size, interpolation_mode)
|
92 |
+
return clip
|
93 |
+
|
94 |
+
|
95 |
+
def center_crop(clip, crop_size):
|
96 |
+
if not _is_tensor_video_clip(clip):
|
97 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
98 |
+
h, w = clip.size(-2), clip.size(-1)
|
99 |
+
# print(clip.shape)
|
100 |
+
th, tw = crop_size
|
101 |
+
if h < th or w < tw:
|
102 |
+
# print(h, w)
|
103 |
+
raise ValueError("height {} and width {} must be no smaller than crop_size".format(h, w))
|
104 |
+
|
105 |
+
i = int(round((h - th) / 2.0))
|
106 |
+
j = int(round((w - tw) / 2.0))
|
107 |
+
return crop(clip, i, j, th, tw), i, j
|
108 |
+
|
109 |
+
|
110 |
+
def center_crop_using_short_edge(clip):
|
111 |
+
if not _is_tensor_video_clip(clip):
|
112 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
113 |
+
h, w = clip.size(-2), clip.size(-1)
|
114 |
+
if h < w:
|
115 |
+
th, tw = h, h
|
116 |
+
i = 0
|
117 |
+
j = int(round((w - tw) / 2.0))
|
118 |
+
else:
|
119 |
+
th, tw = w, w
|
120 |
+
i = int(round((h - th) / 2.0))
|
121 |
+
j = 0
|
122 |
+
return crop(clip, i, j, th, tw)
|
123 |
+
|
124 |
+
|
125 |
+
def random_shift_crop(clip):
|
126 |
+
'''
|
127 |
+
Slide along the long edge, with the short edge as crop size
|
128 |
+
'''
|
129 |
+
if not _is_tensor_video_clip(clip):
|
130 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
131 |
+
h, w = clip.size(-2), clip.size(-1)
|
132 |
+
|
133 |
+
if h <= w:
|
134 |
+
long_edge = w
|
135 |
+
short_edge = h
|
136 |
+
else:
|
137 |
+
long_edge = h
|
138 |
+
short_edge =w
|
139 |
+
|
140 |
+
th, tw = short_edge, short_edge
|
141 |
+
|
142 |
+
i = torch.randint(0, h - th + 1, size=(1,)).item()
|
143 |
+
j = torch.randint(0, w - tw + 1, size=(1,)).item()
|
144 |
+
return crop(clip, i, j, th, tw), i, j
|
145 |
+
|
146 |
+
def random_crop(clip, crop_size):
|
147 |
+
if not _is_tensor_video_clip(clip):
|
148 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
149 |
+
h, w = clip.size(-2), clip.size(-1)
|
150 |
+
th, tw = crop_size[-2], crop_size[-1]
|
151 |
+
|
152 |
+
if h < th or w < tw:
|
153 |
+
raise ValueError("height {} and width {} must be no smaller than crop_size".format(h, w))
|
154 |
+
|
155 |
+
i = torch.randint(0, h - th + 1, size=(1,)).item()
|
156 |
+
j = torch.randint(0, w - tw + 1, size=(1,)).item()
|
157 |
+
clip_crop = crop(clip, i, j, th, tw)
|
158 |
+
return clip_crop, i, j
|
159 |
+
|
160 |
+
|
161 |
+
def to_tensor(clip):
|
162 |
+
"""
|
163 |
+
Convert tensor data type from uint8 to float, divide value by 255.0 and
|
164 |
+
permute the dimensions of clip tensor
|
165 |
+
Args:
|
166 |
+
clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
|
167 |
+
Return:
|
168 |
+
clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
|
169 |
+
"""
|
170 |
+
_is_tensor_video_clip(clip)
|
171 |
+
if not clip.dtype == torch.uint8:
|
172 |
+
raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
|
173 |
+
# return clip.float().permute(3, 0, 1, 2) / 255.0
|
174 |
+
return clip.float() / 255.0
|
175 |
+
|
176 |
+
|
177 |
+
def normalize(clip, mean, std, inplace=False):
|
178 |
+
"""
|
179 |
+
Args:
|
180 |
+
clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
|
181 |
+
mean (tuple): pixel RGB mean. Size is (3)
|
182 |
+
std (tuple): pixel standard deviation. Size is (3)
|
183 |
+
Returns:
|
184 |
+
normalized clip (torch.tensor): Size is (T, C, H, W)
|
185 |
+
"""
|
186 |
+
if not _is_tensor_video_clip(clip):
|
187 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
188 |
+
if not inplace:
|
189 |
+
clip = clip.clone()
|
190 |
+
mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
|
191 |
+
# print(mean)
|
192 |
+
std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
|
193 |
+
clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
|
194 |
+
return clip
|
195 |
+
|
196 |
+
|
197 |
+
def hflip(clip):
|
198 |
+
"""
|
199 |
+
Args:
|
200 |
+
clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
|
201 |
+
Returns:
|
202 |
+
flipped clip (torch.tensor): Size is (T, C, H, W)
|
203 |
+
"""
|
204 |
+
if not _is_tensor_video_clip(clip):
|
205 |
+
raise ValueError("clip should be a 4D torch.tensor")
|
206 |
+
return clip.flip(-1)
|
207 |
+
|
208 |
+
|
209 |
+
class RandomCropVideo:
|
210 |
+
def __init__(self, size):
|
211 |
+
if isinstance(size, numbers.Number):
|
212 |
+
self.size = (int(size), int(size))
|
213 |
+
else:
|
214 |
+
self.size = size
|
215 |
+
|
216 |
+
def __call__(self, clip):
|
217 |
+
"""
|
218 |
+
Args:
|
219 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
220 |
+
Returns:
|
221 |
+
torch.tensor: randomly cropped video clip.
|
222 |
+
size is (T, C, OH, OW)
|
223 |
+
"""
|
224 |
+
i, j, h, w = self.get_params(clip)
|
225 |
+
return crop(clip, i, j, h, w)
|
226 |
+
|
227 |
+
def get_params(self, clip):
|
228 |
+
h, w = clip.shape[-2:]
|
229 |
+
th, tw = self.size
|
230 |
+
|
231 |
+
if h < th or w < tw:
|
232 |
+
raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
|
233 |
+
|
234 |
+
if w == tw and h == th:
|
235 |
+
return 0, 0, h, w
|
236 |
+
|
237 |
+
i = torch.randint(0, h - th + 1, size=(1,)).item()
|
238 |
+
j = torch.randint(0, w - tw + 1, size=(1,)).item()
|
239 |
+
|
240 |
+
return i, j, th, tw
|
241 |
+
|
242 |
+
def __repr__(self) -> str:
|
243 |
+
return f"{self.__class__.__name__}(size={self.size})"
|
244 |
+
|
245 |
+
class CenterCropResizeVideo:
|
246 |
+
'''
|
247 |
+
First use the short side for cropping length,
|
248 |
+
center crop video, then resize to the specified size
|
249 |
+
'''
|
250 |
+
def __init__(
|
251 |
+
self,
|
252 |
+
size,
|
253 |
+
interpolation_mode="bilinear",
|
254 |
+
):
|
255 |
+
if isinstance(size, tuple):
|
256 |
+
if len(size) != 2:
|
257 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
258 |
+
self.size = size
|
259 |
+
else:
|
260 |
+
self.size = (size, size)
|
261 |
+
|
262 |
+
self.interpolation_mode = interpolation_mode
|
263 |
+
|
264 |
+
|
265 |
+
def __call__(self, clip):
|
266 |
+
"""
|
267 |
+
Args:
|
268 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
269 |
+
Returns:
|
270 |
+
torch.tensor: scale resized / center cropped video clip.
|
271 |
+
size is (T, C, crop_size, crop_size)
|
272 |
+
"""
|
273 |
+
# print(clip.shape)
|
274 |
+
clip_center_crop = center_crop_using_short_edge(clip)
|
275 |
+
# print(clip_center_crop.shape) 320 512
|
276 |
+
clip_center_crop_resize = resize(clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode)
|
277 |
+
return clip_center_crop_resize
|
278 |
+
|
279 |
+
def __repr__(self) -> str:
|
280 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
281 |
+
|
282 |
+
|
283 |
+
class SDXL:
|
284 |
+
def __init__(
|
285 |
+
self,
|
286 |
+
size,
|
287 |
+
interpolation_mode="bilinear",
|
288 |
+
):
|
289 |
+
if isinstance(size, tuple):
|
290 |
+
if len(size) != 2:
|
291 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
292 |
+
self.size = size
|
293 |
+
else:
|
294 |
+
self.size = (size, size)
|
295 |
+
|
296 |
+
self.interpolation_mode = interpolation_mode
|
297 |
+
|
298 |
+
def __call__(self, clip):
|
299 |
+
"""
|
300 |
+
Args:
|
301 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
302 |
+
Returns:
|
303 |
+
torch.tensor: scale resized / center cropped video clip.
|
304 |
+
size is (T, C, crop_size, crop_size)
|
305 |
+
"""
|
306 |
+
# add aditional one pixel for avoiding error in center crop
|
307 |
+
ori_h, ori_w = clip.size(-2), clip.size(-1)
|
308 |
+
tar_h, tar_w = self.size[0] + 1, self.size[1] + 1
|
309 |
+
|
310 |
+
# if ori_h >= tar_h and ori_w >= tar_w:
|
311 |
+
# clip_tar_crop, i, j = random_crop(clip=clip, crop_size=self.size)
|
312 |
+
# else:
|
313 |
+
# tar_h_div_ori_h = tar_h / ori_h
|
314 |
+
# tar_w_div_ori_w = tar_w / ori_w
|
315 |
+
# if tar_h_div_ori_h > tar_w_div_ori_w:
|
316 |
+
# clip = resize_with_scale_factor(clip=clip, scale_factor=tar_h_div_ori_h, interpolation_mode=self.interpolation_mode)
|
317 |
+
# else:
|
318 |
+
# clip = resize_with_scale_factor(clip=clip, scale_factor=tar_w_div_ori_w, interpolation_mode=self.interpolation_mode)
|
319 |
+
# clip_tar_crop, i, j = random_crop(clip, self.size)
|
320 |
+
if ori_h >= tar_h and ori_w >= tar_w:
|
321 |
+
tar_h_div_ori_h = tar_h / ori_h
|
322 |
+
tar_w_div_ori_w = tar_w / ori_w
|
323 |
+
if tar_h_div_ori_h > tar_w_div_ori_w:
|
324 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_h_div_ori_h, interpolation_mode=self.interpolation_mode)
|
325 |
+
else:
|
326 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_w_div_ori_w, interpolation_mode=self.interpolation_mode)
|
327 |
+
ori_h, ori_w = clip.size(-2), clip.size(-1)
|
328 |
+
clip_tar_crop, i, j = random_crop(clip, self.size)
|
329 |
+
else:
|
330 |
+
tar_h_div_ori_h = tar_h / ori_h
|
331 |
+
tar_w_div_ori_w = tar_w / ori_w
|
332 |
+
if tar_h_div_ori_h > tar_w_div_ori_w:
|
333 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_h_div_ori_h, interpolation_mode=self.interpolation_mode)
|
334 |
+
else:
|
335 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_w_div_ori_w, interpolation_mode=self.interpolation_mode)
|
336 |
+
clip_tar_crop, i, j = random_crop(clip, self.size)
|
337 |
+
return clip_tar_crop, ori_h, ori_w, i, j
|
338 |
+
|
339 |
+
def __repr__(self) -> str:
|
340 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
341 |
+
|
342 |
+
|
343 |
+
class SDXLCenterCrop:
|
344 |
+
def __init__(
|
345 |
+
self,
|
346 |
+
size,
|
347 |
+
interpolation_mode="bilinear",
|
348 |
+
):
|
349 |
+
if isinstance(size, tuple):
|
350 |
+
if len(size) != 2:
|
351 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
352 |
+
self.size = size
|
353 |
+
else:
|
354 |
+
self.size = (size, size)
|
355 |
+
|
356 |
+
self.interpolation_mode = interpolation_mode
|
357 |
+
|
358 |
+
|
359 |
+
def __call__(self, clip):
|
360 |
+
"""
|
361 |
+
Args:
|
362 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
363 |
+
Returns:
|
364 |
+
torch.tensor: scale resized / center cropped video clip.
|
365 |
+
size is (T, C, crop_size, crop_size)
|
366 |
+
"""
|
367 |
+
# add aditional one pixel for avoiding error in center crop
|
368 |
+
ori_h, ori_w = clip.size(-2), clip.size(-1)
|
369 |
+
tar_h, tar_w = self.size[0] + 1, self.size[1] + 1
|
370 |
+
tar_h_div_ori_h = tar_h / ori_h
|
371 |
+
tar_w_div_ori_w = tar_w / ori_w
|
372 |
+
# print('before resize', clip.shape)
|
373 |
+
if tar_h_div_ori_h > tar_w_div_ori_w:
|
374 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_h_div_ori_h, interpolation_mode=self.interpolation_mode)
|
375 |
+
# print('after h resize', clip.shape)
|
376 |
+
else:
|
377 |
+
clip = resize_with_scale_factor(clip=clip, scale_factor=tar_w_div_ori_w, interpolation_mode=self.interpolation_mode)
|
378 |
+
# print('after resize', clip.shape)
|
379 |
+
# print(clip.shape)
|
380 |
+
# clip_tar_crop, i, j = random_crop(clip, self.size)
|
381 |
+
clip_tar_crop, i, j = center_crop(clip, self.size)
|
382 |
+
# print('after crop', clip_tar_crop.shape)
|
383 |
+
|
384 |
+
return clip_tar_crop, ori_h, ori_w, i, j
|
385 |
+
|
386 |
+
def __repr__(self) -> str:
|
387 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
388 |
+
|
389 |
+
|
390 |
+
class InternVideo320512:
|
391 |
+
def __init__(
|
392 |
+
self,
|
393 |
+
size,
|
394 |
+
interpolation_mode="bilinear",
|
395 |
+
):
|
396 |
+
if isinstance(size, tuple):
|
397 |
+
if len(size) != 2:
|
398 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
399 |
+
self.size = size
|
400 |
+
else:
|
401 |
+
self.size = (size, size)
|
402 |
+
|
403 |
+
self.interpolation_mode = interpolation_mode
|
404 |
+
|
405 |
+
|
406 |
+
def __call__(self, clip):
|
407 |
+
"""
|
408 |
+
Args:
|
409 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
410 |
+
Returns:
|
411 |
+
torch.tensor: scale resized / center cropped video clip.
|
412 |
+
size is (T, C, crop_size, crop_size)
|
413 |
+
"""
|
414 |
+
# add aditional one pixel for avoiding error in center crop
|
415 |
+
h, w = clip.size(-2), clip.size(-1)
|
416 |
+
# print('before resize', clip.shape)
|
417 |
+
if h < 320:
|
418 |
+
clip = resize_scale_with_height(clip=clip, target_size=321, interpolation_mode=self.interpolation_mode)
|
419 |
+
# print('after h resize', clip.shape)
|
420 |
+
if w < 512:
|
421 |
+
clip = resize_scale_with_weight(clip=clip, target_size=513, interpolation_mode=self.interpolation_mode)
|
422 |
+
# print('after w resize', clip.shape)
|
423 |
+
# print(clip.shape)
|
424 |
+
clip_center_crop = center_crop(clip, self.size)
|
425 |
+
clip_center_crop_no_subtitles = center_crop(clip, (220, 352))
|
426 |
+
clip_center_resize = resize(clip_center_crop_no_subtitles, target_size=self.size, interpolation_mode=self.interpolation_mode)
|
427 |
+
# print(clip_center_crop.shape)
|
428 |
+
return clip_center_resize
|
429 |
+
|
430 |
+
def __repr__(self) -> str:
|
431 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
432 |
+
|
433 |
+
class CenterCropVideo:
|
434 |
+
'''
|
435 |
+
First scale to the specified size in equal proportion to the short edge,
|
436 |
+
then center cropping
|
437 |
+
'''
|
438 |
+
def __init__(
|
439 |
+
self,
|
440 |
+
size,
|
441 |
+
interpolation_mode="bilinear",
|
442 |
+
):
|
443 |
+
if isinstance(size, tuple):
|
444 |
+
if len(size) != 2:
|
445 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
446 |
+
self.size = size
|
447 |
+
else:
|
448 |
+
self.size = (size, size)
|
449 |
+
|
450 |
+
self.interpolation_mode = interpolation_mode
|
451 |
+
|
452 |
+
|
453 |
+
def __call__(self, clip):
|
454 |
+
"""
|
455 |
+
Args:
|
456 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
457 |
+
Returns:
|
458 |
+
torch.tensor: scale resized / center cropped video clip.
|
459 |
+
size is (T, C, crop_size, crop_size)
|
460 |
+
"""
|
461 |
+
clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
|
462 |
+
clip_center_crop = center_crop(clip_resize, self.size)
|
463 |
+
return clip_center_crop
|
464 |
+
|
465 |
+
def __repr__(self) -> str:
|
466 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
467 |
+
|
468 |
+
class KineticsRandomCropResizeVideo:
|
469 |
+
'''
|
470 |
+
Slide along the long edge, with the short edge as crop size. And resie to the desired size.
|
471 |
+
'''
|
472 |
+
def __init__(
|
473 |
+
self,
|
474 |
+
size,
|
475 |
+
interpolation_mode="bilinear",
|
476 |
+
):
|
477 |
+
if isinstance(size, tuple):
|
478 |
+
if len(size) != 2:
|
479 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
480 |
+
self.size = size
|
481 |
+
else:
|
482 |
+
self.size = (size, size)
|
483 |
+
|
484 |
+
self.interpolation_mode = interpolation_mode
|
485 |
+
|
486 |
+
def __call__(self, clip):
|
487 |
+
clip_random_crop = random_shift_crop(clip)
|
488 |
+
clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
|
489 |
+
return clip_resize
|
490 |
+
|
491 |
+
class ResizeVideo():
|
492 |
+
'''
|
493 |
+
First use the short side for cropping length,
|
494 |
+
center crop video, then resize to the specified size
|
495 |
+
'''
|
496 |
+
def __init__(
|
497 |
+
self,
|
498 |
+
size,
|
499 |
+
interpolation_mode="bilinear",
|
500 |
+
):
|
501 |
+
if isinstance(size, tuple):
|
502 |
+
if len(size) != 2:
|
503 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
504 |
+
self.size = size
|
505 |
+
else:
|
506 |
+
self.size = (size, size)
|
507 |
+
|
508 |
+
self.interpolation_mode = interpolation_mode
|
509 |
+
|
510 |
+
|
511 |
+
def __call__(self, clip):
|
512 |
+
"""
|
513 |
+
Args:
|
514 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
515 |
+
Returns:
|
516 |
+
torch.tensor: scale resized / center cropped video clip.
|
517 |
+
size is (T, C, crop_size, crop_size)
|
518 |
+
"""
|
519 |
+
clip_resize = resize(clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
|
520 |
+
return clip_resize
|
521 |
+
|
522 |
+
def __repr__(self) -> str:
|
523 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
524 |
+
|
525 |
+
class CenterCropVideo:
|
526 |
+
def __init__(
|
527 |
+
self,
|
528 |
+
size,
|
529 |
+
interpolation_mode="bilinear",
|
530 |
+
):
|
531 |
+
if isinstance(size, tuple):
|
532 |
+
if len(size) != 2:
|
533 |
+
raise ValueError(f"size should be tuple (height, width), instead got {size}")
|
534 |
+
self.size = size
|
535 |
+
else:
|
536 |
+
self.size = (size, size)
|
537 |
+
|
538 |
+
self.interpolation_mode = interpolation_mode
|
539 |
+
|
540 |
+
|
541 |
+
def __call__(self, clip):
|
542 |
+
"""
|
543 |
+
Args:
|
544 |
+
clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
|
545 |
+
Returns:
|
546 |
+
torch.tensor: center cropped video clip.
|
547 |
+
size is (T, C, crop_size, crop_size)
|
548 |
+
"""
|
549 |
+
clip_center_crop = center_crop(clip, self.size)
|
550 |
+
return clip_center_crop
|
551 |
+
|
552 |
+
def __repr__(self) -> str:
|
553 |
+
return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
|
554 |
+
|
555 |
+
|
556 |
+
class NormalizeVideo:
|
557 |
+
"""
|
558 |
+
Normalize the video clip by mean subtraction and division by standard deviation
|
559 |
+
Args:
|
560 |
+
mean (3-tuple): pixel RGB mean
|
561 |
+
std (3-tuple): pixel RGB standard deviation
|
562 |
+
inplace (boolean): whether do in-place normalization
|
563 |
+
"""
|
564 |
+
|
565 |
+
def __init__(self, mean, std, inplace=False):
|
566 |
+
self.mean = mean
|
567 |
+
self.std = std
|
568 |
+
self.inplace = inplace
|
569 |
+
|
570 |
+
def __call__(self, clip):
|
571 |
+
"""
|
572 |
+
Args:
|
573 |
+
clip (torch.tensor): video clip must be normalized. Size is (C, T, H, W)
|
574 |
+
"""
|
575 |
+
return normalize(clip, self.mean, self.std, self.inplace)
|
576 |
+
|
577 |
+
def __repr__(self) -> str:
|
578 |
+
return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
|
579 |
+
|
580 |
+
|
581 |
+
class ToTensorVideo:
|
582 |
+
"""
|
583 |
+
Convert tensor data type from uint8 to float, divide value by 255.0 and
|
584 |
+
permute the dimensions of clip tensor
|
585 |
+
"""
|
586 |
+
|
587 |
+
def __init__(self):
|
588 |
+
pass
|
589 |
+
|
590 |
+
def __call__(self, clip):
|
591 |
+
"""
|
592 |
+
Args:
|
593 |
+
clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
|
594 |
+
Return:
|
595 |
+
clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
|
596 |
+
"""
|
597 |
+
return to_tensor(clip)
|
598 |
+
|
599 |
+
def __repr__(self) -> str:
|
600 |
+
return self.__class__.__name__
|
601 |
+
|
602 |
+
|
603 |
+
class RandomHorizontalFlipVideo:
|
604 |
+
"""
|
605 |
+
Flip the video clip along the horizontal direction with a given probability
|
606 |
+
Args:
|
607 |
+
p (float): probability of the clip being flipped. Default value is 0.5
|
608 |
+
"""
|
609 |
+
|
610 |
+
def __init__(self, p=0.5):
|
611 |
+
self.p = p
|
612 |
+
|
613 |
+
def __call__(self, clip):
|
614 |
+
"""
|
615 |
+
Args:
|
616 |
+
clip (torch.tensor): Size is (T, C, H, W)
|
617 |
+
Return:
|
618 |
+
clip (torch.tensor): Size is (T, C, H, W)
|
619 |
+
"""
|
620 |
+
if random.random() < self.p:
|
621 |
+
clip = hflip(clip)
|
622 |
+
return clip
|
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(p={self.p})"


class Compose:
    """Composes several transforms together. This transform does not support torchscript.
    Please, see the note below.

    Args:
        transforms (list of ``Transform`` objects): list of transforms to compose.

    Example:
        >>> transforms.Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.PILToTensor(),
        >>>     transforms.ConvertImageDtype(torch.float),
        >>> ])

    .. note::
        In order to script the transformations, please use ``torch.nn.Sequential`` as below.

        >>> transforms = torch.nn.Sequential(
        >>>     transforms.CenterCrop(10),
        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        >>> )
        >>> scripted_transforms = torch.jit.script(transforms)

        Make sure to use only scriptable transformations, i.e. those that work with ``torch.Tensor``
        and do not require ``lambda`` functions or ``PIL.Image``.
    """

    def __init__(self, transforms):
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            _log_api_usage_once(self)
        self.transforms = transforms

    def __call__(self, img):
        # Default crop metadata so the return below cannot raise UnboundLocalError
        # when no SDXL-style transform is present in the list.
        ori_h = ori_w = crops_coords_top = crops_coords_left = None
        for t in self.transforms:
            if isinstance(t, (SDXLCenterCrop, SDXL)):
                img, ori_h, ori_w, crops_coords_top, crops_coords_left = t(img)
            else:
                img = t(img)
        return img, ori_h, ori_w, crops_coords_top, crops_coords_left

    def __repr__(self) -> str:
        format_string = self.__class__.__name__ + "("
        for t in self.transforms:
            format_string += "\n"
            format_string += f"    {t}"
        format_string += "\n)"
        return format_string


# ------------------------------------------------------------
# --------------------- Sampling ---------------------------
# ------------------------------------------------------------
class TemporalRandomCrop(object):
    """Temporally crop the given frame indices at a random location.

    Args:
        size (int): Desired number of frames to be seen by the model.
    """

    def __init__(self, size):
        self.size = size

    def __call__(self, total_frames):
        rand_end = max(0, total_frames - self.size - 1)
        begin_index = random.randint(0, rand_end)
        end_index = min(begin_index + self.size, total_frames)
        return begin_index, end_index


if __name__ == '__main__':
    from torchvision import transforms
    import torchvision.io as io
    import numpy as np
    from torchvision.utils import save_image
    import os

    vframes, aframes, info = io.read_video(
        filename='./v_Archery_g01_c03.avi',
        pts_unit='sec',
        output_format='TCHW'
    )

    trans = transforms.Compose([
        ToTensorVideo(),
        RandomHorizontalFlipVideo(),
        UCFCenterCropVideo(512),
        # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])

    target_video_len = 32
    frame_interval = 1
    total_frames = len(vframes)
    print(total_frames)

    temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)

    # Sampling video frames
    start_frame_ind, end_frame_ind = temporal_sample(total_frames)
    # print(start_frame_ind)
    # print(end_frame_ind)
    assert end_frame_ind - start_frame_ind >= target_video_len
    frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
    print(frame_indice)

    select_vframes = vframes[frame_indice]
    print(select_vframes.shape)
    print(select_vframes.dtype)

    select_vframes_trans = trans(select_vframes)
    print(select_vframes_trans.shape)
    print(select_vframes_trans.dtype)

    select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
    print(select_vframes_trans_int.dtype)
    print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)

    io.write_video('./test.avi', select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)

    for i in range(target_video_len):
        save_image(select_vframes_trans[i], os.path.join('./test000', '%04d.png' % i), normalize=True, value_range=(-1, 1))
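The __main__ block above already exercises these transforms end to end; as a compact reference, the temporal sampling pattern it demonstrates is sketched below (a minimal example assuming only numpy and the TemporalRandomCrop class defined above; the lengths are hypothetical).

import numpy as np

target_video_len = 16   # hypothetical clip length
total_frames = 120      # hypothetical source video length

temporal_sample = TemporalRandomCrop(target_video_len)
start, end = temporal_sample(total_frames)  # random window of `size` frames
frame_indices = np.linspace(start, end - 1, target_video_len, dtype=int)
# `frame_indices` is then used to index the decoded video tensor, as in the demo above.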
demo.py
ADDED
@@ -0,0 +1,311 @@
import gradio as gr
import os
import torch
import argparse
import torchvision


from pipelines.pipeline_videogen import VideoGenPipeline
from diffusers.schedulers import DDIMScheduler
from diffusers.models import AutoencoderKL
from diffusers.models import AutoencoderKLTemporalDecoder
from transformers import CLIPTokenizer, CLIPTextModel
from omegaconf import OmegaConf

import os, sys
sys.path.append(os.path.split(sys.path[0])[0])
from models import get_models
import imageio
from PIL import Image
import numpy as np
from datasets import video_transforms
from torchvision import transforms
from einops import rearrange, repeat
from utils import dct_low_pass_filter, exchanged_mixed_dct_freq
from copy import deepcopy
import spaces
import requests
from datetime import datetime
import random

parser = argparse.ArgumentParser()
parser.add_argument("--config", type=str, default="./configs/sample.yaml")
args = parser.parse_args()
args = OmegaConf.load(args.config)

torch.set_grad_enabled(False)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16

unet = get_models(args).to(device, dtype=dtype)

if args.enable_vae_temporal_decoder:
    if args.use_dct:
        vae_for_base_content = AutoencoderKLTemporalDecoder.from_pretrained(args.pretrained_model_path, subfolder="vae_temporal_decoder", torch_dtype=torch.float64).to(device)
    else:
        vae_for_base_content = AutoencoderKLTemporalDecoder.from_pretrained(args.pretrained_model_path, subfolder="vae_temporal_decoder", torch_dtype=torch.float16).to(device)
    vae = deepcopy(vae_for_base_content).to(dtype=dtype)
else:
    vae_for_base_content = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae").to(device, dtype=torch.float64)
    vae = deepcopy(vae_for_base_content).to(dtype=dtype)
tokenizer = CLIPTokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder", torch_dtype=dtype).to(device)  # huge

# set eval mode
unet.eval()
vae.eval()
text_encoder.eval()

basedir = os.getcwd()
savedir = os.path.join(basedir, "samples/Gradio", datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
savedir_sample = os.path.join(savedir, "sample")
os.makedirs(savedir, exist_ok=True)

def update_and_resize_image(input_image_path, height_slider, width_slider):
    if input_image_path.startswith("http://") or input_image_path.startswith("https://"):
        pil_image = Image.open(requests.get(input_image_path, stream=True).raw).convert('RGB')
    else:
        pil_image = Image.open(input_image_path).convert('RGB')

    original_width, original_height = pil_image.size

    if original_height == height_slider and original_width == width_slider:
        return gr.Image(value=np.array(pil_image))

    ratio1 = height_slider / original_height
    ratio2 = width_slider / original_width

    if ratio1 > ratio2:
        new_width = int(original_width * ratio1)
        new_height = int(original_height * ratio1)
    else:
        new_width = int(original_width * ratio2)
        new_height = int(original_height * ratio2)

    pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)

    left = (new_width - width_slider) / 2
    top = (new_height - height_slider) / 2
    right = left + width_slider
    bottom = top + height_slider

    pil_image = pil_image.crop((left, top, right, bottom))

    return gr.Image(value=np.array(pil_image))


def update_textbox_and_save_image(input_image, height_slider, width_slider):
    pil_image = Image.fromarray(input_image.astype(np.uint8)).convert("RGB")

    original_width, original_height = pil_image.size

    ratio1 = height_slider / original_height
    ratio2 = width_slider / original_width

    if ratio1 > ratio2:
        new_width = int(original_width * ratio1)
        new_height = int(original_height * ratio1)
    else:
        new_width = int(original_width * ratio2)
        new_height = int(original_height * ratio2)

    pil_image = pil_image.resize((new_width, new_height), Image.LANCZOS)

    left = (new_width - width_slider) / 2
    top = (new_height - height_slider) / 2
    right = left + width_slider
    bottom = top + height_slider

    pil_image = pil_image.crop((left, top, right, bottom))

    img_path = os.path.join(savedir, "input_image.png")
    pil_image.save(img_path)

    return gr.Textbox(value=img_path), gr.Image(value=np.array(pil_image))

def prepare_image(image, vae, transform_video, device, dtype=torch.float16):
    image = torch.as_tensor(np.array(image, dtype=np.uint8, copy=True)).unsqueeze(0).permute(0, 3, 1, 2)
    image = transform_video(image)
    image = vae.encode(image.to(dtype=dtype, device=device)).latent_dist.sample().mul_(vae.config.scaling_factor)
    image = image.unsqueeze(2)
    return image


@spaces.GPU
def gen_video(input_image, prompt, negative_prompt, diffusion_step, height, width, scfg_scale, use_dctinit, dct_coefficients, noise_level, motion_bucket_id, seed):

    torch.manual_seed(seed)

    scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_path,
                                              subfolder="scheduler",
                                              beta_start=args.beta_start,
                                              beta_end=args.beta_end,
                                              beta_schedule=args.beta_schedule)

    videogen_pipeline = VideoGenPipeline(vae=vae,
                                         text_encoder=text_encoder,
                                         tokenizer=tokenizer,
                                         scheduler=scheduler,
                                         unet=unet).to(device)
    # videogen_pipeline.enable_xformers_memory_efficient_attention()

    transform_video = transforms.Compose([
        video_transforms.ToTensorVideo(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    ])

    if args.use_dct:
        base_content = prepare_image(input_image, vae_for_base_content, transform_video, device, dtype=torch.float64).to(device)
    else:
        base_content = prepare_image(input_image, vae_for_base_content, transform_video, device, dtype=torch.float16).to(device)

    if use_dctinit:
        # filter params
        print("Using DCT!")
        base_content_repeat = repeat(base_content, 'b c f h w -> b c (f r) h w', r=15).contiguous()

        # define filter
        freq_filter = dct_low_pass_filter(dct_coefficients=base_content, percentage=dct_coefficients)

        noise = torch.randn(1, 4, 15, 40, 64).to(device)

        # add noise to base_content
        diffuse_timesteps = torch.full((1,), int(noise_level))
        diffuse_timesteps = diffuse_timesteps.long()

        # 3d content
        base_content_noise = scheduler.add_noise(
            original_samples=base_content_repeat.to(device),
            noise=noise,
            timesteps=diffuse_timesteps.to(device))

        # 3d content
        latents = exchanged_mixed_dct_freq(noise=noise,
                                           base_content=base_content_noise,
                                           LPF_3d=freq_filter).to(dtype=torch.float16)

    base_content = base_content.to(dtype=torch.float16)

    videos = videogen_pipeline(prompt,
                               negative_prompt=negative_prompt,
                               latents=latents if use_dctinit else None,
                               base_content=base_content,
                               video_length=15,
                               height=height,
                               width=width,
                               num_inference_steps=diffusion_step,
                               guidance_scale=scfg_scale,
                               motion_bucket_id=100 - motion_bucket_id,
                               enable_vae_temporal_decoder=args.enable_vae_temporal_decoder).video

    save_path = args.save_img_path + 'temp' + '.mp4'
    # torchvision.io.write_video(save_path, videos[0], fps=8, video_codec='h264', options={'crf': '10'})
    imageio.mimwrite(save_path, videos[0], fps=8, quality=7)
    return save_path


if not os.path.exists(args.save_img_path):
    os.makedirs(args.save_img_path)


with gr.Blocks() as demo:

    gr.Markdown("<font color=red size=6.5><center>Cinemo: Consistent and Controllable Image Animation with Motion Diffusion Models</center></font>")
    gr.Markdown(
        """<div style="display: flex;align-items: center;justify-content: center">
        [<a href="https://arxiv.org/abs/2407.15642">Arxiv Report</a>] | [<a href="https://maxin-cn.github.io/cinemo_project/">Project Page</a>] | [<a href="https://github.com/maxin-cn/Cinemo">Github</a>]</div>
        """
    )

    with gr.Column(variant="panel"):
        gr.Markdown(
            """
            - The input image can be specified with the "Input Image URL" text box or uploaded by clicking or dragging it onto the "Input Image" box.
            - The input image is automatically resized and/or center-cropped to the working resolution (320 x 512).
            - After setting the input image path, press the "Preview" button to visualize the resized input image.
            """
        )

        with gr.Row():
            prompt_textbox = gr.Textbox(label="Prompt", lines=1)
            negative_prompt_textbox = gr.Textbox(label="Negative prompt", lines=1)

        with gr.Row(equal_height=False):
            with gr.Column():
                with gr.Row():
                    sample_step_slider = gr.Slider(label="Sampling steps", value=50, minimum=10, maximum=250, step=1)

                with gr.Row():
                    seed_textbox = gr.Slider(label="Seed", value=100, minimum=1, maximum=int(1e8), step=1, interactive=True)
                    # seed_textbox = gr.Textbox(label="Seed", value=100)
                    # seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
                    # seed_button.click(fn=lambda: gr.Textbox(value=random.randint(1, int(1e8))), inputs=[], outputs=[seed_textbox])

                with gr.Row():
                    height = gr.Slider(label="Height", value=320, minimum=0, maximum=512, step=16, interactive=False)
                    width = gr.Slider(label="Width", value=512, minimum=0, maximum=512, step=16, interactive=False)
                with gr.Row():
                    txt_cfg_scale = gr.Slider(label="CFG Scale", value=7.5, minimum=1.0, maximum=20.0, step=0.1, interactive=True)
                    motion_bucket_id = gr.Slider(label="Motion Intensity", value=10, minimum=1, maximum=20, step=1, interactive=True)

                with gr.Row():
                    use_dctinit = gr.Checkbox(label="Enable DCTInit", value=True)
                    dct_coefficients = gr.Slider(label="DCT Coefficients", value=0.23, minimum=0, maximum=1, step=0.01, interactive=True)
                    noise_level = gr.Slider(label="Noise Level", value=985, minimum=1, maximum=999, step=1, interactive=True)

                generate_button = gr.Button(value="Generate", variant='primary')

            with gr.Column():
                with gr.Row():
                    input_image_path = gr.Textbox(label="Input Image URL", lines=1, scale=10, info="Press Enter or the Preview button to confirm the input image.")
                    preview_button = gr.Button(value="Preview")

                with gr.Row():
                    input_image = gr.Image(label="Input Image", interactive=True)
                    input_image.upload(fn=update_textbox_and_save_image, inputs=[input_image, height, width], outputs=[input_image_path, input_image])
                    result_video = gr.Video(label="Generated Animation", interactive=False, autoplay=True)

                preview_button.click(fn=update_and_resize_image, inputs=[input_image_path, height, width], outputs=[input_image])
                input_image_path.submit(fn=update_and_resize_image, inputs=[input_image_path, height, width], outputs=[input_image])

        EXAMPLES = [
            ["./example/aircrafts_flying/0.jpg", "aircrafts flying", "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
            ["./example/fireworks/0.jpg", "fireworks", "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
            ["./example/flowers_swaying/0.jpg", "flowers swaying", "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
            ["./example/girl_walking_on_the_beach/0.jpg", "girl walking on the beach", "", 50, 320, 512, 7.5, True, 0.23, 985, 10, 200],
            ["./example/house_rotating/0.jpg", "house rotating", "", 50, 320, 512, 7.5, True, 0.23, 985, 10, 100],
            ["./example/people_runing/0.jpg", "people runing", "", 50, 320, 512, 7.5, True, 0.23, 975, 10, 100],
        ]

        examples = gr.Examples(
            examples=EXAMPLES,
            fn=gen_video,
            inputs=[input_image, prompt_textbox, negative_prompt_textbox, sample_step_slider, height, width, txt_cfg_scale, use_dctinit, dct_coefficients, noise_level, motion_bucket_id, seed_textbox],
            outputs=[result_video],
            # cache_examples=True,
            cache_examples="lazy",
        )

        generate_button.click(
            fn=gen_video,
            inputs=[
                input_image,
                prompt_textbox,
                negative_prompt_textbox,
                sample_step_slider,
                height,
                width,
                txt_cfg_scale,
                use_dctinit,
                dct_coefficients,
                noise_level,
                motion_bucket_id,
                seed_textbox,
            ],
            outputs=[result_video]
        )

demo.launch(debug=False, share=True)

# demo.launch(server_name="0.0.0.0", server_port=10034, enable_queue=True)
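For reference, gen_video can also be driven outside the Gradio UI. The sketch below is a minimal, hypothetical standalone call that reuses the first entry of EXAMPLES above; the explicit resize to 512 x 320 matches the fixed working resolution assumed by the hard-coded latent shape, and the NumPy array mirrors what gr.Image passes to the callback.

import numpy as np
from PIL import Image

# Image path and parameter values taken from the first EXAMPLES row above.
image = Image.open("./example/aircrafts_flying/0.jpg").convert("RGB").resize((512, 320), Image.LANCZOS)
video_path = gen_video(np.array(image), "aircrafts flying", "", 50, 320, 512,
                       7.5, True, 0.23, 975, 10, 100)
print(video_path)  # written to <args.save_img_path>temp.mp4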
environment.yml
ADDED
@@ -0,0 +1,21 @@
name: cinemo
channels:
  - pytorch
  - nvidia
dependencies:
  - python >= 3.10
  - pytorch >= 2.0
  - torchvision
  - pytorch-cuda >= 11.7
  - pip:
    - timm
    - diffusers[torch]==0.24.0
    - accelerate
    - python-hostlist
    - tensorboard
    - einops
    - transformers
    - av
    - scikit-image
    - decord
    - pandas
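To reproduce this environment locally, the standard conda workflow should apply: conda env create -f environment.yml followed by conda activate cinemo (the environment name comes from the name: field above).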
example/aircrafts_flying/0.jpg
ADDED
example/aircrafts_flying/aircrafts_flying.mp4
ADDED
Binary file (533 kB)
example/car_moving/0.jpg
ADDED
example/car_moving/car_moving.mp4
ADDED
Binary file (399 kB)
example/fireworks/0.jpg
ADDED
example/fireworks/fireworks.mp4
ADDED
Binary file (479 kB)
example/flowers_swaying/0.jpg
ADDED
example/flowers_swaying/flowers_swaying.mp4
ADDED
Binary file (469 kB)
example/girl_walking_on_the_beach/0.jpg
ADDED
example/girl_walking_on_the_beach/girl_walking_on_the_beach.mp4
ADDED
Binary file (619 kB)
example/house_rotating/0.jpg
ADDED
example/house_rotating/house_rotating.mp4
ADDED
Binary file (481 kB)
example/people_runing/0.jpg
ADDED
example/people_runing/people_runing.mp4
ADDED
Binary file (482 kB)
example/shark_swimming/0.jpg
ADDED
example/shark_swimming/shark_swimming.mp4
ADDED
Binary file (282 kB)
example/windmill_turning/0.jpg
ADDED
example/windmill_turning/windmill_turning.mp4
ADDED
Binary file (403 kB)
gradio_cached_examples/39/Generated Animation/5e69f32e801f7ae77024/temp.mp4
ADDED
Binary file (226 kB)
gradio_cached_examples/39/Generated Animation/98ce26b896864325a1dd/temp.mp4
ADDED
Binary file (223 kB)
gradio_cached_examples/39/Generated Animation/b12875c4b9b633b752c4/.nfs6a1237621cfe7a8800009149
ADDED
Binary file (334 kB)
gradio_cached_examples/39/Generated Animation/b12875c4b9b633b752c4/temp.mp4
ADDED
Binary file (619 kB)
gradio_cached_examples/39/Generated Animation/b54545fbdd15c944208e/temp.mp4
ADDED
Binary file (272 kB)
gradio_cached_examples/39/Generated Animation/cf8ea2ef6e0b7eeb7fe6/.nfs88c2a0e49709591000009148
ADDED
Binary file (352 kB)
gradio_cached_examples/39/Generated Animation/cf8ea2ef6e0b7eeb7fe6/temp.mp4
ADDED
Binary file (481 kB)
gradio_cached_examples/39/Generated Animation/de8039b347e55995b4cb/temp.mp4
ADDED
Binary file (206 kB)
gradio_cached_examples/39/indices.csv
ADDED
@@ -0,0 +1,6 @@
0
1
2
3
4
5
gradio_cached_examples/39/log.csv
ADDED
@@ -0,0 +1,7 @@
Generated Animation,flag,username,timestamp
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/de8039b347e55995b4cb/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/24d44f90b57c00bd6fdccd61cc35a4f4d459388c/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:42:07.603268
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/b54545fbdd15c944208e/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/deb9620f616c3681cb074388781099f78a25dc8f/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:42:25.127236
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/5e69f32e801f7ae77024/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/d9e392300169a439b3f5721579849e3e5ce6abf9/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:44:01.949003
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/b12875c4b9b633b752c4/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/d1a5066828be61057823cf98edae890db907f358/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:45:00.196580
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/cf8ea2ef6e0b7eeb7fe6/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/5998620333bb5bdaf52b49f2de86e428df991431/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:45:28.848377
"{""video"": {""path"": ""gradio_cached_examples/39/Generated Animation/98ce26b896864325a1dd/temp.mp4"", ""url"": ""/file=/data/pe1/000scratch/slurm_tmpdir/20240727_job_53250001.VBWa/gradio/7e3c6838256fd5a89b20ad62ce42288212e62097/temp.mp4"", ""size"": null, ""orig_name"": ""temp.mp4"", ""mime_type"": null, ""is_stream"": false, ""meta"": {""_type"": ""gradio.FileData""}}, ""subtitles"": null}",,,2024-07-27 14:45:48.004623
models/__init__.py
ADDED
@@ -0,0 +1,33 @@
import os
import sys
sys.path.append(os.path.split(sys.path[0])[0])

from .unet import UNet3DConditionModel
from torch.optim.lr_scheduler import LambdaLR

def customized_lr_scheduler(optimizer, warmup_steps=5000):  # 5000 from u-vit
    def fn(step):
        if warmup_steps > 0:
            return min(step / warmup_steps, 1)
        else:
            return 1
    return LambdaLR(optimizer, fn)


def get_lr_scheduler(optimizer, name, **kwargs):
    if name == 'warmup':
        return customized_lr_scheduler(optimizer, **kwargs)
    elif name == 'cosine':
        from torch.optim.lr_scheduler import CosineAnnealingLR
        return CosineAnnealingLR(optimizer, **kwargs)
    else:
        raise NotImplementedError(name)

def get_models(args):
    if 'UNet' in args.model:
        return UNet3DConditionModel.from_pretrained(args.pretrained_model_path, subfolder="unet")
    else:
        # raising a bare string is a TypeError in Python 3; raise a proper exception instead
        raise NotImplementedError('{} Model Not Supported!'.format(args.model))
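As a usage note, demo.py above builds its backbone through this helper. A minimal standalone sketch is shown below; the checkpoint path is a hypothetical placeholder for wherever the pretrained Cinemo weights (containing a "unet" subfolder) were downloaded.

from omegaconf import OmegaConf
from models import get_models

# 'model' must contain the substring 'UNet' for get_models() to resolve it.
args = OmegaConf.create({
    "model": "UNet",
    "pretrained_model_path": "./checkpoints/cinemo",  # hypothetical local path
})
unet = get_models(args)  # returns a UNet3DConditionModel loaded from <pretrained_model_path>/unet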
models/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (1.73 kB)
models/__pycache__/attention.cpython-312.pyc
ADDED
Binary file (20.6 kB)
models/__pycache__/resnet.cpython-312.pyc
ADDED
Binary file (14.4 kB)
models/__pycache__/rotary_embedding_torch_mx.cpython-312.pyc
ADDED
Binary file (10.6 kB)
models/__pycache__/temporal_attention.cpython-312.pyc
ADDED
Binary file (21.7 kB)