bubbliiiing committed on
Commit
0bdb0bf
·
1 Parent(s): e32f3df

Update config

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. transformer/config.json +7 -2
  3. vae/config.json +31 -15
README.md CHANGED
@@ -61,9 +61,9 @@ EasyAnimateV4:
61
 
62
  We attempted to implement EasyAnimate using 3D full attention, but this structure performed moderately on slice VAE and incurred considerable training costs. As a result, the performance of version V4 did not significantly surpass that of version V3. Due to limited resources, we are migrating EasyAnimate to a retrained 16-channel MagVit to pursue better model performance.
63
 
64
- | Name | Type | Storage Space | Url | Hugging Face | Description |
65
  |--|--|--|--|--|--|
66
- | EasyAnimateV4-XL-2-InP.tar.gz | EasyAnimateV4 | Before extraction: 8.9 GB \/ After extraction: 14.0 GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV4-XL-2-InP.tar.gz) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV4-XL-2-InP)| Our official graph-generated video model is capable of predicting videos at multiple resolutions (512, 768, 1024, 1280) and has been trained on 144 frames at a rate of 24 frames per second. |
67
 
68
 
69
  # Algorithm Detailed
 
61
 
62
  We attempted to implement EasyAnimate using 3D full attention, but this structure performed moderately on slice VAE and incurred considerable training costs. As a result, the performance of version V4 did not significantly surpass that of version V3. Due to limited resources, we are migrating EasyAnimate to a retrained 16-channel MagVit to pursue better model performance.
63
 
64
+ | Name | Type | Storage Space | Hugging Face | Model Scope | Description |
65
  |--|--|--|--|--|--|
66
+ | EasyAnimateV4-XL-2-InP.tar.gz | EasyAnimateV4 | Before extraction: 8.9 GB / After extraction: 14.0 GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV4-XL-2-InP) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV4-XL-2-InP) | Our official graph-generated video model is capable of predicting videos at multiple resolutions (512, 768, 1024, 1280) and has been trained on 144 frames at a rate of 24 frames per second. |
67
 
68
 
69
  # Algorithm Detailed
transformer/config.json CHANGED
@@ -1,12 +1,15 @@
1
  {
2
  "_class_name": "HunyuanTransformer3DModel",
3
- "_diffusers_version": "0.28.2",
4
  "activation_fn": "gelu-approximate",
 
5
  "after_norm": false,
6
  "attention_head_dim": 88,
7
  "basic_block_type": "basic",
8
  "cross_attention_dim": 1024,
9
  "cross_attention_dim_t5": 2048,
 
 
10
  "hidden_size": 1408,
11
  "in_channels": 12,
12
  "learn_sigma": true,
@@ -23,8 +26,10 @@
23
  "patch_size": 2,
24
  "pooled_projection_dim": 1024,
25
  "projection_dim": 1024,
 
26
  "sample_size": 128,
27
  "text_len": 77,
28
  "text_len_t5": 256,
29
- "time_position_encoding": true
 
30
  }
 
1
  {
2
  "_class_name": "HunyuanTransformer3DModel",
3
+ "_diffusers_version": "0.30.1",
4
  "activation_fn": "gelu-approximate",
5
+ "add_noise_in_inpaint_model": false,
6
  "after_norm": false,
7
  "attention_head_dim": 88,
8
  "basic_block_type": "basic",
9
  "cross_attention_dim": 1024,
10
  "cross_attention_dim_t5": 2048,
11
+ "enable_clip_in_inpaint": true,
12
+ "enable_text_attention_mask": true,
13
  "hidden_size": 1408,
14
  "in_channels": 12,
15
  "learn_sigma": true,
 
26
  "patch_size": 2,
27
  "pooled_projection_dim": 1024,
28
  "projection_dim": 1024,
29
+ "resize_inpaint_mask_directly": false,
30
  "sample_size": 128,
31
  "text_len": 77,
32
  "text_len_t5": 256,
33
+ "time_position_encoding": true,
34
+ "time_position_encoding_type": "2d_rope"
35
  }
vae/config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "_class_name": "AutoencoderKL",
3
- "_diffusers_version": "0.22.0.dev0",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
@@ -8,9 +8,18 @@
8
  512,
9
  512
10
  ],
 
 
 
 
 
 
 
 
 
11
  "down_block_types": [
12
- "SpatialDownBlock3D",
13
- "SpatialTemporalDownBlock3D",
14
  "SpatialTemporalDownBlock3D",
15
  "SpatialTemporalDownBlock3D"
16
  ],
@@ -18,24 +27,31 @@
18
  "in_channels": 3,
19
  "latent_channels": 4,
20
  "layers_per_block": 2,
 
 
 
 
 
 
21
  "norm_num_groups": 32,
 
22
  "out_channels": 3,
23
  "sample_size": 256,
24
  "scaling_factor": 0.18215,
25
- "slice_mag_vae": false,
26
  "slice_compression_vae": false,
27
- "cache_compression_vae": true,
28
- "use_tiling": false,
29
- "use_tiling_encoder": false,
30
- "use_tiling_decoder": true,
31
- "mid_block_attention_type": "3d",
32
  "tile_sample_min_size": 384,
33
- "mini_batch_encoder": 8,
34
- "mini_batch_decoder": 2,
35
  "up_block_types": [
36
- "SpatialUpBlock3D",
37
- "SpatialTemporalUpBlock3D",
38
  "SpatialTemporalUpBlock3D",
39
  "SpatialTemporalUpBlock3D"
40
- ]
 
 
 
 
 
41
  }
 
1
  {
2
+ "_class_name": "AutoencoderKLMagvit",
3
+ "_diffusers_version": "0.30.1",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
 
8
  512,
9
  512
10
  ],
11
+ "cache_compression_vae": true,
12
+ "cache_mag_vae": false,
13
+ "ch": 128,
14
+ "ch_mult": [
15
+ 1,
16
+ 2,
17
+ 4,
18
+ 4
19
+ ],
20
  "down_block_types": [
21
+ "SpatialDownBlock3D",
22
+ "SpatialTemporalDownBlock3D",
23
  "SpatialTemporalDownBlock3D",
24
  "SpatialTemporalDownBlock3D"
25
  ],
 
27
  "in_channels": 3,
28
  "latent_channels": 4,
29
  "layers_per_block": 2,
30
+ "mid_block_attention_type": "3d",
31
+ "mid_block_num_attention_heads": 1,
32
+ "mid_block_type": "MidBlock3D",
33
+ "mid_block_use_attention": true,
34
+ "mini_batch_decoder": 2,
35
+ "mini_batch_encoder": 8,
36
  "norm_num_groups": 32,
37
+ "num_attention_heads": 1,
38
  "out_channels": 3,
39
  "sample_size": 256,
40
  "scaling_factor": 0.18215,
 
41
  "slice_compression_vae": false,
42
+ "slice_mag_vae": false,
43
+ "spatial_group_norm": false,
44
+ "tile_overlap_factor": 0.25,
 
 
45
  "tile_sample_min_size": 384,
 
 
46
  "up_block_types": [
47
+ "SpatialUpBlock3D",
48
+ "SpatialTemporalUpBlock3D",
49
  "SpatialTemporalUpBlock3D",
50
  "SpatialTemporalUpBlock3D"
51
+ ],
52
+ "upcast_vae": false,
53
+ "use_gc_blocks": null,
54
+ "use_tiling": false,
55
+ "use_tiling_decoder": true,
56
+ "use_tiling_encoder": false
57
  }