bubbliiiing committed on
Commit
0bdb0bf
·
1 Parent(s): e32f3df

Update config

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. transformer/config.json +7 -2
  3. vae/config.json +31 -15
README.md CHANGED
@@ -61,9 +61,9 @@ EasyAnimateV4:
61
 
62
  We attempted to implement EasyAnimate using 3D full attention, but this structure performed moderately on slice VAE and incurred considerable training costs. As a result, the performance of version V4 did not significantly surpass that of version V3. Due to limited resources, we are migrating EasyAnimate to a retrained 16-channel MagVit to pursue better model performance.
63
 
64
- | Name | Type | Storage Space | Url | Hugging Face | Description |
65
  |--|--|--|--|--|--|
66
- | EasyAnimateV4-XL-2-InP.tar.gz | EasyAnimateV4 | Before extraction: 8.9 GB \/ After extraction: 14.0 GB | [Download](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Diffusion_Transformer/EasyAnimateV4-XL-2-InP.tar.gz) | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV4-XL-2-InP)| Our official graph-generated video model is capable of predicting videos at multiple resolutions (512, 768, 1024, 1280) and has been trained on 144 frames at a rate of 24 frames per second. |
67
 
68
 
69
  # Algorithm Detailed
 
61
 
62
  We attempted to implement EasyAnimate using 3D full attention, but this structure performed moderately on slice VAE and incurred considerable training costs. As a result, the performance of version V4 did not significantly surpass that of version V3. Due to limited resources, we are migrating EasyAnimate to a retrained 16-channel MagVit to pursue better model performance.
63
 
64
+ | Name | Type | Storage Space | Hugging Face | Model Scope | Description |
65
  |--|--|--|--|--|--|
66
+ | EasyAnimateV4-XL-2-InP.tar.gz | EasyAnimateV4 | Before extraction: 8.9 GB / After extraction: 14.0 GB | [🤗Link](https://huggingface.co/alibaba-pai/EasyAnimateV4-XL-2-InP) | [😄Link](https://modelscope.cn/models/PAI/EasyAnimateV4-XL-2-InP) | Our official graph-generated video model is capable of predicting videos at multiple resolutions (512, 768, 1024, 1280) and has been trained on 144 frames at a rate of 24 frames per second. |
67
 
68
 
69
  # Algorithm Detailed
transformer/config.json CHANGED
@@ -1,12 +1,15 @@
1
  {
2
  "_class_name": "HunyuanTransformer3DModel",
3
- "_diffusers_version": "0.28.2",
4
  "activation_fn": "gelu-approximate",
 
5
  "after_norm": false,
6
  "attention_head_dim": 88,
7
  "basic_block_type": "basic",
8
  "cross_attention_dim": 1024,
9
  "cross_attention_dim_t5": 2048,
 
 
10
  "hidden_size": 1408,
11
  "in_channels": 12,
12
  "learn_sigma": true,
@@ -23,8 +26,10 @@
23
  "patch_size": 2,
24
  "pooled_projection_dim": 1024,
25
  "projection_dim": 1024,
 
26
  "sample_size": 128,
27
  "text_len": 77,
28
  "text_len_t5": 256,
29
- "time_position_encoding": true
 
30
  }
 
1
  {
2
  "_class_name": "HunyuanTransformer3DModel",
3
+ "_diffusers_version": "0.30.1",
4
  "activation_fn": "gelu-approximate",
5
+ "add_noise_in_inpaint_model": false,
6
  "after_norm": false,
7
  "attention_head_dim": 88,
8
  "basic_block_type": "basic",
9
  "cross_attention_dim": 1024,
10
  "cross_attention_dim_t5": 2048,
11
+ "enable_clip_in_inpaint": true,
12
+ "enable_text_attention_mask": true,
13
  "hidden_size": 1408,
14
  "in_channels": 12,
15
  "learn_sigma": true,
 
26
  "patch_size": 2,
27
  "pooled_projection_dim": 1024,
28
  "projection_dim": 1024,
29
+ "resize_inpaint_mask_directly": false,
30
  "sample_size": 128,
31
  "text_len": 77,
32
  "text_len_t5": 256,
33
+ "time_position_encoding": true,
34
+ "time_position_encoding_type": "2d_rope"
35
  }
vae/config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "_class_name": "AutoencoderKL",
3
- "_diffusers_version": "0.22.0.dev0",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
@@ -8,9 +8,18 @@
8
  512,
9
  512
10
  ],
 
 
 
 
 
 
 
 
 
11
  "down_block_types": [
12
- "SpatialDownBlock3D",
13
- "SpatialTemporalDownBlock3D",
14
  "SpatialTemporalDownBlock3D",
15
  "SpatialTemporalDownBlock3D"
16
  ],
@@ -18,24 +27,31 @@
18
  "in_channels": 3,
19
  "latent_channels": 4,
20
  "layers_per_block": 2,
 
 
 
 
 
 
21
  "norm_num_groups": 32,
 
22
  "out_channels": 3,
23
  "sample_size": 256,
24
  "scaling_factor": 0.18215,
25
- "slice_mag_vae": false,
26
  "slice_compression_vae": false,
27
- "cache_compression_vae": true,
28
- "use_tiling": false,
29
- "use_tiling_encoder": false,
30
- "use_tiling_decoder": true,
31
- "mid_block_attention_type": "3d",
32
  "tile_sample_min_size": 384,
33
- "mini_batch_encoder": 8,
34
- "mini_batch_decoder": 2,
35
  "up_block_types": [
36
- "SpatialUpBlock3D",
37
- "SpatialTemporalUpBlock3D",
38
  "SpatialTemporalUpBlock3D",
39
  "SpatialTemporalUpBlock3D"
40
- ]
 
 
 
 
 
41
  }
 
1
  {
2
+ "_class_name": "AutoencoderKLMagvit",
3
+ "_diffusers_version": "0.30.1",
4
  "act_fn": "silu",
5
  "block_out_channels": [
6
  128,
 
8
  512,
9
  512
10
  ],
11
+ "cache_compression_vae": true,
12
+ "cache_mag_vae": false,
13
+ "ch": 128,
14
+ "ch_mult": [
15
+ 1,
16
+ 2,
17
+ 4,
18
+ 4
19
+ ],
20
  "down_block_types": [
21
+ "SpatialDownBlock3D",
22
+ "SpatialTemporalDownBlock3D",
23
  "SpatialTemporalDownBlock3D",
24
  "SpatialTemporalDownBlock3D"
25
  ],
 
27
  "in_channels": 3,
28
  "latent_channels": 4,
29
  "layers_per_block": 2,
30
+ "mid_block_attention_type": "3d",
31
+ "mid_block_num_attention_heads": 1,
32
+ "mid_block_type": "MidBlock3D",
33
+ "mid_block_use_attention": true,
34
+ "mini_batch_decoder": 2,
35
+ "mini_batch_encoder": 8,
36
  "norm_num_groups": 32,
37
+ "num_attention_heads": 1,
38
  "out_channels": 3,
39
  "sample_size": 256,
40
  "scaling_factor": 0.18215,
 
41
  "slice_compression_vae": false,
42
+ "slice_mag_vae": false,
43
+ "spatial_group_norm": false,
44
+ "tile_overlap_factor": 0.25,
 
 
45
  "tile_sample_min_size": 384,
 
 
46
  "up_block_types": [
47
+ "SpatialUpBlock3D",
48
+ "SpatialTemporalUpBlock3D",
49
  "SpatialTemporalUpBlock3D",
50
  "SpatialTemporalUpBlock3D"
51
+ ],
52
+ "upcast_vae": false,
53
+ "use_gc_blocks": null,
54
+ "use_tiling": false,
55
+ "use_tiling_decoder": true,
56
+ "use_tiling_encoder": false
57
  }