benjamin-paine committed on
Commit
57f6236
1 Parent(s): 6a6032e

Upload config.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.json +167 -0
config.json ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "reference_unet": {
3
+ "_class_name": "UNet2DConditionModel",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "cross_attention_dim": 768,
14
+ "down_block_types": [
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "CrossAttnDownBlock2D",
18
+ "DownBlock2D"
19
+ ],
20
+ "downsample_padding": 1,
21
+ "flip_sin_to_cos": true,
22
+ "freq_shift": 0,
23
+ "in_channels": 4,
24
+ "layers_per_block": 2,
25
+ "mid_block_scale_factor": 1,
26
+ "norm_eps": 1e-05,
27
+ "norm_num_groups": 32,
28
+ "out_channels": 4,
29
+ "sample_size": 64,
30
+ "up_block_types": [
31
+ "UpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D",
34
+ "CrossAttnUpBlock2D"
35
+ ]
36
+ },
37
+ "denoising_unet": {
38
+ "_class_name": "UNet2DConditionModel",
39
+ "act_fn": "silu",
40
+ "attention_head_dim": 8,
41
+ "block_out_channels": [
42
+ 320,
43
+ 640,
44
+ 1280,
45
+ 1280
46
+ ],
47
+ "center_input_sample": false,
48
+ "cross_attention_dim": 768,
49
+ "down_block_types": [
50
+ "CrossAttnDownBlock3D",
51
+ "CrossAttnDownBlock3D",
52
+ "CrossAttnDownBlock3D",
53
+ "DownBlock3D"
54
+ ],
55
+ "downsample_padding": 1,
56
+ "flip_sin_to_cos": true,
57
+ "freq_shift": 0,
58
+ "in_channels": 4,
59
+ "layers_per_block": 2,
60
+ "mid_block_scale_factor": 1,
61
+ "norm_eps": 1e-05,
62
+ "norm_num_groups": 32,
63
+ "out_channels": 4,
64
+ "sample_size": 64,
65
+ "up_block_types": [
66
+ "UpBlock3D",
67
+ "CrossAttnUpBlock3D",
68
+ "CrossAttnUpBlock3D",
69
+ "CrossAttnUpBlock3D"
70
+ ],
71
+ "mid_block_type": "UNetMidBlock3DCrossAttn",
72
+ "use_inflated_groupnorm": true,
73
+ "unet_use_cross_frame_attention": false,
74
+ "unet_use_temporal_attention": false,
75
+ "use_motion_module": true,
76
+ "motion_module_resolutions": [
77
+ 1,
78
+ 2,
79
+ 4,
80
+ 8
81
+ ],
82
+ "motion_module_mid_block": true,
83
+ "motion_module_decoder_only": false,
84
+ "motion_module_type": "Vanilla",
85
+ "motion_module_kwargs": {
86
+ "num_attention_heads": 8,
87
+ "num_transformer_block": 1,
88
+ "attention_block_types": [
89
+ "Temporal_Self",
90
+ "Temporal_Self"
91
+ ],
92
+ "temporal_position_encoding": true,
93
+ "temporal_position_encoding_max_len": 32,
94
+ "temporal_attention_dim_div": 1
95
+ }
96
+ },
97
+ "vae": {
98
+ "_class_name": "AutoencoderKL",
99
+ "act_fn": "silu",
100
+ "block_out_channels": [
101
+ 128,
102
+ 256,
103
+ 512,
104
+ 512
105
+ ],
106
+ "down_block_types": [
107
+ "DownEncoderBlock2D",
108
+ "DownEncoderBlock2D",
109
+ "DownEncoderBlock2D",
110
+ "DownEncoderBlock2D"
111
+ ],
112
+ "in_channels": 3,
113
+ "latent_channels": 4,
114
+ "layers_per_block": 2,
115
+ "norm_num_groups": 32,
116
+ "out_channels": 3,
117
+ "sample_size": 256,
118
+ "up_block_types": [
119
+ "UpDecoderBlock2D",
120
+ "UpDecoderBlock2D",
121
+ "UpDecoderBlock2D",
122
+ "UpDecoderBlock2D"
123
+ ]
124
+ },
125
+ "image_encoder": {
126
+ "architectures": [
127
+ "CLIPVisionModelWithProjection"
128
+ ],
129
+ "attention_dropout": 0,
130
+ "dropout": 0,
131
+ "hidden_act": "quick_gelu",
132
+ "hidden_size": 1024,
133
+ "image_size": 224,
134
+ "initializer_factor": 1,
135
+ "initializer_range": 0.02,
136
+ "intermediate_size": 4096,
137
+ "layer_norm_eps": 0.00001,
138
+ "model_type": "clip_vision_model",
139
+ "num_attention_heads": 16,
140
+ "num_channels": 3,
141
+ "num_hidden_layers": 24,
142
+ "patch_size": 14,
143
+ "projection_dim": 768,
144
+ "torch_dtype": "float32"
145
+ },
146
+ "guidance_encoder": {
147
+ "guidance_embedding_channels": 320,
148
+ "guidance_input_channels": 3,
149
+ "block_out_channels": [
150
+ 16,
151
+ 32,
152
+ 96,
153
+ 256
154
+ ]
155
+ },
156
+ "scheduler": {
157
+ "num_train_timesteps": 1000,
158
+ "beta_start": 0.00085,
159
+ "beta_end": 0.012,
160
+ "beta_schedule": "linear",
161
+ "steps_offset": 1,
162
+ "clip_sample": false,
163
+ "rescale_betas_zero_snr": true,
164
+ "timestep_spacing": "trailing",
165
+ "prediction_type": "v_prediction"
166
+ }
167
+ }