pcuenq HF staff committed on
Commit
3fce0e3
1 Parent(s): d363136

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +45 -0
  3. config.yaml +132 -0
  4. samples.png +3 -0
  5. t5.vocab +0 -0
  6. vis_model.pth +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apple-ascl
3
+ tags:
4
+ - mdm
5
+ ---
6
+
7
+ # Matryoshka Diffusion Models
8
+
9
+ Matryoshka Diffusion Models was introduced in [the paper of the same name](https://huggingface.co/papers/2310.15111) by Jiatao Gu, Shuangfei Zhai, Yizhe Zhang, Josh Susskind, and Navdeep Jaitly.
10
+
11
+ This repository contains the **Flickr 1024** checkpoint.
12
+
13
+ ![Generation Examples from the MDM repository](samples.png)
14
+
15
+ ### Highlights
16
+
17
+ * This checkpoint was trained on a dataset of 50M text-image pairs collected from Flickr.
18
+ * This model was trained using nested UNets at various resolutions, and generates images with a resolution of 1024 × 1024.
19
+ * Despite training on relatively small datasets, MDMs show strong zero-shot capabilities of generating high-resolution images and videos.
20
+
21
+ ## Checkpoints
22
+
23
+ | Model | Dataset | Resolution | Nested UNets |
24
+ |---------------------------------------------------------|------------|-------------|--------------|
25
+ | [mdm-flickr-64](https://hf.co/pcuenq/mdm-flickr-64) | Flickr 50M | 64 × 64 | ❎ |
26
+ | [mdm-flickr-256](https://hf.co/pcuenq/mdm-flickr-256) | Flickr 50M | 256 × 256 | ✅ |
27
+ | [mdm-flickr-1024](https://hf.co/pcuenq/mdm-flickr-1024) | Flickr 50M | 1024 × 1024 | ✅ |
28
+
29
+ ## How to Use
30
+
31
+ Please, refer to the [original repository](https://github.com/apple/ml-mdm) for training and inference instructions.
32
+
33
+ ## Citation
34
+
35
+ ```
36
+ @misc{gu2023matryoshkadiffusionmodels,
37
+ title={Matryoshka Diffusion Models},
38
+ author={Jiatao Gu and Shuangfei Zhai and Yizhe Zhang and Josh Susskind and Navdeep Jaitly},
39
+ year={2023},
40
+ eprint={2310.15111},
41
+ archivePrefix={arXiv},
42
+ primaryClass={cs.CV},
43
+ url={https://arxiv.org/abs/2310.15111},
44
+ }
45
+ ```
config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Text2Image_Diffusion_R1024R256R64RND_T5XL_Detailed_PTV2W
2
+ dataset_config: configs/datasets/cc12m.yaml
3
+ # sampler_arguments:
4
+ min_examples: 10000
5
+ sample_dir: /mnt/data/samples
6
+ # batch-size: 8
7
+ sample_image_size: 1024
8
+ test_file_list: validation.tsv
9
+ # reader-config-file: configs/datasets/reader_config_eval.yaml
10
+ # shared_arguments:
11
+ output_dir: /mnt/data/outputs
12
+ num_diffusion_steps: 1000
13
+ reproject_signal: false
14
+ model_output_scale: 0
15
+ prediction_type: V_PREDICTION
16
+ loss_target_type: DDPM
17
+ schedule_type: DEEPFLOYD
18
+ prediction_length: 129
19
+ use_vdm_loss_weights: false
20
+ use_double_loss: true
21
+ no_use_residual: true
22
+ num_training_steps: 1000000
23
+ avg_lm_steps: 0
24
+ categorical_conditioning: 0
25
+ rescale_signal: 1
26
+ schedule_shifted: true
27
+ schedule_shifted_power: 2
28
+ skip_normalization: true
29
+ random_low_noise: true
30
+ vocab_file: t5.vocab
31
+ text_model: google/flan-t5-xl
32
+ model: nested2_unet
33
+ vision_model: nested2_unet
34
+
35
+ unet_config:
36
+ attention_levels: []
37
+ conditioning_feature_dim: -1
38
+ conditioning_feature_proj_dim: 2048
39
+ freeze_inner_unet: false
40
+ initialize_inner_with_pretrained: 8rwvbg85tt
41
+ inner_config:
42
+ attention_levels: []
43
+ conditioning_feature_dim: -1
44
+ conditioning_feature_proj_dim: 2048
45
+ freeze_inner_unet: false
46
+ initialize_inner_with_pretrained: null
47
+ inner_config:
48
+ attention_levels: [1, 2]
49
+ conditioning_feature_dim: -1
50
+ conditioning_feature_proj_dim: 2048
51
+ masked_cross_attention: 0
52
+ micro_conditioning: scale:64
53
+ nesting: true
54
+ num_attention_layers: [0, 1, 5]
55
+ num_lm_head_layers: 0
56
+ num_resnets_per_resolution: [2, 2, 2]
57
+ num_temporal_attention_layers: null
58
+ resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
59
+ use_attention_ffn: true}
60
+ resolution_channels: [256, 512, 768]
61
+ skip_cond_emb: false
62
+ skip_mid_blocks: false
63
+ temporal_dim: null
64
+ temporal_mode: false
65
+ temporal_positional_encoding: false
66
+ temporal_spatial_ds: false
67
+ interp_conditioning: false
68
+ masked_cross_attention: 1
69
+ micro_conditioning: scale:256
70
+ nesting: true
71
+ num_attention_layers: [0, 0, 0]
72
+ num_lm_head_layers: 0
73
+ num_resnets_per_resolution: [2, 2, 1]
74
+ num_temporal_attention_layers: null
75
+ resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
76
+ use_attention_ffn: false}
77
+ resolution_channels: [64, 128, 256]
78
+ skip_cond_emb: true
79
+ skip_inner_unet_input: false
80
+ skip_mid_blocks: true
81
+ skip_normalization: false
82
+ temporal_dim: 1024
83
+ temporal_mode: false
84
+ temporal_positional_encoding: false
85
+ temporal_spatial_ds: false
86
+ interp_conditioning: false
87
+ masked_cross_attention: 1
88
+ micro_conditioning: scale:1024
89
+ nesting: false
90
+ num_attention_layers: [0, 0, 0]
91
+ num_lm_head_layers: 0
92
+ num_resnets_per_resolution: [2, 2, 1]
93
+ num_temporal_attention_layers: null
94
+ resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
95
+ use_attention_ffn: false}
96
+ resolution_channels: [32, 32, 64]
97
+ skip_cond_emb: true
98
+ skip_inner_unet_input: false
99
+ skip_mid_blocks: true
100
+ skip_normalization: true
101
+ temporal_dim: 1024
102
+ temporal_mode: false
103
+ temporal_positional_encoding: false
104
+ temporal_spatial_ds: false
105
+
106
+ # import defaults
107
+ # reader-config-file: configs/datasets/reader_config.yaml
108
+ # add overrides
109
+ reader_config:
110
+ image_size: 1024
111
+ smaller_side_size: 1024
112
+ random_crop: false
113
+ max_caption_length: -1
114
+ max_caption_length: 512 # note
115
+ max_token_length: 128
116
+ reader_buffer_size: 64
117
+ shuffle_buffer_size: 9600
118
+ use_lm_mask: 1
119
+ # torchmetrics_arguments:
120
+ metrics: fid,clip
121
+ # trainer_arguments:
122
+ use_precomputed_text_embeddings: 0
123
+ batch_size: 4
124
+ multi_res_weights: '16:4:1'
125
+ gradient_clip_norm: 2
126
+ loss_factor: 1
127
+ num_gradient_accumulations: 1
128
+ warmup_steps: 10000
129
+ log_freq: 50
130
+ save_freq: 5000
131
+ lr: 5.0e-05
132
+ fp16: 1
samples.png ADDED

Git LFS Details

  • SHA256: 2ea4347ba4d9c3d592f0a3c8e3c37e7612d8d2102ccf50954e3d5f0fb6725c77
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
t5.vocab ADDED
The diff for this file is too large to render. See raw diff
 
vis_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d55f4d68ccac35f8afb239d437ac71610af0da7c2453e66826402b840a493b
3
+ size 1924301753