pcuenq HF staff commited on
Commit
33631d3
1 Parent(s): 0cd50d4

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +45 -0
  3. config.yaml +116 -0
  4. samples.png +3 -0
  5. t5.vocab +0 -0
  6. vis_model.pth +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ samples.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apple-ascl
3
+ tags:
4
+ - mdm
5
+ ---
6
+
7
+ # Matryoshka Diffusion Models
8
+
9
+ Matryoshka Diffusion Models were introduced in [the paper of the same name](https://huggingface.co/papers/2310.15111) by Jiatao Gu, Shuangfei Zhai, Yizhe Zhang, Josh Susskind, and Navdeep Jaitly.
10
+
11
+ This repository contains the **Flickr 256** checkpoint.
12
+
13
+ ![Generation Examples from the MDM repository](samples.png)
14
+
15
+ ### Highlights
16
+
17
+ * This checkpoint was trained on a dataset of 50M text-image pairs collected from Flickr.
18
+ * This model was trained using nested UNets at various resolutions, and generates images with a resolution of 256 × 256.
19
+ * Despite training on relatively small datasets, MDMs show strong zero-shot capabilities of generating high-resolution images and videos.
20
+
21
+ ## Checkpoints
22
+
23
+ | Model | Dataset | Resolution | Nested UNets |
24
+ |---------------------------------------------------------|------------|-------------|--------------|
25
+ | [mdm-flickr-64](https://hf.co/pcuenq/mdm-flickr-64) | Flickr 50M | 64 × 64 | ❎ |
26
+ | [mdm-flickr-256](https://hf.co/pcuenq/mdm-flickr-256) | Flickr 50M | 256 × 256 | ✅ |
27
+ | [mdm-flickr-1024](https://hf.co/pcuenq/mdm-flickr-1024) | Flickr 50M | 1024 × 1024 | ✅ |
28
+
29
+ ## How to Use
30
+
31
+ Please refer to the [original repository](https://github.com/apple/ml-mdm) for training and inference instructions.
32
+
33
+ ## Citation
34
+
35
+ ```
36
+ @misc{gu2023matryoshkadiffusionmodels,
37
+ title={Matryoshka Diffusion Models},
38
+ author={Jiatao Gu and Shuangfei Zhai and Yizhe Zhang and Josh Susskind and Navdeep Jaitly},
39
+ year={2023},
40
+ eprint={2310.15111},
41
+ archivePrefix={arXiv},
42
+ primaryClass={cs.CV},
43
+ url={https://arxiv.org/abs/2310.15111},
44
+ }
45
+ ```
config.yaml ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: cc12m_256x256
2
+ dataset_config: configs/datasets/cc12m.yaml
3
+ # sampler_arguments
4
+ min_examples: 10000
5
+ sample-dir: /mnt/data/samples
6
+ # batch-size: 32
7
+ sample_image_size: 256
8
+ test_file_list: validation.tsv
9
+ #reader-config-file: configs/datasets/reader_config_eval.yaml
10
+ # shared_arguments
11
+ output_dir: /mnt/data/outputs
12
+ num_diffusion_steps: 1000
13
+ reproject_signal: false
14
+ model_output_scale: 0
15
+ prediction_type: V_PREDICTION
16
+ loss_target_type: DDPM
17
+ schedule_type: DEEPFLOYD
18
+ prediction_length: 129
19
+ use_vdm_loss_weights: false
20
+ use_double_loss: true
21
+ no_use_residual: true
22
+ num_training_steps: 1000000
23
+ avg_lm_steps: 0
24
+ categorical_conditioning: 0
25
+ rescale_signal: 1
26
+ schedule_shifted: true
27
+ skip_normalization: true
28
+ random_low_noise: true
29
+ vocab_file: t5.vocab
30
+ text_model: google/flan-t5-xl
31
+ model: nested_unet
32
+ vision_model: nested_unet
33
+ #model_config-file: configs/models/model_config_nested256.yaml
34
+ unet_config:
35
+ attention_levels: []
36
+ conditioning_feature_dim: -1
37
+ conditioning_feature_proj_dim: -1
38
+ freeze_inner_unet: false
39
+ initialize_inner_with_pretrained: None
40
+ inner_config:
41
+ attention_levels: [1, 2]
42
+ conditioning_feature_dim: -1
43
+ conditioning_feature_proj_dim: 2048
44
+ masked_cross_attention: 0
45
+ micro_conditioning: scale:64
46
+ nesting: true
47
+ num_attention_layers: [0, 1, 5]
48
+ num_lm_head_layers: 0
49
+ num_resnets_per_resolution: [2, 2, 2]
50
+ num_temporal_attention_layers: null
51
+ resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
52
+ use_attention_ffn: true}
53
+ resolution_channels: [256, 512, 768]
54
+ skip_cond_emb: false
55
+ skip_mid_blocks: false
56
+ temporal_dim: null
57
+ temporal_mode: false
58
+ temporal_positional_encoding: false
59
+ temporal_spatial_ds: false
60
+ interp_conditioning: false
61
+ masked_cross_attention: 1
62
+ micro_conditioning: scale:256
63
+ nesting: false
64
+ num_attention_layers: [0, 0, 0]
65
+ num_lm_head_layers: 0
66
+ num_resnets_per_resolution: [2, 2, 1]
67
+ num_temporal_attention_layers: null
68
+ resnet_config: {dropout: 0.0, num_channels: -1, num_groups_norm: 32, output_channels: -1,
69
+ use_attention_ffn: false}
70
+ resolution_channels: [64, 128, 256]
71
+ skip_cond_emb: true
72
+ skip_inner_unet_input: false
73
+ skip_mid_blocks: true
74
+ skip_normalization: true
75
+ temporal_dim: 1024
76
+ temporal_mode: false
77
+ temporal_positional_encoding: false
78
+ temporal_spatial_ds: false
79
+
80
+ reader_config:
81
+ image_size: 256
82
+ smaller_side_size: 256
83
+ random_crop: false
84
+ max_caption_length: -1
85
+ max_token_length: 128
86
+ reader_buffer_size: 2000
87
+ shuffle_buffer_size: 2000
88
+
89
+ append_eos: true
90
+ num_readers: 2
91
+ pad_to_max_length: false
92
+ padding_token: <pad>
93
+ prepad_bos: false
94
+ prepad_caption_with_space: true
95
+ random_crop: false
96
+ #reader_buffer_size: 64
97
+ #shuffle_buffer_size: 9600
98
+ use_tokenizer_scores: true
99
+
100
+ use_lm_mask: 1
101
+ # torchmetrics_arguments:
102
+ metrics: fid,clip
103
+ # trainer_arguments:
104
+ use_precomputed_text_embeddings: 0
105
+ pretrained_vision_file: vis_model_256x256.pth
106
+ #batch_size: 24
107
+ mixed_ratio: '2:1'
108
+ gradient_clip_norm: 2
109
+ loss_factor: 1
110
+ num_gradient_accumulations: 1
111
+ warmup_steps: 10000
112
+ # reader-config-file: configs/datasets/reader_config.yaml
113
+ log_freq: 50
114
+ save_freq: 5000
115
+ lr: 5.0e-05
116
+ fp16: 0
samples.png ADDED

Git LFS Details

  • SHA256: 2ea4347ba4d9c3d592f0a3c8e3c37e7612d8d2102ccf50954e3d5f0fb6725c77
  • Pointer size: 132 Bytes
  • Size of remote file: 1.23 MB
t5.vocab ADDED
The diff for this file is too large to render. See raw diff
 
vis_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e227885c80e9fc634abf064b3aca4bb56625ec54c0b02e34f43423437716c526
3
+ size 1906796858