# Training configuration.
#
# NOTE(review): this file was recovered from a copy in which every line had
# lost its indentation and carried a stray trailing "| |". The nesting below
# is reconstructed: `model.network.dim` and `model.network.num_timesteps` are
# fixed by the ${...} references that point at them; the placement of the
# remaining keys (`diffusion` under `model`, the extent of `data`, and which
# keys are top-level) is inferred from key order — confirm against the code
# that reads this config.
model:
  network:
    dim: 512
    num_timesteps: 1000
    depth: 12
    dim_head: 64
    heads: 12
  diffusion:
    # The two ${...} values below reference model.network keys so the two
    # sections stay in sync (presumably OmegaConf/Hydra-style interpolation —
    # verify with the loader in use).
    image_embed_dim: ${model.network.dim}
    timesteps: ${model.network.num_timesteps}
    cond_drop_prob: 0.2
    image_embed_scale: 1.0
    text_embed_scale: 1.0
    beta_schedule: cosine
    predict_x_start: true
data:
  bs: 512
  format: webdataset
  # The {00000..99} range is left verbatim; it is presumably expanded by the
  # dataset loader (brace-expansion shard pattern) — TODO confirm.
  path: data/webdataset/sg2-ffhq-1024-clip/{00000..99}.tar
  embed_noise_scale: 1.0
  sg_pkl: https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan2/versions/1/files/stylegan2-ffhq-1024x1024.pkl
  clip_variant: ViT-B/32
  n_latents: 1
  latent_dim: 512
  # Single-element list.
  latent_repeats:
  - 18
  val_im_samples: 64
  val_text_samples: data/text/face-val.txt
  val_samples_per_text: 4
wandb_project: clip2latent
wandb_entity: null
name: baseline_noise_1
device: cuda:0
train:
  znorm_embed: false
  znorm_latent: true
  max_it: 1000000
  val_it: 10000
  lr: 0.0001
  weight_decay: 0.01
  ema_update_every: 1
  ema_beta: 0.99999