# Training configuration: model, data, run/logging and optimisation settings.
#
# NOTE(review): this file was previously collapsed onto a single line, which
# is not parseable as block YAML. The nesting below was reconstructed from
# key order and from the `${model.network.*}` interpolation paths; all keys
# and values are unchanged. Confirm section boundaries against the consumer.

model:
  network:
    dim: 512
    num_timesteps: 1000
    depth: 12
    dim_head: 64
    heads: 12
  # NOTE(review): `diffusion` is assumed to be a sibling of `network` under
  # `model` — confirm against the code that reads this config.
  diffusion:
    # Interpolations that reuse the values defined under model.network.
    image_embed_dim: ${model.network.dim}
    timesteps: ${model.network.num_timesteps}
    cond_drop_prob: 0.2
    image_embed_scale: 1.0
    text_embed_scale: 1.0
    beta_schedule: cosine
    predict_x_start: true

data:
  bs: 512
  format: webdataset
  path: data/webdataset/sg2-ffhq-1024-clip/{00000..99}.tar
  embed_noise_scale: 1.0
  sg_pkl: https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan2/versions/1/files/stylegan2-ffhq-1024x1024.pkl
  clip_variant: ViT-B/32
  n_latents: 1
  latent_dim: 512
  latent_repeats:
    - 18
  # NOTE(review): the val_* keys are assumed to belong to `data` (they appear
  # before the run-level keys in the original ordering) — TODO confirm.
  val_im_samples: 64
  val_text_samples: data/text/face-val.txt
  val_samples_per_text: 4

# Run-level settings. NOTE(review): assumed to be top-level keys — confirm.
wandb_project: clip2latent
wandb_entity: null
name: baseline_noise_1
device: cuda:0

train:
  znorm_embed: false
  znorm_latent: true
  max_it: 1000000
  val_it: 10000
  lr: 0.0001
  weight_decay: 0.01
  ema_update_every: 1
  ema_beta: 0.99999