---
# Training configuration using `${...}` interpolation between keys.
# NOTE(review): the nesting below was reconstructed from a single-line
# original. `model.network.*` is fixed by the interpolation paths; the
# placement of the remaining keys is inferred from key order and should be
# confirmed against the code that consumes this file.

model:
  network:
    dim: 512
    num_timesteps: 1000
    depth: 12
    dim_head: 64
    heads: 12
  diffusion:
    # Kept in sync with the network settings via interpolation.
    image_embed_dim: ${model.network.dim}
    timesteps: ${model.network.num_timesteps}
    cond_drop_prob: 0.2
    image_embed_scale: 1.0
    text_embed_scale: 1.0
    beta_schedule: cosine
    predict_x_start: true

data:
  bs: 512
  format: webdataset
  # `{00000..99}` looks like a shard-range pattern, presumably expanded by
  # the data loader rather than by YAML; verify against the consumer.
  path: data/webdataset/sg3-lhq-256-clip/{00000..99}.tar
  embed_noise_scale: 1.0
  sg_pkl: https://huggingface.co/justinpinkney/stylegan3-t-lhq-256/resolve/main/lhq-256-stylegan3-t-25Mimg.pkl
  clip_variant: ViT-B/32
  n_latents: 1
  latent_dim: 512
  latent_repeats:
    - 16
  # NOTE(review): the val_* keys are assumed to belong to `data`; confirm.
  val_im_samples: 64
  val_text_samples: data/text/landscape-val.txt
  val_samples_per_text: 4

wandb_project: clip2latent
wandb_entity: null
name: lhq_noise_1
# Quoted so the embedded colon can never be misread by a parser.
device: 'cuda:0'

train:
  znorm_embed: false
  znorm_latent: true
  max_it: 1000000
  val_it: 10000
  lr: 0.0001
  weight_decay: 0.01
  ema_update_every: 1
  ema_beta: 0.99999