# Run configuration: model, data, logging/device and training settings.
#
# NOTE(review): this file was recovered from a flattened export in which every
# line carried a trailing "|" and all indentation had been stripped, which made
# it unparseable as YAML. Keys, values and their order are unchanged. Only the
# nesting was restored:
#   * model -> network -> {dim, num_timesteps} is required by the
#     ${model.network.*} references below;
#   * all other nesting is inferred from key grouping -- confirm against the
#     code that consumes this config.

model:
  network:
    dim: 512
    num_timesteps: 1000
    depth: 12
    dim_head: 64
    heads: 12
  # NOTE(review): `diffusion` is assumed to be a sibling of `network` under
  # `model` -- confirm.
  diffusion:
    # Interpolation-style references to the network settings above, so the two
    # sections cannot drift apart (currently 512 and 1000 respectively).
    image_embed_dim: ${model.network.dim}
    timesteps: ${model.network.num_timesteps}
    cond_drop_prob: 0.2
    image_embed_scale: 1.0
    text_embed_scale: 1.0
    beta_schedule: cosine
    predict_x_start: true

data:
  bs: 512
  format: webdataset
  # Brace pattern naming a range of .tar shards; expansion is presumably done
  # by the dataset loader -- verify the resulting shard names exist on disk.
  path: data/webdataset/sg3-lhq-256-clip/{00000..99}.tar
  embed_noise_scale: 1.0
  sg_pkl: https://huggingface.co/justinpinkney/stylegan3-t-lhq-256/resolve/main/lhq-256-stylegan3-t-25Mimg.pkl
  clip_variant: ViT-B/32
  n_latents: 1
  latent_dim: 512
  latent_repeats:
  - 16
  # NOTE(review): the val_* keys are assumed to belong to `data` (they follow
  # the `latent_repeats` list directly) -- confirm.
  val_im_samples: 64
  val_text_samples: data/text/landscape-val.txt
  val_samples_per_text: 4

# NOTE(review): the logging / run-identity keys below are assumed to be
# top-level rather than part of `data` -- confirm.
wandb_project: clip2latent
wandb_entity: null
name: lhq_noise_1
device: cuda:0

train:
  znorm_embed: false
  znorm_latent: true
  max_it: 1000000
  val_it: 10000
  lr: 0.0001
  weight_decay: 0.01
  ema_update_every: 1
  ema_beta: 0.99999
|