File size: 3,134 Bytes
d643072
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Dataset / data-loading options.
data:
  # Empty in this file; presumably the dataset location(s) are supplied
  # per run (override or edited copy) — TODO confirm against the loader.
  data_dir: []
  # Same value as model.image_size (1024) in this file.
  image_size: 1024
  # One caption source, "prompt", with value 1. NOTE(review): looks like a
  # relative weight per caption type — verify against the dataset class.
  caption_proportion:
    prompt: 1
  # No external caption / CLIP-score suffixes are listed in this config.
  external_caption_suffixes: []
  external_clipscore_suffixes: []
  # NOTE(review): presumably only relevant when CLIP scores are available
  # (none are configured above) — confirm.
  clip_thr_temperature: 0.1
  clip_thr: 25.0
  load_text_feat: false
  load_vae_feat: true
  transform: default_train
  type: SanaWebDatasetMS
  sort_dataset: false
# Model config.
model:
  model: SanaMS_600M_P1_D28
  # Same value as data.image_size (1024) in this file.
  image_size: 1024
  mixed_precision: fp16  # ['fp16', 'fp32', 'bf16']
  fp32_attention: true
  # No checkpoint paths are set. Written as explicit null instead of a bare
  # `key:` so the empty value cannot be mistaken for a forgotten entry.
  load_from: null
  resume_from: null
  aspect_ratio_type: ASPECT_RATIO_1024
  multi_scale: true
  attn_type: linear
  ffn_type: glumbconv
  # The third entry is null; spelled out so the list does not look truncated.
  mlp_acts:
    - silu
    - silu
    - null
  mlp_ratio: 2.5
  use_pe: false
  qk_norm: false
  class_dropout_prob: 0.1
  # CFG & PAG settings
  pag_applied_layers:
    - 14
# VAE settings.
vae:
  vae_type: dc-ae
  # NOTE(review): looks like a hub-style "org/name" identifier — confirm how
  # the loader resolves it.
  vae_pretrained: mit-han-lab/dc-ae-f32c32-sana-1.0
  scale_factor: 0.41407
  # Both are 32, presumably corresponding to the "f32c32" in the pretrained
  # name above — verify before changing either one.
  vae_latent_dim: 32
  vae_downsample_rate: 32
  sample_posterior: true
# Text encoder.
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300
  # CHI
  # Instruction text stored as a list of strings. NOTE(review): the last
  # entry ends with 'User Prompt: ', so the entries are presumably joined and
  # placed in front of the user's prompt — confirm how the consumer
  # assembles them. The strings are runtime text; edit with care.
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    - 'User Prompt: '
# Sana schedule (flow).
scheduler:
  predict_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 4.0
  # Logit-normal timestep weighting. logit_mean / logit_std presumably
  # parameterize that distribution — confirm against the training code.
  weighting_scheme: logit_normal
  logit_mean: 0.0
  logit_std: 1.0
  # NOTE(review): by its name, the sampler used when generating
  # visualization samples — confirm.
  vis_sampler: flow_dpm-solver
# Training settings.
train:
  num_workers: 10
  seed: 1
  train_batch_size: 64
  num_epochs: 100
  gradient_accumulation_steps: 1
  grad_checkpointing: true
  gradient_clip: 0.1
  optimizer:
    # Three betas and two eps values are given. NOTE(review): presumably the
    # shape the optimizer named in `type` below expects — verify before
    # changing the list lengths.
    betas:
      - 0.9
      - 0.999
      - 0.9999
    eps:
      - 1.0e-30
      - 1.0e-16
    lr: 0.0001
    type: CAMEWrapper
    weight_decay: 0.0
  # Constant schedule, with 2000 warmup steps passed as its arguments.
  lr_schedule: constant
  lr_schedule_args:
    num_warmup_steps: 2000
  local_save_vis: true  # whether to save logged images locally
  visualize: true
  eval_sampling_steps: 500
  log_interval: 20
  # Both an epoch-based (5) and a step-based (500) save interval are set.
  save_model_epochs: 5
  save_model_steps: 500
  # NOTE(review): "debug" output path — likely meant to be overridden per run.
  work_dir: output/debug
  # Online metric is disabled here; the two settings below presumably only
  # take effect when it is enabled — confirm.
  online_metric: false
  eval_metric_step: 2000
  online_metric_dir: metric_helper