# Version of this configuration file.
# NOTE(review): the unquoted value loads as the float 1.0, not the string
# "1.0" - confirm that is what the consumer expects before quoting it.
version: 1.0

# Name of the system this config selects.
# NOTE(review): presumably the cross-attention variant, given the CrossAttn*
# block types under model.unet - verify against the code that reads this key.
system: "cross"

# Model hyperparameters: an embedding module plus a single-block UNet.
model:
  # content_dim matches unet.cross_attention_dim (768) and content_hidden
  # matches unet.in_channels / out_channels (256).
  # NOTE(review): presumably these pairs must be kept in sync - confirm.
  cls_embedding:
    content_dim: 768
    content_hidden: 256

  # NOTE(review): the key names look like Hugging Face diffusers
  # UNet2DConditionModel arguments - confirm against the consuming code.
  unet:
    sample_size: [1, 1]
    in_channels: 256
    out_channels: 256
    layers_per_block: 2
    # Single-entry lists: one output width, one down block, one up block.
    block_out_channels: [256]
    down_block_types:
      - "CrossAttnDownBlock2D"
    up_block_types:
      - "CrossAttnUpBlock2D"
    attention_head_dim: 32
    cross_attention_dim: 768

# Noise-scheduler settings for training and inference.
# NOTE(review): several keys (beta_*, rescale_betas_zero_snr,
# timestep_spacing, clip_sample, prediction_type) look like Hugging Face
# diffusers scheduler arguments; num_train_steps, num_infer_steps, scale and
# shift are presumably consumed by project code - confirm against the loader.
scheduler:
  num_train_steps: 1000
  beta_schedule: "linear"
  beta_start: 0.0001
  beta_end: 0.02
  num_infer_steps: 50
  rescale_betas_zero_snr: true
  timestep_spacing: "trailing"
  clip_sample: false
  prediction_type: "v_prediction"
  # NOTE(review): scale/shift look like an affine normalisation applied to the
  # data - confirm where and in which order they are applied.
  scale: 0.05
  shift: -0.035