---
# Model / scheduler configuration.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# loadable YAML (": " may not appear inside a plain scalar). The block
# structure below was reconstructed from key names and key order only; all
# keys and values are unchanged. Confirm the nesting against the code that
# loads this file -- in particular where `scheduler`, `scale` and `shift`
# are expected to live.

# NOTE(review): unquoted, so this loads as the float 1.0. Quote it ("1.0") if
# the consumer compares it as a string -- TODO confirm.
version: 1.0
system: "cross"

model:
  cls_embedding:
    content_dim: 768
    content_hidden: 256

  # NOTE(review): key names look like diffusers `UNet2DConditionModel`
  # constructor arguments -- confirm against the loader.
  unet:
    sample_size: [1, 1]
    in_channels: 256
    out_channels: 256
    layers_per_block: 2
    block_out_channels: [256]
    down_block_types:
      - "CrossAttnDownBlock2D"
    up_block_types:
      - "CrossAttnUpBlock2D"
    attention_head_dim: 32
    cross_attention_dim: 768

# NOTE(review): placed at top level as a sibling of `model`; the original
# indentation is lost, so it may instead belong under `model` -- TODO confirm.
scheduler:
  num_train_steps: 1000
  beta_schedule: "linear"
  beta_start: 0.0001
  beta_end: 0.02
  num_infer_steps: 50
  rescale_betas_zero_snr: true
  timestep_spacing: "trailing"
  clip_sample: false
  prediction_type: "v_prediction"

# NOTE(review): `scale` / `shift` followed the scheduler keys in the collapsed
# line. They are placed at top level here; they may instead belong under
# `scheduler` or `model` -- verify against the consumer before relying on this.
scale: 0.05
shift: -0.035