---
# Model configuration.
#
# NOTE(review): this file was reconstructed from a copy in which all line
# breaks had been collapsed. The nesting below is the most plausible reading
# (`cls_embedding` and `unet` both children of `diffwrap`) -- confirm against
# the code that loads this config.

# NOTE(review): unquoted, so this loads as the float 1.0. Quote it ("1.0") if
# the consumer expects a version *string* -- verify before changing the type.
version: 1.0
system: "cross"

diffwrap:
  # Embedding settings for content and (optionally) pitch inputs.
  cls_embedding:
    content_dim: 768
    content_hidden: 256
    use_pitch: true
    pitch_dim: 1
    pitch_hidden: 128

  # UNet settings. The key names match the constructor arguments of
  # Hugging Face diffusers' UNet2DConditionModel -- presumably passed
  # through to it; TODO confirm.
  unet:
    sample_size: [100, 256]
    # 385 = 256 + 128 + 1; presumably content_hidden + pitch_hidden + one
    # extra channel -- keep in sync with cls_embedding if so (verify).
    in_channels: 385
    out_channels: 1
    layers_per_block: 2
    block_out_channels: [128, 256, 512]
    # One entry per element of block_out_channels.
    down_block_types:
      - "DownBlock2D"
      - "CrossAttnDownBlock2D"
      - "CrossAttnDownBlock2D"
    up_block_types:
      - "CrossAttnUpBlock2D"
      - "CrossAttnUpBlock2D"
      - "UpBlock2D"
    attention_head_dim: 32
    cross_attention_dim: 768