{
  "addition_attention": true,
  "attention_resolutions": [
    4,
    2,
    1
  ],
  "channel_mult": [
    1,
    2,
    4,
    4
  ],
  "context_dim": 1024,
  "conv_resample": true,
  "default_fs": 24,
  "dims": 2,
  "dropout": 0.1,
  "fs_condition": true,
  "image_cross_attention": true,
  "image_cross_attention_scale_learnable": false,
  "in_channels": 8,
  "model_channels": 320,
  "num_head_channels": 64,
  "num_heads": -1,
  "num_res_blocks": 2,
  "out_channels": 4,
  "resblock_updown": false,
  "temporal_attention": true,
  "temporal_conv": true,
  "temporal_length": 16,
  "temporal_selfatt_only": true,
  "tempspatial_aware": false,
  "transformer_depth": 1,
  "use_causal_attention": false,
  "use_linear": true,
  "use_relative_position": false,
  "use_scale_shift_norm": false
}