{ "addition_attention": true, "attention_resolutions": [ 4, 2, 1 ], "channel_mult": [ 1, 2, 4, 4 ], "context_dim": 1024, "conv_resample": true, "default_fs": 24, "dims": 2, "dropout": 0.1, "fs_condition": true, "image_cross_attention": true, "image_cross_attention_scale_learnable": false, "in_channels": 8, "model_channels": 320, "num_head_channels": 64, "num_heads": -1, "num_res_blocks": 2, "out_channels": 4, "resblock_updown": false, "temporal_attention": true, "temporal_conv": true, "temporal_length": 16, "temporal_selfatt_only": true, "tempspatial_aware": false, "transformer_depth": 1, "use_causal_attention": false, "use_linear": true, "use_relative_position": false, "use_scale_shift_norm": false }