{ "d_model": 192, "num_layers": 2, "T_local": 3, "cluster_size": 4, "batch_size": 48, "learning_rate": 4.74e-4, "weight_decay": 0.0381, "dropout": { "embedding": 0.4, "local_aggregation": 0.3, "attention": 0.3, "final": 0.4 }, "seq_len": 768 }