{
  "d_model": 192,
  "num_layers": 2,
  "T_local": 3,
  "cluster_size": 4,
  "batch_size": 48,
  "learning_rate": 4.74e-4,
  "weight_decay": 0.0381,
  "dropout": {
    "embedding": 0.4,
    "local_aggregation": 0.3,
    "attention": 0.3,
    "final": 0.4
  },
  "seq_len": 256
}