sf_babylm_1 / finetune /mnli-mm /trainer_state.json
Omar
upload results
efff561
{
"best_metric": 0.6368554830551147,
"best_model_checkpoint": "omarmomen/sf_babylm_1/finetune/mnli-mm/checkpoint-10600",
"epoch": 3.103448275862069,
"global_step": 12600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05,
"eval_accuracy": 0.4094525873661041,
"eval_loss": 1.0740591287612915,
"eval_runtime": 8.0395,
"eval_samples_per_second": 781.642,
"eval_steps_per_second": 97.767,
"step": 200
},
{
"epoch": 0.1,
"eval_accuracy": 0.5098663568496704,
"eval_loss": 0.9865565896034241,
"eval_runtime": 8.0625,
"eval_samples_per_second": 779.412,
"eval_steps_per_second": 97.488,
"step": 400
},
{
"epoch": 0.12,
"learning_rate": 4.938423645320197e-05,
"loss": 1.0601,
"step": 500
},
{
"epoch": 0.15,
"eval_accuracy": 0.5108211040496826,
"eval_loss": 0.9675168395042419,
"eval_runtime": 8.16,
"eval_samples_per_second": 770.097,
"eval_steps_per_second": 96.323,
"step": 600
},
{
"epoch": 0.2,
"eval_accuracy": 0.5381922125816345,
"eval_loss": 0.9379056096076965,
"eval_runtime": 8.156,
"eval_samples_per_second": 770.471,
"eval_steps_per_second": 96.37,
"step": 800
},
{
"epoch": 0.25,
"learning_rate": 4.876847290640394e-05,
"loss": 0.9691,
"step": 1000
},
{
"epoch": 0.25,
"eval_accuracy": 0.547581136226654,
"eval_loss": 0.9261330962181091,
"eval_runtime": 8.0508,
"eval_samples_per_second": 780.547,
"eval_steps_per_second": 97.63,
"step": 1000
},
{
"epoch": 0.3,
"eval_accuracy": 0.5571292042732239,
"eval_loss": 0.903211236000061,
"eval_runtime": 8.0951,
"eval_samples_per_second": 776.268,
"eval_steps_per_second": 97.095,
"step": 1200
},
{
"epoch": 0.34,
"eval_accuracy": 0.5545830726623535,
"eval_loss": 0.9244930744171143,
"eval_runtime": 8.0553,
"eval_samples_per_second": 780.111,
"eval_steps_per_second": 97.576,
"step": 1400
},
{
"epoch": 0.37,
"learning_rate": 4.8152709359605915e-05,
"loss": 0.9402,
"step": 1500
},
{
"epoch": 0.39,
"eval_accuracy": 0.5622215270996094,
"eval_loss": 0.9127561450004578,
"eval_runtime": 8.1598,
"eval_samples_per_second": 770.116,
"eval_steps_per_second": 96.326,
"step": 1600
},
{
"epoch": 0.44,
"eval_accuracy": 0.5660407543182373,
"eval_loss": 0.9042630195617676,
"eval_runtime": 8.1639,
"eval_samples_per_second": 769.729,
"eval_steps_per_second": 96.277,
"step": 1800
},
{
"epoch": 0.49,
"learning_rate": 4.753694581280788e-05,
"loss": 0.9195,
"step": 2000
},
{
"epoch": 0.49,
"eval_accuracy": 0.5647676587104797,
"eval_loss": 0.9146111011505127,
"eval_runtime": 8.1205,
"eval_samples_per_second": 773.846,
"eval_steps_per_second": 96.792,
"step": 2000
},
{
"epoch": 0.54,
"eval_accuracy": 0.5862507820129395,
"eval_loss": 0.8741610646247864,
"eval_runtime": 8.158,
"eval_samples_per_second": 770.283,
"eval_steps_per_second": 96.347,
"step": 2200
},
{
"epoch": 0.59,
"eval_accuracy": 0.5868873596191406,
"eval_loss": 0.8720157146453857,
"eval_runtime": 8.1301,
"eval_samples_per_second": 772.932,
"eval_steps_per_second": 96.678,
"step": 2400
},
{
"epoch": 0.62,
"learning_rate": 4.6921182266009855e-05,
"loss": 0.9024,
"step": 2500
},
{
"epoch": 0.64,
"eval_accuracy": 0.5878421664237976,
"eval_loss": 0.8731514811515808,
"eval_runtime": 8.0908,
"eval_samples_per_second": 776.687,
"eval_steps_per_second": 97.148,
"step": 2600
},
{
"epoch": 0.69,
"eval_accuracy": 0.5845003128051758,
"eval_loss": 0.8767739534378052,
"eval_runtime": 8.1003,
"eval_samples_per_second": 775.772,
"eval_steps_per_second": 97.033,
"step": 2800
},
{
"epoch": 0.74,
"learning_rate": 4.630541871921182e-05,
"loss": 0.8949,
"step": 3000
},
{
"epoch": 0.74,
"eval_accuracy": 0.5845003128051758,
"eval_loss": 0.8801184296607971,
"eval_runtime": 8.0801,
"eval_samples_per_second": 777.716,
"eval_steps_per_second": 97.276,
"step": 3000
},
{
"epoch": 0.79,
"eval_accuracy": 0.6012094020843506,
"eval_loss": 0.8618054986000061,
"eval_runtime": 8.0659,
"eval_samples_per_second": 779.08,
"eval_steps_per_second": 97.447,
"step": 3200
},
{
"epoch": 0.84,
"eval_accuracy": 0.5973901748657227,
"eval_loss": 0.8645357489585876,
"eval_runtime": 8.0652,
"eval_samples_per_second": 779.15,
"eval_steps_per_second": 97.456,
"step": 3400
},
{
"epoch": 0.86,
"learning_rate": 4.5689655172413794e-05,
"loss": 0.8858,
"step": 3500
},
{
"epoch": 0.89,
"eval_accuracy": 0.5957988500595093,
"eval_loss": 0.8610660433769226,
"eval_runtime": 8.0809,
"eval_samples_per_second": 777.636,
"eval_steps_per_second": 97.266,
"step": 3600
},
{
"epoch": 0.94,
"eval_accuracy": 0.5907065272331238,
"eval_loss": 0.868955671787262,
"eval_runtime": 8.061,
"eval_samples_per_second": 779.553,
"eval_steps_per_second": 97.506,
"step": 3800
},
{
"epoch": 0.99,
"learning_rate": 4.507389162561577e-05,
"loss": 0.8806,
"step": 4000
},
{
"epoch": 0.99,
"eval_accuracy": 0.6093252897262573,
"eval_loss": 0.8431282043457031,
"eval_runtime": 8.0923,
"eval_samples_per_second": 776.543,
"eval_steps_per_second": 97.13,
"step": 4000
},
{
"epoch": 1.03,
"eval_accuracy": 0.5980267524719238,
"eval_loss": 0.8728816509246826,
"eval_runtime": 8.1474,
"eval_samples_per_second": 771.292,
"eval_steps_per_second": 96.473,
"step": 4200
},
{
"epoch": 1.08,
"eval_accuracy": 0.6032781600952148,
"eval_loss": 0.8630015254020691,
"eval_runtime": 8.1598,
"eval_samples_per_second": 770.121,
"eval_steps_per_second": 96.326,
"step": 4400
},
{
"epoch": 1.11,
"learning_rate": 4.4458128078817734e-05,
"loss": 0.828,
"step": 4500
},
{
"epoch": 1.13,
"eval_accuracy": 0.6098026633262634,
"eval_loss": 0.8493590950965881,
"eval_runtime": 8.1533,
"eval_samples_per_second": 770.73,
"eval_steps_per_second": 96.403,
"step": 4600
},
{
"epoch": 1.18,
"eval_accuracy": 0.6063017249107361,
"eval_loss": 0.848625898361206,
"eval_runtime": 8.1785,
"eval_samples_per_second": 768.354,
"eval_steps_per_second": 96.105,
"step": 4800
},
{
"epoch": 1.23,
"learning_rate": 4.384236453201971e-05,
"loss": 0.8119,
"step": 5000
},
{
"epoch": 1.23,
"eval_accuracy": 0.5962762832641602,
"eval_loss": 0.8927779793739319,
"eval_runtime": 8.192,
"eval_samples_per_second": 767.092,
"eval_steps_per_second": 95.948,
"step": 5000
},
{
"epoch": 1.28,
"eval_accuracy": 0.6140993237495422,
"eval_loss": 0.8620697259902954,
"eval_runtime": 10.5649,
"eval_samples_per_second": 594.799,
"eval_steps_per_second": 74.397,
"step": 5200
},
{
"epoch": 1.33,
"eval_accuracy": 0.6096435189247131,
"eval_loss": 0.8553202748298645,
"eval_runtime": 10.5461,
"eval_samples_per_second": 595.862,
"eval_steps_per_second": 74.53,
"step": 5400
},
{
"epoch": 1.35,
"learning_rate": 4.3226600985221674e-05,
"loss": 0.8043,
"step": 5500
},
{
"epoch": 1.38,
"eval_accuracy": 0.5985041260719299,
"eval_loss": 0.8708285689353943,
"eval_runtime": 10.6457,
"eval_samples_per_second": 590.283,
"eval_steps_per_second": 73.832,
"step": 5600
},
{
"epoch": 1.43,
"eval_accuracy": 0.6225334405899048,
"eval_loss": 0.8417609930038452,
"eval_runtime": 10.6407,
"eval_samples_per_second": 590.561,
"eval_steps_per_second": 73.867,
"step": 5800
},
{
"epoch": 1.48,
"learning_rate": 4.261083743842365e-05,
"loss": 0.8077,
"step": 6000
},
{
"epoch": 1.48,
"eval_accuracy": 0.6037555932998657,
"eval_loss": 0.8710538148880005,
"eval_runtime": 10.5553,
"eval_samples_per_second": 595.342,
"eval_steps_per_second": 74.465,
"step": 6000
},
{
"epoch": 1.53,
"eval_accuracy": 0.6029598712921143,
"eval_loss": 0.8956546783447266,
"eval_runtime": 10.6119,
"eval_samples_per_second": 592.164,
"eval_steps_per_second": 74.068,
"step": 6200
},
{
"epoch": 1.58,
"eval_accuracy": 0.612030565738678,
"eval_loss": 0.8615825176239014,
"eval_runtime": 10.5375,
"eval_samples_per_second": 596.345,
"eval_steps_per_second": 74.591,
"step": 6400
},
{
"epoch": 1.6,
"learning_rate": 4.199507389162562e-05,
"loss": 0.8097,
"step": 6500
},
{
"epoch": 1.63,
"eval_accuracy": 0.6109166145324707,
"eval_loss": 0.8452191352844238,
"eval_runtime": 10.6063,
"eval_samples_per_second": 592.478,
"eval_steps_per_second": 74.107,
"step": 6600
},
{
"epoch": 1.67,
"eval_accuracy": 0.5973901748657227,
"eval_loss": 0.9005106091499329,
"eval_runtime": 10.558,
"eval_samples_per_second": 595.188,
"eval_steps_per_second": 74.446,
"step": 6800
},
{
"epoch": 1.72,
"learning_rate": 4.1379310344827587e-05,
"loss": 0.8079,
"step": 7000
},
{
"epoch": 1.72,
"eval_accuracy": 0.6225334405899048,
"eval_loss": 0.8366883993148804,
"eval_runtime": 10.5514,
"eval_samples_per_second": 595.561,
"eval_steps_per_second": 74.493,
"step": 7000
},
{
"epoch": 1.77,
"eval_accuracy": 0.6128262281417847,
"eval_loss": 0.8606940507888794,
"eval_runtime": 10.5796,
"eval_samples_per_second": 593.974,
"eval_steps_per_second": 74.294,
"step": 7200
},
{
"epoch": 1.82,
"eval_accuracy": 0.6288987994194031,
"eval_loss": 0.8481429815292358,
"eval_runtime": 10.5474,
"eval_samples_per_second": 595.785,
"eval_steps_per_second": 74.521,
"step": 7400
},
{
"epoch": 1.85,
"learning_rate": 4.076354679802955e-05,
"loss": 0.8049,
"step": 7500
},
{
"epoch": 1.87,
"eval_accuracy": 0.6090070009231567,
"eval_loss": 0.8603523969650269,
"eval_runtime": 10.5286,
"eval_samples_per_second": 596.85,
"eval_steps_per_second": 74.654,
"step": 7600
},
{
"epoch": 1.92,
"eval_accuracy": 0.6255569458007812,
"eval_loss": 0.8576852679252625,
"eval_runtime": 10.5749,
"eval_samples_per_second": 594.235,
"eval_steps_per_second": 74.327,
"step": 7800
},
{
"epoch": 1.97,
"learning_rate": 4.014778325123153e-05,
"loss": 0.8012,
"step": 8000
},
{
"epoch": 1.97,
"eval_accuracy": 0.6320814490318298,
"eval_loss": 0.8284709453582764,
"eval_runtime": 10.5826,
"eval_samples_per_second": 593.804,
"eval_steps_per_second": 74.273,
"step": 8000
},
{
"epoch": 2.02,
"eval_accuracy": 0.6187142133712769,
"eval_loss": 0.8966588377952576,
"eval_runtime": 10.6039,
"eval_samples_per_second": 592.614,
"eval_steps_per_second": 74.124,
"step": 8200
},
{
"epoch": 2.07,
"eval_accuracy": 0.6199872493743896,
"eval_loss": 0.8712503910064697,
"eval_runtime": 10.4657,
"eval_samples_per_second": 600.435,
"eval_steps_per_second": 75.102,
"step": 8400
},
{
"epoch": 2.09,
"learning_rate": 3.95320197044335e-05,
"loss": 0.7301,
"step": 8500
},
{
"epoch": 2.12,
"eval_accuracy": 0.6012094020843506,
"eval_loss": 0.9067429304122925,
"eval_runtime": 10.4821,
"eval_samples_per_second": 599.496,
"eval_steps_per_second": 74.985,
"step": 8600
},
{
"epoch": 2.17,
"eval_accuracy": 0.6177594065666199,
"eval_loss": 0.8801796436309814,
"eval_runtime": 10.6296,
"eval_samples_per_second": 591.178,
"eval_steps_per_second": 73.944,
"step": 8800
},
{
"epoch": 2.22,
"learning_rate": 3.891625615763547e-05,
"loss": 0.7109,
"step": 9000
},
{
"epoch": 2.22,
"eval_accuracy": 0.6032781600952148,
"eval_loss": 0.9191064834594727,
"eval_runtime": 10.5447,
"eval_samples_per_second": 595.938,
"eval_steps_per_second": 74.54,
"step": 9000
},
{
"epoch": 2.27,
"eval_accuracy": 0.6131445169448853,
"eval_loss": 0.9177576899528503,
"eval_runtime": 10.5787,
"eval_samples_per_second": 594.023,
"eval_steps_per_second": 74.3,
"step": 9200
},
{
"epoch": 2.32,
"eval_accuracy": 0.6055060625076294,
"eval_loss": 0.9652428030967712,
"eval_runtime": 10.5388,
"eval_samples_per_second": 596.274,
"eval_steps_per_second": 74.582,
"step": 9400
},
{
"epoch": 2.34,
"learning_rate": 3.830049261083744e-05,
"loss": 0.7117,
"step": 9500
},
{
"epoch": 2.36,
"eval_accuracy": 0.6126670837402344,
"eval_loss": 0.9153077602386475,
"eval_runtime": 10.5723,
"eval_samples_per_second": 594.385,
"eval_steps_per_second": 74.345,
"step": 9600
},
{
"epoch": 2.41,
"eval_accuracy": 0.6225334405899048,
"eval_loss": 0.8883262276649475,
"eval_runtime": 10.6256,
"eval_samples_per_second": 591.403,
"eval_steps_per_second": 73.972,
"step": 9800
},
{
"epoch": 2.46,
"learning_rate": 3.768472906403941e-05,
"loss": 0.7093,
"step": 10000
},
{
"epoch": 2.46,
"eval_accuracy": 0.6352641582489014,
"eval_loss": 0.8633843064308167,
"eval_runtime": 10.563,
"eval_samples_per_second": 594.908,
"eval_steps_per_second": 74.411,
"step": 10000
},
{
"epoch": 2.51,
"eval_accuracy": 0.6166454553604126,
"eval_loss": 0.8823496103286743,
"eval_runtime": 10.5755,
"eval_samples_per_second": 594.203,
"eval_steps_per_second": 74.323,
"step": 10200
},
{
"epoch": 2.56,
"eval_accuracy": 0.6164863109588623,
"eval_loss": 0.8820719122886658,
"eval_runtime": 10.5271,
"eval_samples_per_second": 596.937,
"eval_steps_per_second": 74.665,
"step": 10400
},
{
"epoch": 2.59,
"learning_rate": 3.7068965517241385e-05,
"loss": 0.7217,
"step": 10500
},
{
"epoch": 2.61,
"eval_accuracy": 0.6368554830551147,
"eval_loss": 0.8529412746429443,
"eval_runtime": 10.5265,
"eval_samples_per_second": 596.97,
"eval_steps_per_second": 74.669,
"step": 10600
},
{
"epoch": 2.66,
"eval_accuracy": 0.6188733577728271,
"eval_loss": 0.873248279094696,
"eval_runtime": 10.534,
"eval_samples_per_second": 596.546,
"eval_steps_per_second": 74.616,
"step": 10800
},
{
"epoch": 2.71,
"learning_rate": 3.645320197044335e-05,
"loss": 0.716,
"step": 11000
},
{
"epoch": 2.71,
"eval_accuracy": 0.6086887121200562,
"eval_loss": 0.9195781350135803,
"eval_runtime": 10.6161,
"eval_samples_per_second": 591.928,
"eval_steps_per_second": 74.038,
"step": 11000
},
{
"epoch": 2.76,
"eval_accuracy": 0.6206238269805908,
"eval_loss": 0.8756206631660461,
"eval_runtime": 10.4293,
"eval_samples_per_second": 602.536,
"eval_steps_per_second": 75.365,
"step": 11200
},
{
"epoch": 2.81,
"eval_accuracy": 0.632718026638031,
"eval_loss": 0.8536378741264343,
"eval_runtime": 10.4816,
"eval_samples_per_second": 599.524,
"eval_steps_per_second": 74.988,
"step": 11400
},
{
"epoch": 2.83,
"learning_rate": 3.583743842364532e-05,
"loss": 0.718,
"step": 11500
},
{
"epoch": 2.86,
"eval_accuracy": 0.6323997378349304,
"eval_loss": 0.8347600102424622,
"eval_runtime": 10.5944,
"eval_samples_per_second": 593.143,
"eval_steps_per_second": 74.19,
"step": 11600
},
{
"epoch": 2.91,
"eval_accuracy": 0.6357415914535522,
"eval_loss": 0.8483295440673828,
"eval_runtime": 10.5051,
"eval_samples_per_second": 598.187,
"eval_steps_per_second": 74.821,
"step": 11800
},
{
"epoch": 2.96,
"learning_rate": 3.522167487684729e-05,
"loss": 0.7122,
"step": 12000
},
{
"epoch": 2.96,
"eval_accuracy": 0.6290579438209534,
"eval_loss": 0.8905266523361206,
"eval_runtime": 10.5623,
"eval_samples_per_second": 594.944,
"eval_steps_per_second": 74.415,
"step": 12000
},
{
"epoch": 3.0,
"eval_accuracy": 0.6179185509681702,
"eval_loss": 0.9339210391044617,
"eval_runtime": 10.5728,
"eval_samples_per_second": 594.358,
"eval_steps_per_second": 74.342,
"step": 12200
},
{
"epoch": 3.05,
"eval_accuracy": 0.6196690201759338,
"eval_loss": 0.9666189551353455,
"eval_runtime": 10.5432,
"eval_samples_per_second": 596.024,
"eval_steps_per_second": 74.55,
"step": 12400
},
{
"epoch": 3.08,
"learning_rate": 3.4605911330049265e-05,
"loss": 0.6537,
"step": 12500
},
{
"epoch": 3.1,
"eval_accuracy": 0.6268300414085388,
"eval_loss": 0.9578396677970886,
"eval_runtime": 10.5774,
"eval_samples_per_second": 594.096,
"eval_steps_per_second": 74.309,
"step": 12600
},
{
"epoch": 3.1,
"step": 12600,
"total_flos": 2.066089927214592e+16,
"train_loss": 0.8108822553119962,
"train_runtime": 3034.6018,
"train_samples_per_second": 856.06,
"train_steps_per_second": 13.379
}
],
"max_steps": 40600,
"num_train_epochs": 10,
"total_flos": 2.066089927214592e+16,
"trial_name": null,
"trial_params": null
}