{ "best_metric": 0.6368554830551147, "best_model_checkpoint": "omarmomen/sf_babylm_1/finetune/mnli-mm/checkpoint-10600", "epoch": 3.103448275862069, "global_step": 12600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "eval_accuracy": 0.4094525873661041, "eval_loss": 1.0740591287612915, "eval_runtime": 8.0395, "eval_samples_per_second": 781.642, "eval_steps_per_second": 97.767, "step": 200 }, { "epoch": 0.1, "eval_accuracy": 0.5098663568496704, "eval_loss": 0.9865565896034241, "eval_runtime": 8.0625, "eval_samples_per_second": 779.412, "eval_steps_per_second": 97.488, "step": 400 }, { "epoch": 0.12, "learning_rate": 4.938423645320197e-05, "loss": 1.0601, "step": 500 }, { "epoch": 0.15, "eval_accuracy": 0.5108211040496826, "eval_loss": 0.9675168395042419, "eval_runtime": 8.16, "eval_samples_per_second": 770.097, "eval_steps_per_second": 96.323, "step": 600 }, { "epoch": 0.2, "eval_accuracy": 0.5381922125816345, "eval_loss": 0.9379056096076965, "eval_runtime": 8.156, "eval_samples_per_second": 770.471, "eval_steps_per_second": 96.37, "step": 800 }, { "epoch": 0.25, "learning_rate": 4.876847290640394e-05, "loss": 0.9691, "step": 1000 }, { "epoch": 0.25, "eval_accuracy": 0.547581136226654, "eval_loss": 0.9261330962181091, "eval_runtime": 8.0508, "eval_samples_per_second": 780.547, "eval_steps_per_second": 97.63, "step": 1000 }, { "epoch": 0.3, "eval_accuracy": 0.5571292042732239, "eval_loss": 0.903211236000061, "eval_runtime": 8.0951, "eval_samples_per_second": 776.268, "eval_steps_per_second": 97.095, "step": 1200 }, { "epoch": 0.34, "eval_accuracy": 0.5545830726623535, "eval_loss": 0.9244930744171143, "eval_runtime": 8.0553, "eval_samples_per_second": 780.111, "eval_steps_per_second": 97.576, "step": 1400 }, { "epoch": 0.37, "learning_rate": 4.8152709359605915e-05, "loss": 0.9402, "step": 1500 }, { "epoch": 0.39, "eval_accuracy": 0.5622215270996094, "eval_loss": 0.9127561450004578, "eval_runtime": 8.1598, "eval_samples_per_second": 770.116, "eval_steps_per_second": 96.326, "step": 1600 }, { "epoch": 0.44, "eval_accuracy": 0.5660407543182373, "eval_loss": 0.9042630195617676, "eval_runtime": 8.1639, "eval_samples_per_second": 769.729, "eval_steps_per_second": 96.277, "step": 1800 }, { "epoch": 0.49, "learning_rate": 4.753694581280788e-05, "loss": 0.9195, "step": 2000 }, { "epoch": 0.49, "eval_accuracy": 0.5647676587104797, "eval_loss": 0.9146111011505127, "eval_runtime": 8.1205, "eval_samples_per_second": 773.846, "eval_steps_per_second": 96.792, "step": 2000 }, { "epoch": 0.54, "eval_accuracy": 0.5862507820129395, "eval_loss": 0.8741610646247864, "eval_runtime": 8.158, "eval_samples_per_second": 770.283, "eval_steps_per_second": 96.347, "step": 2200 }, { "epoch": 0.59, "eval_accuracy": 0.5868873596191406, "eval_loss": 0.8720157146453857, "eval_runtime": 8.1301, "eval_samples_per_second": 772.932, "eval_steps_per_second": 96.678, "step": 2400 }, { "epoch": 0.62, "learning_rate": 4.6921182266009855e-05, "loss": 0.9024, "step": 2500 }, { "epoch": 0.64, "eval_accuracy": 0.5878421664237976, "eval_loss": 0.8731514811515808, "eval_runtime": 8.0908, "eval_samples_per_second": 776.687, "eval_steps_per_second": 97.148, "step": 2600 }, { "epoch": 0.69, "eval_accuracy": 0.5845003128051758, "eval_loss": 0.8767739534378052, "eval_runtime": 8.1003, "eval_samples_per_second": 775.772, "eval_steps_per_second": 97.033, "step": 2800 }, { "epoch": 0.74, "learning_rate": 4.630541871921182e-05, "loss": 0.8949, "step": 3000 }, { "epoch": 0.74, "eval_accuracy": 0.5845003128051758, "eval_loss": 0.8801184296607971, "eval_runtime": 8.0801, "eval_samples_per_second": 777.716, "eval_steps_per_second": 97.276, "step": 3000 }, { "epoch": 0.79, "eval_accuracy": 0.6012094020843506, "eval_loss": 0.8618054986000061, "eval_runtime": 8.0659, "eval_samples_per_second": 779.08, "eval_steps_per_second": 97.447, "step": 3200 }, { "epoch": 0.84, "eval_accuracy": 0.5973901748657227, "eval_loss": 0.8645357489585876, "eval_runtime": 8.0652, "eval_samples_per_second": 779.15, "eval_steps_per_second": 97.456, "step": 3400 }, { "epoch": 0.86, "learning_rate": 4.5689655172413794e-05, "loss": 0.8858, "step": 3500 }, { "epoch": 0.89, "eval_accuracy": 0.5957988500595093, "eval_loss": 0.8610660433769226, "eval_runtime": 8.0809, "eval_samples_per_second": 777.636, "eval_steps_per_second": 97.266, "step": 3600 }, { "epoch": 0.94, "eval_accuracy": 0.5907065272331238, "eval_loss": 0.868955671787262, "eval_runtime": 8.061, "eval_samples_per_second": 779.553, "eval_steps_per_second": 97.506, "step": 3800 }, { "epoch": 0.99, "learning_rate": 4.507389162561577e-05, "loss": 0.8806, "step": 4000 }, { "epoch": 0.99, "eval_accuracy": 0.6093252897262573, "eval_loss": 0.8431282043457031, "eval_runtime": 8.0923, "eval_samples_per_second": 776.543, "eval_steps_per_second": 97.13, "step": 4000 }, { "epoch": 1.03, "eval_accuracy": 0.5980267524719238, "eval_loss": 0.8728816509246826, "eval_runtime": 8.1474, "eval_samples_per_second": 771.292, "eval_steps_per_second": 96.473, "step": 4200 }, { "epoch": 1.08, "eval_accuracy": 0.6032781600952148, "eval_loss": 0.8630015254020691, "eval_runtime": 8.1598, "eval_samples_per_second": 770.121, "eval_steps_per_second": 96.326, "step": 4400 }, { "epoch": 1.11, "learning_rate": 4.4458128078817734e-05, "loss": 0.828, "step": 4500 }, { "epoch": 1.13, "eval_accuracy": 0.6098026633262634, "eval_loss": 0.8493590950965881, "eval_runtime": 8.1533, "eval_samples_per_second": 770.73, "eval_steps_per_second": 96.403, "step": 4600 }, { "epoch": 1.18, "eval_accuracy": 0.6063017249107361, "eval_loss": 0.848625898361206, "eval_runtime": 8.1785, "eval_samples_per_second": 768.354, "eval_steps_per_second": 96.105, "step": 4800 }, { "epoch": 1.23, "learning_rate": 4.384236453201971e-05, "loss": 0.8119, "step": 5000 }, { "epoch": 1.23, "eval_accuracy": 0.5962762832641602, "eval_loss": 0.8927779793739319, "eval_runtime": 8.192, "eval_samples_per_second": 767.092, "eval_steps_per_second": 95.948, "step": 5000 }, { "epoch": 1.28, "eval_accuracy": 0.6140993237495422, "eval_loss": 0.8620697259902954, "eval_runtime": 10.5649, "eval_samples_per_second": 594.799, "eval_steps_per_second": 74.397, "step": 5200 }, { "epoch": 1.33, "eval_accuracy": 0.6096435189247131, "eval_loss": 0.8553202748298645, "eval_runtime": 10.5461, "eval_samples_per_second": 595.862, "eval_steps_per_second": 74.53, "step": 5400 }, { "epoch": 1.35, "learning_rate": 4.3226600985221674e-05, "loss": 0.8043, "step": 5500 }, { "epoch": 1.38, "eval_accuracy": 0.5985041260719299, "eval_loss": 0.8708285689353943, "eval_runtime": 10.6457, "eval_samples_per_second": 590.283, "eval_steps_per_second": 73.832, "step": 5600 }, { "epoch": 1.43, "eval_accuracy": 0.6225334405899048, "eval_loss": 0.8417609930038452, "eval_runtime": 10.6407, "eval_samples_per_second": 590.561, "eval_steps_per_second": 73.867, "step": 5800 }, { "epoch": 1.48, "learning_rate": 4.261083743842365e-05, "loss": 0.8077, "step": 6000 }, { "epoch": 1.48, "eval_accuracy": 0.6037555932998657, "eval_loss": 0.8710538148880005, "eval_runtime": 10.5553, "eval_samples_per_second": 595.342, "eval_steps_per_second": 74.465, "step": 6000 }, { "epoch": 1.53, "eval_accuracy": 0.6029598712921143, "eval_loss": 0.8956546783447266, "eval_runtime": 10.6119, "eval_samples_per_second": 592.164, "eval_steps_per_second": 74.068, "step": 6200 }, { "epoch": 1.58, "eval_accuracy": 0.612030565738678, "eval_loss": 0.8615825176239014, "eval_runtime": 10.5375, "eval_samples_per_second": 596.345, "eval_steps_per_second": 74.591, "step": 6400 }, { "epoch": 1.6, "learning_rate": 4.199507389162562e-05, "loss": 0.8097, "step": 6500 }, { "epoch": 1.63, "eval_accuracy": 0.6109166145324707, "eval_loss": 0.8452191352844238, "eval_runtime": 10.6063, "eval_samples_per_second": 592.478, "eval_steps_per_second": 74.107, "step": 6600 }, { "epoch": 1.67, "eval_accuracy": 0.5973901748657227, "eval_loss": 0.9005106091499329, "eval_runtime": 10.558, "eval_samples_per_second": 595.188, "eval_steps_per_second": 74.446, "step": 6800 }, { "epoch": 1.72, "learning_rate": 4.1379310344827587e-05, "loss": 0.8079, "step": 7000 }, { "epoch": 1.72, "eval_accuracy": 0.6225334405899048, "eval_loss": 0.8366883993148804, "eval_runtime": 10.5514, "eval_samples_per_second": 595.561, "eval_steps_per_second": 74.493, "step": 7000 }, { "epoch": 1.77, "eval_accuracy": 0.6128262281417847, "eval_loss": 0.8606940507888794, "eval_runtime": 10.5796, "eval_samples_per_second": 593.974, "eval_steps_per_second": 74.294, "step": 7200 }, { "epoch": 1.82, "eval_accuracy": 0.6288987994194031, "eval_loss": 0.8481429815292358, "eval_runtime": 10.5474, "eval_samples_per_second": 595.785, "eval_steps_per_second": 74.521, "step": 7400 }, { "epoch": 1.85, "learning_rate": 4.076354679802955e-05, "loss": 0.8049, "step": 7500 }, { "epoch": 1.87, "eval_accuracy": 0.6090070009231567, "eval_loss": 0.8603523969650269, "eval_runtime": 10.5286, "eval_samples_per_second": 596.85, "eval_steps_per_second": 74.654, "step": 7600 }, { "epoch": 1.92, "eval_accuracy": 0.6255569458007812, "eval_loss": 0.8576852679252625, "eval_runtime": 10.5749, "eval_samples_per_second": 594.235, "eval_steps_per_second": 74.327, "step": 7800 }, { "epoch": 1.97, "learning_rate": 4.014778325123153e-05, "loss": 0.8012, "step": 8000 }, { "epoch": 1.97, "eval_accuracy": 0.6320814490318298, "eval_loss": 0.8284709453582764, "eval_runtime": 10.5826, "eval_samples_per_second": 593.804, "eval_steps_per_second": 74.273, "step": 8000 }, { "epoch": 2.02, "eval_accuracy": 0.6187142133712769, "eval_loss": 0.8966588377952576, "eval_runtime": 10.6039, "eval_samples_per_second": 592.614, "eval_steps_per_second": 74.124, "step": 8200 }, { "epoch": 2.07, "eval_accuracy": 0.6199872493743896, "eval_loss": 0.8712503910064697, "eval_runtime": 10.4657, "eval_samples_per_second": 600.435, "eval_steps_per_second": 75.102, "step": 8400 }, { "epoch": 2.09, "learning_rate": 3.95320197044335e-05, "loss": 0.7301, "step": 8500 }, { "epoch": 2.12, "eval_accuracy": 0.6012094020843506, "eval_loss": 0.9067429304122925, "eval_runtime": 10.4821, "eval_samples_per_second": 599.496, "eval_steps_per_second": 74.985, "step": 8600 }, { "epoch": 2.17, "eval_accuracy": 0.6177594065666199, "eval_loss": 0.8801796436309814, "eval_runtime": 10.6296, "eval_samples_per_second": 591.178, "eval_steps_per_second": 73.944, "step": 8800 }, { "epoch": 2.22, "learning_rate": 3.891625615763547e-05, "loss": 0.7109, "step": 9000 }, { "epoch": 2.22, "eval_accuracy": 0.6032781600952148, "eval_loss": 0.9191064834594727, "eval_runtime": 10.5447, "eval_samples_per_second": 595.938, "eval_steps_per_second": 74.54, "step": 9000 }, { "epoch": 2.27, "eval_accuracy": 0.6131445169448853, "eval_loss": 0.9177576899528503, "eval_runtime": 10.5787, "eval_samples_per_second": 594.023, "eval_steps_per_second": 74.3, "step": 9200 }, { "epoch": 2.32, "eval_accuracy": 0.6055060625076294, "eval_loss": 0.9652428030967712, "eval_runtime": 10.5388, "eval_samples_per_second": 596.274, "eval_steps_per_second": 74.582, "step": 9400 }, { "epoch": 2.34, "learning_rate": 3.830049261083744e-05, "loss": 0.7117, "step": 9500 }, { "epoch": 2.36, "eval_accuracy": 0.6126670837402344, "eval_loss": 0.9153077602386475, "eval_runtime": 10.5723, "eval_samples_per_second": 594.385, "eval_steps_per_second": 74.345, "step": 9600 }, { "epoch": 2.41, "eval_accuracy": 0.6225334405899048, "eval_loss": 0.8883262276649475, "eval_runtime": 10.6256, "eval_samples_per_second": 591.403, "eval_steps_per_second": 73.972, "step": 9800 }, { "epoch": 2.46, "learning_rate": 3.768472906403941e-05, "loss": 0.7093, "step": 10000 }, { "epoch": 2.46, "eval_accuracy": 0.6352641582489014, "eval_loss": 0.8633843064308167, "eval_runtime": 10.563, "eval_samples_per_second": 594.908, "eval_steps_per_second": 74.411, "step": 10000 }, { "epoch": 2.51, "eval_accuracy": 0.6166454553604126, "eval_loss": 0.8823496103286743, "eval_runtime": 10.5755, "eval_samples_per_second": 594.203, "eval_steps_per_second": 74.323, "step": 10200 }, { "epoch": 2.56, "eval_accuracy": 0.6164863109588623, "eval_loss": 0.8820719122886658, "eval_runtime": 10.5271, "eval_samples_per_second": 596.937, "eval_steps_per_second": 74.665, "step": 10400 }, { "epoch": 2.59, "learning_rate": 3.7068965517241385e-05, "loss": 0.7217, "step": 10500 }, { "epoch": 2.61, "eval_accuracy": 0.6368554830551147, "eval_loss": 0.8529412746429443, "eval_runtime": 10.5265, "eval_samples_per_second": 596.97, "eval_steps_per_second": 74.669, "step": 10600 }, { "epoch": 2.66, "eval_accuracy": 0.6188733577728271, "eval_loss": 0.873248279094696, "eval_runtime": 10.534, "eval_samples_per_second": 596.546, "eval_steps_per_second": 74.616, "step": 10800 }, { "epoch": 2.71, "learning_rate": 3.645320197044335e-05, "loss": 0.716, "step": 11000 }, { "epoch": 2.71, "eval_accuracy": 0.6086887121200562, "eval_loss": 0.9195781350135803, "eval_runtime": 10.6161, "eval_samples_per_second": 591.928, "eval_steps_per_second": 74.038, "step": 11000 }, { "epoch": 2.76, "eval_accuracy": 0.6206238269805908, "eval_loss": 0.8756206631660461, "eval_runtime": 10.4293, "eval_samples_per_second": 602.536, "eval_steps_per_second": 75.365, "step": 11200 }, { "epoch": 2.81, "eval_accuracy": 0.632718026638031, "eval_loss": 0.8536378741264343, "eval_runtime": 10.4816, "eval_samples_per_second": 599.524, "eval_steps_per_second": 74.988, "step": 11400 }, { "epoch": 2.83, "learning_rate": 3.583743842364532e-05, "loss": 0.718, "step": 11500 }, { "epoch": 2.86, "eval_accuracy": 0.6323997378349304, "eval_loss": 0.8347600102424622, "eval_runtime": 10.5944, "eval_samples_per_second": 593.143, "eval_steps_per_second": 74.19, "step": 11600 }, { "epoch": 2.91, "eval_accuracy": 0.6357415914535522, "eval_loss": 0.8483295440673828, "eval_runtime": 10.5051, "eval_samples_per_second": 598.187, "eval_steps_per_second": 74.821, "step": 11800 }, { "epoch": 2.96, "learning_rate": 3.522167487684729e-05, "loss": 0.7122, "step": 12000 }, { "epoch": 2.96, "eval_accuracy": 0.6290579438209534, "eval_loss": 0.8905266523361206, "eval_runtime": 10.5623, "eval_samples_per_second": 594.944, "eval_steps_per_second": 74.415, "step": 12000 }, { "epoch": 3.0, "eval_accuracy": 0.6179185509681702, "eval_loss": 0.9339210391044617, "eval_runtime": 10.5728, "eval_samples_per_second": 594.358, "eval_steps_per_second": 74.342, "step": 12200 }, { "epoch": 3.05, "eval_accuracy": 0.6196690201759338, "eval_loss": 0.9666189551353455, "eval_runtime": 10.5432, "eval_samples_per_second": 596.024, "eval_steps_per_second": 74.55, "step": 12400 }, { "epoch": 3.08, "learning_rate": 3.4605911330049265e-05, "loss": 0.6537, "step": 12500 }, { "epoch": 3.1, "eval_accuracy": 0.6268300414085388, "eval_loss": 0.9578396677970886, "eval_runtime": 10.5774, "eval_samples_per_second": 594.096, "eval_steps_per_second": 74.309, "step": 12600 }, { "epoch": 3.1, "step": 12600, "total_flos": 2.066089927214592e+16, "train_loss": 0.8108822553119962, "train_runtime": 3034.6018, "train_samples_per_second": 856.06, "train_steps_per_second": 13.379 } ], "max_steps": 40600, "num_train_epochs": 10, "total_flos": 2.066089927214592e+16, "trial_name": null, "trial_params": null }