fi-rauma / trainer_state.json
codymd's picture
Updated model with double training data
235334a
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 25.31645569620253,
"eval_steps": 500,
"global_step": 42000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3,
"learning_rate": 4.969861362266426e-05,
"loss": 3.7006,
"step": 500
},
{
"epoch": 0.6,
"learning_rate": 4.939722724532851e-05,
"loss": 3.5872,
"step": 1000
},
{
"epoch": 0.9,
"learning_rate": 4.909584086799277e-05,
"loss": 3.4617,
"step": 1500
},
{
"epoch": 1.0,
"eval_bleu": 12.3227,
"eval_gen_len": 54.6075,
"eval_loss": 3.1113245487213135,
"eval_runtime": 122.2496,
"eval_samples_per_second": 3.272,
"eval_steps_per_second": 0.409,
"step": 1659
},
{
"epoch": 1.21,
"learning_rate": 4.8794454490657024e-05,
"loss": 3.298,
"step": 2000
},
{
"epoch": 1.51,
"learning_rate": 4.849306811332128e-05,
"loss": 3.2018,
"step": 2500
},
{
"epoch": 1.81,
"learning_rate": 4.8191681735985535e-05,
"loss": 3.1014,
"step": 3000
},
{
"epoch": 2.0,
"eval_bleu": 15.8487,
"eval_gen_len": 50.1125,
"eval_loss": 2.8111488819122314,
"eval_runtime": 92.4044,
"eval_samples_per_second": 4.329,
"eval_steps_per_second": 0.541,
"step": 3318
},
{
"epoch": 2.11,
"learning_rate": 4.789029535864979e-05,
"loss": 2.9998,
"step": 3500
},
{
"epoch": 2.41,
"learning_rate": 4.7588908981314046e-05,
"loss": 2.883,
"step": 4000
},
{
"epoch": 2.71,
"learning_rate": 4.7287522603978304e-05,
"loss": 2.8409,
"step": 4500
},
{
"epoch": 3.0,
"eval_bleu": 20.5509,
"eval_gen_len": 43.98,
"eval_loss": 2.617112398147583,
"eval_runtime": 70.0674,
"eval_samples_per_second": 5.709,
"eval_steps_per_second": 0.714,
"step": 4977
},
{
"epoch": 3.01,
"learning_rate": 4.6986136226642556e-05,
"loss": 2.8043,
"step": 5000
},
{
"epoch": 3.32,
"learning_rate": 4.6684749849306815e-05,
"loss": 2.6486,
"step": 5500
},
{
"epoch": 3.62,
"learning_rate": 4.638336347197107e-05,
"loss": 2.6127,
"step": 6000
},
{
"epoch": 3.92,
"learning_rate": 4.6081977094635326e-05,
"loss": 2.5718,
"step": 6500
},
{
"epoch": 4.0,
"eval_bleu": 21.5273,
"eval_gen_len": 40.8575,
"eval_loss": 2.4335193634033203,
"eval_runtime": 62.0368,
"eval_samples_per_second": 6.448,
"eval_steps_per_second": 0.806,
"step": 6636
},
{
"epoch": 4.22,
"learning_rate": 4.5780590717299585e-05,
"loss": 2.4535,
"step": 7000
},
{
"epoch": 4.52,
"learning_rate": 4.547920433996384e-05,
"loss": 2.4269,
"step": 7500
},
{
"epoch": 4.82,
"learning_rate": 4.5177817962628096e-05,
"loss": 2.3852,
"step": 8000
},
{
"epoch": 5.0,
"eval_bleu": 24.0185,
"eval_gen_len": 38.945,
"eval_loss": 2.2908990383148193,
"eval_runtime": 53.5509,
"eval_samples_per_second": 7.47,
"eval_steps_per_second": 0.934,
"step": 8295
},
{
"epoch": 5.12,
"learning_rate": 4.487643158529235e-05,
"loss": 2.3305,
"step": 8500
},
{
"epoch": 5.42,
"learning_rate": 4.45750452079566e-05,
"loss": 2.2361,
"step": 9000
},
{
"epoch": 5.73,
"learning_rate": 4.427365883062086e-05,
"loss": 2.2201,
"step": 9500
},
{
"epoch": 6.0,
"eval_bleu": 25.0722,
"eval_gen_len": 38.4525,
"eval_loss": 2.2150681018829346,
"eval_runtime": 47.2306,
"eval_samples_per_second": 8.469,
"eval_steps_per_second": 1.059,
"step": 9954
},
{
"epoch": 6.03,
"learning_rate": 4.397227245328511e-05,
"loss": 2.1955,
"step": 10000
},
{
"epoch": 6.33,
"learning_rate": 4.367088607594937e-05,
"loss": 2.0928,
"step": 10500
},
{
"epoch": 6.63,
"learning_rate": 4.336949969861363e-05,
"loss": 2.0947,
"step": 11000
},
{
"epoch": 6.93,
"learning_rate": 4.306811332127788e-05,
"loss": 2.0583,
"step": 11500
},
{
"epoch": 7.0,
"eval_bleu": 26.051,
"eval_gen_len": 40.0775,
"eval_loss": 2.1219234466552734,
"eval_runtime": 55.6386,
"eval_samples_per_second": 7.189,
"eval_steps_per_second": 0.899,
"step": 11613
},
{
"epoch": 7.23,
"learning_rate": 4.276672694394214e-05,
"loss": 1.9657,
"step": 12000
},
{
"epoch": 7.53,
"learning_rate": 4.246534056660639e-05,
"loss": 1.9594,
"step": 12500
},
{
"epoch": 7.84,
"learning_rate": 4.216395418927065e-05,
"loss": 1.9464,
"step": 13000
},
{
"epoch": 8.0,
"eval_bleu": 27.8486,
"eval_gen_len": 39.54,
"eval_loss": 2.0415802001953125,
"eval_runtime": 50.0785,
"eval_samples_per_second": 7.987,
"eval_steps_per_second": 0.998,
"step": 13272
},
{
"epoch": 8.14,
"learning_rate": 4.186256781193491e-05,
"loss": 1.8901,
"step": 13500
},
{
"epoch": 8.44,
"learning_rate": 4.1561181434599153e-05,
"loss": 1.8331,
"step": 14000
},
{
"epoch": 8.74,
"learning_rate": 4.125979505726341e-05,
"loss": 1.8273,
"step": 14500
},
{
"epoch": 9.0,
"eval_bleu": 28.6882,
"eval_gen_len": 38.97,
"eval_loss": 1.9714975357055664,
"eval_runtime": 47.8353,
"eval_samples_per_second": 8.362,
"eval_steps_per_second": 1.045,
"step": 14931
},
{
"epoch": 9.04,
"learning_rate": 4.095840867992767e-05,
"loss": 1.8071,
"step": 15000
},
{
"epoch": 9.34,
"learning_rate": 4.065702230259192e-05,
"loss": 1.724,
"step": 15500
},
{
"epoch": 9.64,
"learning_rate": 4.035563592525618e-05,
"loss": 1.7173,
"step": 16000
},
{
"epoch": 9.95,
"learning_rate": 4.0054249547920434e-05,
"loss": 1.7341,
"step": 16500
},
{
"epoch": 10.0,
"eval_bleu": 29.4158,
"eval_gen_len": 39.27,
"eval_loss": 1.922670602798462,
"eval_runtime": 48.3901,
"eval_samples_per_second": 8.266,
"eval_steps_per_second": 1.033,
"step": 16590
},
{
"epoch": 10.25,
"learning_rate": 3.975286317058469e-05,
"loss": 1.6432,
"step": 17000
},
{
"epoch": 10.55,
"learning_rate": 3.945147679324895e-05,
"loss": 1.6414,
"step": 17500
},
{
"epoch": 10.85,
"learning_rate": 3.9150090415913203e-05,
"loss": 1.6285,
"step": 18000
},
{
"epoch": 11.0,
"eval_bleu": 29.6336,
"eval_gen_len": 39.7025,
"eval_loss": 1.8723887205123901,
"eval_runtime": 49.1746,
"eval_samples_per_second": 8.134,
"eval_steps_per_second": 1.017,
"step": 18249
},
{
"epoch": 11.15,
"learning_rate": 3.884870403857746e-05,
"loss": 1.5753,
"step": 18500
},
{
"epoch": 11.45,
"learning_rate": 3.8547317661241714e-05,
"loss": 1.5525,
"step": 19000
},
{
"epoch": 11.75,
"learning_rate": 3.8245931283905966e-05,
"loss": 1.5466,
"step": 19500
},
{
"epoch": 12.0,
"eval_bleu": 31.3296,
"eval_gen_len": 39.8675,
"eval_loss": 1.816349744796753,
"eval_runtime": 49.6256,
"eval_samples_per_second": 8.06,
"eval_steps_per_second": 1.008,
"step": 19908
},
{
"epoch": 12.06,
"learning_rate": 3.7944544906570225e-05,
"loss": 1.5254,
"step": 20000
},
{
"epoch": 12.36,
"learning_rate": 3.764315852923448e-05,
"loss": 1.4676,
"step": 20500
},
{
"epoch": 12.66,
"learning_rate": 3.7341772151898736e-05,
"loss": 1.4678,
"step": 21000
},
{
"epoch": 12.96,
"learning_rate": 3.7040385774562995e-05,
"loss": 1.4607,
"step": 21500
},
{
"epoch": 13.0,
"eval_bleu": 31.7515,
"eval_gen_len": 38.405,
"eval_loss": 1.7929939031600952,
"eval_runtime": 44.5172,
"eval_samples_per_second": 8.985,
"eval_steps_per_second": 1.123,
"step": 21567
},
{
"epoch": 13.26,
"learning_rate": 3.6738999397227247e-05,
"loss": 1.3787,
"step": 22000
},
{
"epoch": 13.56,
"learning_rate": 3.6437613019891505e-05,
"loss": 1.4049,
"step": 22500
},
{
"epoch": 13.86,
"learning_rate": 3.613622664255576e-05,
"loss": 1.385,
"step": 23000
},
{
"epoch": 14.0,
"eval_bleu": 32.458,
"eval_gen_len": 39.4675,
"eval_loss": 1.7518789768218994,
"eval_runtime": 49.1331,
"eval_samples_per_second": 8.141,
"eval_steps_per_second": 1.018,
"step": 23226
},
{
"epoch": 14.17,
"learning_rate": 3.5834840265220016e-05,
"loss": 1.3403,
"step": 23500
},
{
"epoch": 14.47,
"learning_rate": 3.553345388788427e-05,
"loss": 1.3166,
"step": 24000
},
{
"epoch": 14.77,
"learning_rate": 3.523206751054853e-05,
"loss": 1.321,
"step": 24500
},
{
"epoch": 15.0,
"eval_bleu": 32.9411,
"eval_gen_len": 38.8025,
"eval_loss": 1.7194263935089111,
"eval_runtime": 45.6686,
"eval_samples_per_second": 8.759,
"eval_steps_per_second": 1.095,
"step": 24885
},
{
"epoch": 15.07,
"learning_rate": 3.493068113321278e-05,
"loss": 1.2976,
"step": 25000
},
{
"epoch": 15.37,
"learning_rate": 3.462929475587703e-05,
"loss": 1.2358,
"step": 25500
},
{
"epoch": 15.67,
"learning_rate": 3.432790837854129e-05,
"loss": 1.2592,
"step": 26000
},
{
"epoch": 15.97,
"learning_rate": 3.402652200120555e-05,
"loss": 1.2662,
"step": 26500
},
{
"epoch": 16.0,
"eval_bleu": 33.8478,
"eval_gen_len": 39.1275,
"eval_loss": 1.6950603723526,
"eval_runtime": 49.9911,
"eval_samples_per_second": 8.001,
"eval_steps_per_second": 1.0,
"step": 26544
},
{
"epoch": 16.27,
"learning_rate": 3.37251356238698e-05,
"loss": 1.1963,
"step": 27000
},
{
"epoch": 16.58,
"learning_rate": 3.342374924653406e-05,
"loss": 1.2002,
"step": 27500
},
{
"epoch": 16.88,
"learning_rate": 3.312236286919831e-05,
"loss": 1.1939,
"step": 28000
},
{
"epoch": 17.0,
"eval_bleu": 34.5277,
"eval_gen_len": 39.0225,
"eval_loss": 1.685713529586792,
"eval_runtime": 49.4943,
"eval_samples_per_second": 8.082,
"eval_steps_per_second": 1.01,
"step": 28203
},
{
"epoch": 17.18,
"learning_rate": 3.282097649186257e-05,
"loss": 1.1459,
"step": 28500
},
{
"epoch": 17.48,
"learning_rate": 3.251959011452683e-05,
"loss": 1.1326,
"step": 29000
},
{
"epoch": 17.78,
"learning_rate": 3.221820373719108e-05,
"loss": 1.1406,
"step": 29500
},
{
"epoch": 18.0,
"eval_bleu": 35.8691,
"eval_gen_len": 38.76,
"eval_loss": 1.6470690965652466,
"eval_runtime": 45.2962,
"eval_samples_per_second": 8.831,
"eval_steps_per_second": 1.104,
"step": 29862
},
{
"epoch": 18.08,
"learning_rate": 3.191681735985534e-05,
"loss": 1.1292,
"step": 30000
},
{
"epoch": 18.38,
"learning_rate": 3.161543098251959e-05,
"loss": 1.071,
"step": 30500
},
{
"epoch": 18.69,
"learning_rate": 3.1314044605183844e-05,
"loss": 1.0918,
"step": 31000
},
{
"epoch": 18.99,
"learning_rate": 3.10126582278481e-05,
"loss": 1.0759,
"step": 31500
},
{
"epoch": 19.0,
"eval_bleu": 36.4448,
"eval_gen_len": 38.6925,
"eval_loss": 1.6456927061080933,
"eval_runtime": 46.4772,
"eval_samples_per_second": 8.606,
"eval_steps_per_second": 1.076,
"step": 31521
},
{
"epoch": 19.29,
"learning_rate": 3.0711271850512355e-05,
"loss": 1.0193,
"step": 32000
},
{
"epoch": 19.59,
"learning_rate": 3.0409885473176613e-05,
"loss": 1.0248,
"step": 32500
},
{
"epoch": 19.89,
"learning_rate": 3.010849909584087e-05,
"loss": 1.0378,
"step": 33000
},
{
"epoch": 20.0,
"eval_bleu": 37.2905,
"eval_gen_len": 38.945,
"eval_loss": 1.6285927295684814,
"eval_runtime": 49.741,
"eval_samples_per_second": 8.042,
"eval_steps_per_second": 1.005,
"step": 33180
},
{
"epoch": 20.19,
"learning_rate": 2.9807112718505124e-05,
"loss": 0.9915,
"step": 33500
},
{
"epoch": 20.49,
"learning_rate": 2.9505726341169383e-05,
"loss": 0.9848,
"step": 34000
},
{
"epoch": 20.8,
"learning_rate": 2.9204339963833638e-05,
"loss": 0.9851,
"step": 34500
},
{
"epoch": 21.0,
"eval_bleu": 38.4264,
"eval_gen_len": 38.7175,
"eval_loss": 1.5997543334960938,
"eval_runtime": 44.5032,
"eval_samples_per_second": 8.988,
"eval_steps_per_second": 1.124,
"step": 34839
},
{
"epoch": 21.1,
"learning_rate": 2.8902953586497894e-05,
"loss": 0.97,
"step": 35000
},
{
"epoch": 21.4,
"learning_rate": 2.8601567209162146e-05,
"loss": 0.9436,
"step": 35500
},
{
"epoch": 21.7,
"learning_rate": 2.83001808318264e-05,
"loss": 0.9372,
"step": 36000
},
{
"epoch": 22.0,
"eval_bleu": 37.9614,
"eval_gen_len": 38.9425,
"eval_loss": 1.607030987739563,
"eval_runtime": 47.0014,
"eval_samples_per_second": 8.51,
"eval_steps_per_second": 1.064,
"step": 36498
},
{
"epoch": 22.0,
"learning_rate": 2.7998794454490656e-05,
"loss": 0.9437,
"step": 36500
},
{
"epoch": 22.3,
"learning_rate": 2.7697408077154912e-05,
"loss": 0.8917,
"step": 37000
},
{
"epoch": 22.6,
"learning_rate": 2.7396021699819167e-05,
"loss": 0.8692,
"step": 37500
},
{
"epoch": 22.91,
"learning_rate": 2.7094635322483426e-05,
"loss": 0.9191,
"step": 38000
},
{
"epoch": 23.0,
"eval_bleu": 38.8655,
"eval_gen_len": 38.8825,
"eval_loss": 1.5746939182281494,
"eval_runtime": 50.4993,
"eval_samples_per_second": 7.921,
"eval_steps_per_second": 0.99,
"step": 38157
},
{
"epoch": 23.21,
"learning_rate": 2.679324894514768e-05,
"loss": 0.8555,
"step": 38500
},
{
"epoch": 23.51,
"learning_rate": 2.6491862567811937e-05,
"loss": 0.8533,
"step": 39000
},
{
"epoch": 23.81,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.8673,
"step": 39500
},
{
"epoch": 24.0,
"eval_bleu": 39.4605,
"eval_gen_len": 39.0175,
"eval_loss": 1.5650146007537842,
"eval_runtime": 50.8092,
"eval_samples_per_second": 7.873,
"eval_steps_per_second": 0.984,
"step": 39816
},
{
"epoch": 24.11,
"learning_rate": 2.5889089813140448e-05,
"loss": 0.841,
"step": 40000
},
{
"epoch": 24.41,
"learning_rate": 2.5587703435804706e-05,
"loss": 0.8155,
"step": 40500
},
{
"epoch": 24.71,
"learning_rate": 2.5286317058468955e-05,
"loss": 0.811,
"step": 41000
},
{
"epoch": 25.0,
"eval_bleu": 39.6804,
"eval_gen_len": 38.77,
"eval_loss": 1.5603779554367065,
"eval_runtime": 45.7389,
"eval_samples_per_second": 8.745,
"eval_steps_per_second": 1.093,
"step": 41475
},
{
"epoch": 25.02,
"learning_rate": 2.4984930681133214e-05,
"loss": 0.8335,
"step": 41500
},
{
"epoch": 25.32,
"learning_rate": 2.468354430379747e-05,
"loss": 0.7688,
"step": 42000
}
],
"logging_steps": 500,
"max_steps": 82950,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 1.0127259403812864e+16,
"trial_name": null,
"trial_params": null
}