|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 118.93805309734513, |
|
"eval_steps": 500, |
|
"global_step": 3360, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.35398230088495575, |
|
"grad_norm": 8.875021934509277, |
|
"learning_rate": 1.4880952380952381e-06, |
|
"loss": 2.2984, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.7079646017699115, |
|
"grad_norm": 10.44489860534668, |
|
"learning_rate": 2.9761904761904763e-06, |
|
"loss": 2.302, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9911504424778761, |
|
"eval_accuracy": 0.1575, |
|
"eval_loss": 2.266624689102173, |
|
"eval_runtime": 7.4116, |
|
"eval_samples_per_second": 53.97, |
|
"eval_steps_per_second": 1.754, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.0619469026548674, |
|
"grad_norm": 11.131507873535156, |
|
"learning_rate": 4.464285714285715e-06, |
|
"loss": 2.2827, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.415929203539823, |
|
"grad_norm": 11.846382141113281, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 2.2517, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.7699115044247788, |
|
"grad_norm": 8.047672271728516, |
|
"learning_rate": 7.4404761904761905e-06, |
|
"loss": 2.2226, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.9823008849557522, |
|
"eval_accuracy": 0.315, |
|
"eval_loss": 2.1654422283172607, |
|
"eval_runtime": 5.7675, |
|
"eval_samples_per_second": 69.354, |
|
"eval_steps_per_second": 2.254, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.1238938053097347, |
|
"grad_norm": 7.9003190994262695, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 2.1848, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.47787610619469, |
|
"grad_norm": 14.892489433288574, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 2.1238, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.831858407079646, |
|
"grad_norm": 13.308501243591309, |
|
"learning_rate": 1.1904761904761905e-05, |
|
"loss": 2.0639, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.9734513274336285, |
|
"eval_accuracy": 0.445, |
|
"eval_loss": 1.9969619512557983, |
|
"eval_runtime": 5.7209, |
|
"eval_samples_per_second": 69.919, |
|
"eval_steps_per_second": 2.272, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.185840707964602, |
|
"grad_norm": 16.818639755249023, |
|
"learning_rate": 1.3392857142857144e-05, |
|
"loss": 2.005, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.5398230088495577, |
|
"grad_norm": 13.497295379638672, |
|
"learning_rate": 1.4880952380952381e-05, |
|
"loss": 1.9232, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.893805309734513, |
|
"grad_norm": 16.994335174560547, |
|
"learning_rate": 1.636904761904762e-05, |
|
"loss": 1.8559, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.56, |
|
"eval_loss": 1.7373383045196533, |
|
"eval_runtime": 5.8519, |
|
"eval_samples_per_second": 68.354, |
|
"eval_steps_per_second": 2.222, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 4.247787610619469, |
|
"grad_norm": 25.03571891784668, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 1.7371, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.601769911504425, |
|
"grad_norm": 27.68413734436035, |
|
"learning_rate": 1.9345238095238097e-05, |
|
"loss": 1.6511, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 4.95575221238938, |
|
"grad_norm": 21.25657081604004, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 1.5966, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 4.991150442477876, |
|
"eval_accuracy": 0.605, |
|
"eval_loss": 1.48233962059021, |
|
"eval_runtime": 5.791, |
|
"eval_samples_per_second": 69.072, |
|
"eval_steps_per_second": 2.245, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 5.3097345132743365, |
|
"grad_norm": 33.357173919677734, |
|
"learning_rate": 2.2321428571428575e-05, |
|
"loss": 1.4767, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 5.663716814159292, |
|
"grad_norm": 21.856212615966797, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 1.3967, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 5.982300884955752, |
|
"eval_accuracy": 0.6125, |
|
"eval_loss": 1.2925149202346802, |
|
"eval_runtime": 5.8558, |
|
"eval_samples_per_second": 68.308, |
|
"eval_steps_per_second": 2.22, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 6.017699115044247, |
|
"grad_norm": 34.79808807373047, |
|
"learning_rate": 2.529761904761905e-05, |
|
"loss": 1.3046, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 6.371681415929204, |
|
"grad_norm": 27.307743072509766, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 1.2429, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 6.725663716814159, |
|
"grad_norm": 42.35990524291992, |
|
"learning_rate": 2.8273809523809523e-05, |
|
"loss": 1.204, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 6.9734513274336285, |
|
"eval_accuracy": 0.68, |
|
"eval_loss": 1.0512183904647827, |
|
"eval_runtime": 6.0412, |
|
"eval_samples_per_second": 66.212, |
|
"eval_steps_per_second": 2.152, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 7.079646017699115, |
|
"grad_norm": 35.177921295166016, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 1.1471, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 7.433628318584071, |
|
"grad_norm": 42.960391998291016, |
|
"learning_rate": 3.125e-05, |
|
"loss": 1.1087, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 7.787610619469026, |
|
"grad_norm": 28.81920623779297, |
|
"learning_rate": 3.273809523809524e-05, |
|
"loss": 1.0206, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7025, |
|
"eval_loss": 0.930656909942627, |
|
"eval_runtime": 5.9577, |
|
"eval_samples_per_second": 67.14, |
|
"eval_steps_per_second": 2.182, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 8.141592920353983, |
|
"grad_norm": 27.41872787475586, |
|
"learning_rate": 3.422619047619048e-05, |
|
"loss": 0.9949, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 8.495575221238939, |
|
"grad_norm": 30.292728424072266, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.9248, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 8.849557522123893, |
|
"grad_norm": 41.145442962646484, |
|
"learning_rate": 3.7202380952380956e-05, |
|
"loss": 0.9408, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 8.991150442477876, |
|
"eval_accuracy": 0.7425, |
|
"eval_loss": 0.8286006450653076, |
|
"eval_runtime": 6.1346, |
|
"eval_samples_per_second": 65.204, |
|
"eval_steps_per_second": 2.119, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 9.20353982300885, |
|
"grad_norm": 100.8248519897461, |
|
"learning_rate": 3.8690476190476195e-05, |
|
"loss": 0.9164, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 9.557522123893806, |
|
"grad_norm": 33.74626541137695, |
|
"learning_rate": 4.017857142857143e-05, |
|
"loss": 0.8392, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 9.91150442477876, |
|
"grad_norm": 23.371883392333984, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.8501, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 9.982300884955752, |
|
"eval_accuracy": 0.6975, |
|
"eval_loss": 0.8589820265769958, |
|
"eval_runtime": 5.7507, |
|
"eval_samples_per_second": 69.557, |
|
"eval_steps_per_second": 2.261, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 10.265486725663717, |
|
"grad_norm": 27.10701560974121, |
|
"learning_rate": 4.315476190476191e-05, |
|
"loss": 0.8698, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 10.619469026548673, |
|
"grad_norm": 31.57968521118164, |
|
"learning_rate": 4.464285714285715e-05, |
|
"loss": 0.7621, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 10.973451327433628, |
|
"grad_norm": 16.75609588623047, |
|
"learning_rate": 4.613095238095239e-05, |
|
"loss": 0.7545, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 10.973451327433628, |
|
"eval_accuracy": 0.7475, |
|
"eval_loss": 0.7702187895774841, |
|
"eval_runtime": 5.9752, |
|
"eval_samples_per_second": 66.944, |
|
"eval_steps_per_second": 2.176, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 11.327433628318584, |
|
"grad_norm": 38.2448616027832, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.7884, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 11.68141592920354, |
|
"grad_norm": 55.255489349365234, |
|
"learning_rate": 4.910714285714286e-05, |
|
"loss": 0.7484, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.765, |
|
"eval_loss": 0.7738745212554932, |
|
"eval_runtime": 5.7936, |
|
"eval_samples_per_second": 69.041, |
|
"eval_steps_per_second": 2.244, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 12.035398230088495, |
|
"grad_norm": 48.28477478027344, |
|
"learning_rate": 4.993386243386244e-05, |
|
"loss": 0.7274, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 12.389380530973451, |
|
"grad_norm": 29.47681999206543, |
|
"learning_rate": 4.976851851851852e-05, |
|
"loss": 0.6793, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 12.743362831858407, |
|
"grad_norm": 25.996002197265625, |
|
"learning_rate": 4.960317460317461e-05, |
|
"loss": 0.6909, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 12.991150442477876, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 0.7344464063644409, |
|
"eval_runtime": 5.7939, |
|
"eval_samples_per_second": 69.038, |
|
"eval_steps_per_second": 2.244, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 13.097345132743364, |
|
"grad_norm": 17.77817153930664, |
|
"learning_rate": 4.943783068783069e-05, |
|
"loss": 0.6684, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 13.451327433628318, |
|
"grad_norm": 42.287635803222656, |
|
"learning_rate": 4.927248677248678e-05, |
|
"loss": 0.6309, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 13.805309734513274, |
|
"grad_norm": 23.582502365112305, |
|
"learning_rate": 4.910714285714286e-05, |
|
"loss": 0.6558, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 13.982300884955752, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 0.6874340772628784, |
|
"eval_runtime": 5.9948, |
|
"eval_samples_per_second": 66.725, |
|
"eval_steps_per_second": 2.169, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 14.15929203539823, |
|
"grad_norm": 23.51905059814453, |
|
"learning_rate": 4.894179894179895e-05, |
|
"loss": 0.5937, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 14.513274336283185, |
|
"grad_norm": 23.072551727294922, |
|
"learning_rate": 4.8776455026455034e-05, |
|
"loss": 0.6672, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 14.867256637168142, |
|
"grad_norm": 20.066308975219727, |
|
"learning_rate": 4.8611111111111115e-05, |
|
"loss": 0.5923, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 14.973451327433628, |
|
"eval_accuracy": 0.7675, |
|
"eval_loss": 0.6640682816505432, |
|
"eval_runtime": 5.7834, |
|
"eval_samples_per_second": 69.163, |
|
"eval_steps_per_second": 2.248, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 15.221238938053098, |
|
"grad_norm": 82.714599609375, |
|
"learning_rate": 4.84457671957672e-05, |
|
"loss": 0.6061, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 15.575221238938052, |
|
"grad_norm": 23.3370361328125, |
|
"learning_rate": 4.8280423280423284e-05, |
|
"loss": 0.5595, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 15.929203539823009, |
|
"grad_norm": 39.12126159667969, |
|
"learning_rate": 4.811507936507937e-05, |
|
"loss": 0.5764, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.6109621524810791, |
|
"eval_runtime": 5.8219, |
|
"eval_samples_per_second": 68.706, |
|
"eval_steps_per_second": 2.233, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 16.283185840707965, |
|
"grad_norm": 18.160350799560547, |
|
"learning_rate": 4.794973544973545e-05, |
|
"loss": 0.564, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 16.63716814159292, |
|
"grad_norm": 22.667816162109375, |
|
"learning_rate": 4.778439153439154e-05, |
|
"loss": 0.5416, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 16.991150442477878, |
|
"grad_norm": 43.60499954223633, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.5235, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 16.991150442477878, |
|
"eval_accuracy": 0.76, |
|
"eval_loss": 0.680573582649231, |
|
"eval_runtime": 6.0278, |
|
"eval_samples_per_second": 66.359, |
|
"eval_steps_per_second": 2.157, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 17.345132743362832, |
|
"grad_norm": 17.05644416809082, |
|
"learning_rate": 4.745370370370371e-05, |
|
"loss": 0.5301, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 17.699115044247787, |
|
"grad_norm": 14.023980140686035, |
|
"learning_rate": 4.7288359788359796e-05, |
|
"loss": 0.4883, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 17.98230088495575, |
|
"eval_accuracy": 0.76, |
|
"eval_loss": 0.7902570962905884, |
|
"eval_runtime": 5.8085, |
|
"eval_samples_per_second": 68.865, |
|
"eval_steps_per_second": 2.238, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 18.053097345132745, |
|
"grad_norm": 14.874748229980469, |
|
"learning_rate": 4.7123015873015876e-05, |
|
"loss": 0.529, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 18.4070796460177, |
|
"grad_norm": 17.90399742126465, |
|
"learning_rate": 4.6957671957671964e-05, |
|
"loss": 0.455, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 18.761061946902654, |
|
"grad_norm": 17.8863468170166, |
|
"learning_rate": 4.6792328042328045e-05, |
|
"loss": 0.4682, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 18.97345132743363, |
|
"eval_accuracy": 0.7825, |
|
"eval_loss": 0.6469230055809021, |
|
"eval_runtime": 5.8117, |
|
"eval_samples_per_second": 68.826, |
|
"eval_steps_per_second": 2.237, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 19.115044247787612, |
|
"grad_norm": 25.429996490478516, |
|
"learning_rate": 4.662698412698413e-05, |
|
"loss": 0.4268, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 19.469026548672566, |
|
"grad_norm": 28.328603744506836, |
|
"learning_rate": 4.646164021164021e-05, |
|
"loss": 0.4233, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 19.82300884955752, |
|
"grad_norm": 18.160737991333008, |
|
"learning_rate": 4.62962962962963e-05, |
|
"loss": 0.441, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7825, |
|
"eval_loss": 0.6693841814994812, |
|
"eval_runtime": 5.8278, |
|
"eval_samples_per_second": 68.637, |
|
"eval_steps_per_second": 2.231, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 20.17699115044248, |
|
"grad_norm": 18.324892044067383, |
|
"learning_rate": 4.613095238095239e-05, |
|
"loss": 0.4421, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 20.530973451327434, |
|
"grad_norm": 27.71529769897461, |
|
"learning_rate": 4.596560846560847e-05, |
|
"loss": 0.3583, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 20.884955752212388, |
|
"grad_norm": 10.921418190002441, |
|
"learning_rate": 4.580026455026456e-05, |
|
"loss": 0.4201, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 20.991150442477878, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 0.7144609689712524, |
|
"eval_runtime": 5.725, |
|
"eval_samples_per_second": 69.869, |
|
"eval_steps_per_second": 2.271, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 21.238938053097346, |
|
"grad_norm": 27.364168167114258, |
|
"learning_rate": 4.563492063492064e-05, |
|
"loss": 0.3886, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 21.5929203539823, |
|
"grad_norm": 22.475797653198242, |
|
"learning_rate": 4.5469576719576725e-05, |
|
"loss": 0.366, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 21.946902654867255, |
|
"grad_norm": 14.525550842285156, |
|
"learning_rate": 4.5304232804232806e-05, |
|
"loss": 0.387, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 21.98230088495575, |
|
"eval_accuracy": 0.7775, |
|
"eval_loss": 0.6505405902862549, |
|
"eval_runtime": 6.0629, |
|
"eval_samples_per_second": 65.975, |
|
"eval_steps_per_second": 2.144, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 22.300884955752213, |
|
"grad_norm": 15.278279304504395, |
|
"learning_rate": 4.5138888888888894e-05, |
|
"loss": 0.3681, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 22.654867256637168, |
|
"grad_norm": 23.836380004882812, |
|
"learning_rate": 4.4973544973544974e-05, |
|
"loss": 0.4034, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 22.97345132743363, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 0.6168544292449951, |
|
"eval_runtime": 5.8701, |
|
"eval_samples_per_second": 68.143, |
|
"eval_steps_per_second": 2.215, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 23.008849557522122, |
|
"grad_norm": 15.120473861694336, |
|
"learning_rate": 4.480820105820106e-05, |
|
"loss": 0.3424, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 23.36283185840708, |
|
"grad_norm": 12.84585189819336, |
|
"learning_rate": 4.464285714285715e-05, |
|
"loss": 0.3669, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 23.716814159292035, |
|
"grad_norm": 24.2416934967041, |
|
"learning_rate": 4.447751322751323e-05, |
|
"loss": 0.3041, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.6416059136390686, |
|
"eval_runtime": 5.7657, |
|
"eval_samples_per_second": 69.375, |
|
"eval_steps_per_second": 2.255, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 24.07079646017699, |
|
"grad_norm": 17.557270050048828, |
|
"learning_rate": 4.431216931216932e-05, |
|
"loss": 0.3344, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 24.424778761061948, |
|
"grad_norm": 12.692173957824707, |
|
"learning_rate": 4.41468253968254e-05, |
|
"loss": 0.2928, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 24.778761061946902, |
|
"grad_norm": 15.073899269104004, |
|
"learning_rate": 4.3981481481481486e-05, |
|
"loss": 0.3021, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 24.991150442477878, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 0.6992344856262207, |
|
"eval_runtime": 5.8878, |
|
"eval_samples_per_second": 67.937, |
|
"eval_steps_per_second": 2.208, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 25.13274336283186, |
|
"grad_norm": 10.598650932312012, |
|
"learning_rate": 4.381613756613757e-05, |
|
"loss": 0.3147, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 25.486725663716815, |
|
"grad_norm": 17.572126388549805, |
|
"learning_rate": 4.3650793650793655e-05, |
|
"loss": 0.302, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 25.84070796460177, |
|
"grad_norm": 30.176406860351562, |
|
"learning_rate": 4.3485449735449736e-05, |
|
"loss": 0.2853, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 25.98230088495575, |
|
"eval_accuracy": 0.7975, |
|
"eval_loss": 0.6566324830055237, |
|
"eval_runtime": 5.7476, |
|
"eval_samples_per_second": 69.594, |
|
"eval_steps_per_second": 2.262, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 26.194690265486727, |
|
"grad_norm": 18.816850662231445, |
|
"learning_rate": 4.332010582010582e-05, |
|
"loss": 0.297, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 26.548672566371682, |
|
"grad_norm": 14.217058181762695, |
|
"learning_rate": 4.315476190476191e-05, |
|
"loss": 0.2849, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 26.902654867256636, |
|
"grad_norm": 17.27300453186035, |
|
"learning_rate": 4.298941798941799e-05, |
|
"loss": 0.27, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 26.97345132743363, |
|
"eval_accuracy": 0.7825, |
|
"eval_loss": 0.696976900100708, |
|
"eval_runtime": 6.0434, |
|
"eval_samples_per_second": 66.188, |
|
"eval_steps_per_second": 2.151, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 27.256637168141594, |
|
"grad_norm": 10.05718994140625, |
|
"learning_rate": 4.282407407407408e-05, |
|
"loss": 0.2331, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 27.61061946902655, |
|
"grad_norm": 12.319819450378418, |
|
"learning_rate": 4.265873015873016e-05, |
|
"loss": 0.2462, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 27.964601769911503, |
|
"grad_norm": 14.027650833129883, |
|
"learning_rate": 4.249338624338625e-05, |
|
"loss": 0.2722, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.785, |
|
"eval_loss": 0.6862995624542236, |
|
"eval_runtime": 5.7837, |
|
"eval_samples_per_second": 69.16, |
|
"eval_steps_per_second": 2.248, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 28.31858407079646, |
|
"grad_norm": 11.350334167480469, |
|
"learning_rate": 4.232804232804233e-05, |
|
"loss": 0.2366, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 28.672566371681416, |
|
"grad_norm": 8.22404670715332, |
|
"learning_rate": 4.2162698412698416e-05, |
|
"loss": 0.2143, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 28.991150442477878, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.679432213306427, |
|
"eval_runtime": 5.7657, |
|
"eval_samples_per_second": 69.376, |
|
"eval_steps_per_second": 2.255, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 29.02654867256637, |
|
"grad_norm": 11.07326889038086, |
|
"learning_rate": 4.1997354497354504e-05, |
|
"loss": 0.2443, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 29.38053097345133, |
|
"grad_norm": 28.747331619262695, |
|
"learning_rate": 4.1832010582010584e-05, |
|
"loss": 0.2424, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 29.734513274336283, |
|
"grad_norm": 12.262460708618164, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 0.2238, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 29.98230088495575, |
|
"eval_accuracy": 0.7975, |
|
"eval_loss": 0.6781703233718872, |
|
"eval_runtime": 5.7261, |
|
"eval_samples_per_second": 69.856, |
|
"eval_steps_per_second": 2.27, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 30.088495575221238, |
|
"grad_norm": 12.248478889465332, |
|
"learning_rate": 4.150132275132275e-05, |
|
"loss": 0.2414, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 30.442477876106196, |
|
"grad_norm": 26.66759490966797, |
|
"learning_rate": 4.133597883597884e-05, |
|
"loss": 0.222, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 30.79646017699115, |
|
"grad_norm": 17.73442268371582, |
|
"learning_rate": 4.117063492063492e-05, |
|
"loss": 0.2387, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 30.97345132743363, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.6944553256034851, |
|
"eval_runtime": 5.9266, |
|
"eval_samples_per_second": 67.492, |
|
"eval_steps_per_second": 2.193, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 31.150442477876105, |
|
"grad_norm": 10.954143524169922, |
|
"learning_rate": 4.100529100529101e-05, |
|
"loss": 0.1897, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 31.504424778761063, |
|
"grad_norm": 11.53641414642334, |
|
"learning_rate": 4.083994708994709e-05, |
|
"loss": 0.215, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 31.858407079646017, |
|
"grad_norm": 19.236658096313477, |
|
"learning_rate": 4.067460317460318e-05, |
|
"loss": 0.223, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7825, |
|
"eval_loss": 0.7377186417579651, |
|
"eval_runtime": 5.6881, |
|
"eval_samples_per_second": 70.322, |
|
"eval_steps_per_second": 2.285, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 32.21238938053097, |
|
"grad_norm": 18.005434036254883, |
|
"learning_rate": 4.0509259259259265e-05, |
|
"loss": 0.1703, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 32.56637168141593, |
|
"grad_norm": 10.189879417419434, |
|
"learning_rate": 4.0343915343915346e-05, |
|
"loss": 0.1882, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 32.92035398230089, |
|
"grad_norm": 23.737546920776367, |
|
"learning_rate": 4.017857142857143e-05, |
|
"loss": 0.2211, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 32.991150442477874, |
|
"eval_accuracy": 0.7775, |
|
"eval_loss": 0.7430591583251953, |
|
"eval_runtime": 5.8045, |
|
"eval_samples_per_second": 68.911, |
|
"eval_steps_per_second": 2.24, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 33.27433628318584, |
|
"grad_norm": 12.777314186096191, |
|
"learning_rate": 4.0013227513227514e-05, |
|
"loss": 0.2187, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 33.6283185840708, |
|
"grad_norm": 11.4727783203125, |
|
"learning_rate": 3.98478835978836e-05, |
|
"loss": 0.1952, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 33.982300884955755, |
|
"grad_norm": 9.033596992492676, |
|
"learning_rate": 3.968253968253968e-05, |
|
"loss": 0.1882, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 33.982300884955755, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.702938437461853, |
|
"eval_runtime": 5.7972, |
|
"eval_samples_per_second": 68.998, |
|
"eval_steps_per_second": 2.242, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 34.336283185840706, |
|
"grad_norm": 10.962681770324707, |
|
"learning_rate": 3.951719576719577e-05, |
|
"loss": 0.1788, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 34.690265486725664, |
|
"grad_norm": 9.644322395324707, |
|
"learning_rate": 3.935185185185186e-05, |
|
"loss": 0.1562, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 34.97345132743363, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.6886518597602844, |
|
"eval_runtime": 5.7736, |
|
"eval_samples_per_second": 69.281, |
|
"eval_steps_per_second": 2.252, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 35.04424778761062, |
|
"grad_norm": 7.208992004394531, |
|
"learning_rate": 3.918650793650794e-05, |
|
"loss": 0.1779, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 35.39823008849557, |
|
"grad_norm": 15.789375305175781, |
|
"learning_rate": 3.9021164021164026e-05, |
|
"loss": 0.1652, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 35.75221238938053, |
|
"grad_norm": 9.542279243469238, |
|
"learning_rate": 3.885582010582011e-05, |
|
"loss": 0.1689, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7975, |
|
"eval_loss": 0.7189816236495972, |
|
"eval_runtime": 5.7578, |
|
"eval_samples_per_second": 69.47, |
|
"eval_steps_per_second": 2.258, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 36.10619469026549, |
|
"grad_norm": 12.757901191711426, |
|
"learning_rate": 3.8690476190476195e-05, |
|
"loss": 0.178, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 36.46017699115044, |
|
"grad_norm": 8.646611213684082, |
|
"learning_rate": 3.8525132275132275e-05, |
|
"loss": 0.1613, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 36.8141592920354, |
|
"grad_norm": 12.245649337768555, |
|
"learning_rate": 3.835978835978836e-05, |
|
"loss": 0.1886, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 36.991150442477874, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.7677862048149109, |
|
"eval_runtime": 5.9107, |
|
"eval_samples_per_second": 67.674, |
|
"eval_steps_per_second": 2.199, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 37.16814159292036, |
|
"grad_norm": 16.76270866394043, |
|
"learning_rate": 3.8194444444444444e-05, |
|
"loss": 0.1777, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 37.52212389380531, |
|
"grad_norm": 12.296770095825195, |
|
"learning_rate": 3.802910052910053e-05, |
|
"loss": 0.182, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 37.876106194690266, |
|
"grad_norm": 19.789642333984375, |
|
"learning_rate": 3.786375661375662e-05, |
|
"loss": 0.1887, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 37.982300884955755, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.7334153652191162, |
|
"eval_runtime": 5.7675, |
|
"eval_samples_per_second": 69.354, |
|
"eval_steps_per_second": 2.254, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 38.230088495575224, |
|
"grad_norm": 9.019023895263672, |
|
"learning_rate": 3.76984126984127e-05, |
|
"loss": 0.1377, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 38.584070796460175, |
|
"grad_norm": 8.896963119506836, |
|
"learning_rate": 3.753306878306879e-05, |
|
"loss": 0.1751, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 38.93805309734513, |
|
"grad_norm": 8.166861534118652, |
|
"learning_rate": 3.736772486772487e-05, |
|
"loss": 0.1531, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 38.97345132743363, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.7359188199043274, |
|
"eval_runtime": 5.8237, |
|
"eval_samples_per_second": 68.685, |
|
"eval_steps_per_second": 2.232, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 39.29203539823009, |
|
"grad_norm": 11.483717918395996, |
|
"learning_rate": 3.7202380952380956e-05, |
|
"loss": 0.1703, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 39.64601769911504, |
|
"grad_norm": 7.077027797698975, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.1376, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 10.199649810791016, |
|
"learning_rate": 3.6871693121693124e-05, |
|
"loss": 0.1662, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.7593528032302856, |
|
"eval_runtime": 5.7627, |
|
"eval_samples_per_second": 69.411, |
|
"eval_steps_per_second": 2.256, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 40.35398230088496, |
|
"grad_norm": 12.18074893951416, |
|
"learning_rate": 3.6706349206349205e-05, |
|
"loss": 0.1502, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 40.70796460176991, |
|
"grad_norm": 13.122712135314941, |
|
"learning_rate": 3.654100529100529e-05, |
|
"loss": 0.1273, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 40.991150442477874, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.7341694831848145, |
|
"eval_runtime": 5.7253, |
|
"eval_samples_per_second": 69.866, |
|
"eval_steps_per_second": 2.271, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 41.06194690265487, |
|
"grad_norm": 12.024120330810547, |
|
"learning_rate": 3.637566137566138e-05, |
|
"loss": 0.1334, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 41.415929203539825, |
|
"grad_norm": 13.965011596679688, |
|
"learning_rate": 3.621031746031746e-05, |
|
"loss": 0.1394, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 41.769911504424776, |
|
"grad_norm": 17.121511459350586, |
|
"learning_rate": 3.604497354497355e-05, |
|
"loss": 0.1986, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 41.982300884955755, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.7780522108078003, |
|
"eval_runtime": 5.9228, |
|
"eval_samples_per_second": 67.536, |
|
"eval_steps_per_second": 2.195, |
|
"step": 1186 |
|
}, |
|
{ |
|
"epoch": 42.123893805309734, |
|
"grad_norm": 11.281728744506836, |
|
"learning_rate": 3.587962962962963e-05, |
|
"loss": 0.1563, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 42.47787610619469, |
|
"grad_norm": 6.99351167678833, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.1541, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 42.83185840707964, |
|
"grad_norm": 12.189485549926758, |
|
"learning_rate": 3.55489417989418e-05, |
|
"loss": 0.1891, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 42.97345132743363, |
|
"eval_accuracy": 0.8225, |
|
"eval_loss": 0.7375677227973938, |
|
"eval_runtime": 5.736, |
|
"eval_samples_per_second": 69.735, |
|
"eval_steps_per_second": 2.266, |
|
"step": 1214 |
|
}, |
|
{ |
|
"epoch": 43.1858407079646, |
|
"grad_norm": 7.004004955291748, |
|
"learning_rate": 3.5383597883597885e-05, |
|
"loss": 0.1332, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 43.53982300884956, |
|
"grad_norm": 15.816964149475098, |
|
"learning_rate": 3.521825396825397e-05, |
|
"loss": 0.1231, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 43.89380530973451, |
|
"grad_norm": 8.720746040344238, |
|
"learning_rate": 3.5052910052910054e-05, |
|
"loss": 0.1573, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.730354905128479, |
|
"eval_runtime": 5.7537, |
|
"eval_samples_per_second": 69.52, |
|
"eval_steps_per_second": 2.259, |
|
"step": 1243 |
|
}, |
|
{ |
|
"epoch": 44.24778761061947, |
|
"grad_norm": 11.798288345336914, |
|
"learning_rate": 3.488756613756614e-05, |
|
"loss": 0.1337, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 44.60176991150443, |
|
"grad_norm": 8.698972702026367, |
|
"learning_rate": 3.472222222222222e-05, |
|
"loss": 0.1536, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 44.95575221238938, |
|
"grad_norm": 14.468975067138672, |
|
"learning_rate": 3.455687830687831e-05, |
|
"loss": 0.1536, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 44.991150442477874, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.777264416217804, |
|
"eval_runtime": 5.7994, |
|
"eval_samples_per_second": 68.972, |
|
"eval_steps_per_second": 2.242, |
|
"step": 1271 |
|
}, |
|
{ |
|
"epoch": 45.309734513274336, |
|
"grad_norm": 5.724658966064453, |
|
"learning_rate": 3.439153439153439e-05, |
|
"loss": 0.1362, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 45.663716814159294, |
|
"grad_norm": 15.177201271057129, |
|
"learning_rate": 3.422619047619048e-05, |
|
"loss": 0.1562, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 45.982300884955755, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.7622714042663574, |
|
"eval_runtime": 5.6562, |
|
"eval_samples_per_second": 70.719, |
|
"eval_steps_per_second": 2.298, |
|
"step": 1299 |
|
}, |
|
{ |
|
"epoch": 46.017699115044245, |
|
"grad_norm": 14.262038230895996, |
|
"learning_rate": 3.406084656084656e-05, |
|
"loss": 0.1609, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 46.3716814159292, |
|
"grad_norm": 10.63355541229248, |
|
"learning_rate": 3.3895502645502647e-05, |
|
"loss": 0.1297, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 46.72566371681416, |
|
"grad_norm": 11.54215145111084, |
|
"learning_rate": 3.3730158730158734e-05, |
|
"loss": 0.1264, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 46.97345132743363, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.8314040899276733, |
|
"eval_runtime": 5.9374, |
|
"eval_samples_per_second": 67.369, |
|
"eval_steps_per_second": 2.189, |
|
"step": 1327 |
|
}, |
|
{ |
|
"epoch": 47.07964601769911, |
|
"grad_norm": 7.798260688781738, |
|
"learning_rate": 3.3564814814814815e-05, |
|
"loss": 0.1552, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 47.43362831858407, |
|
"grad_norm": 10.773923873901367, |
|
"learning_rate": 3.33994708994709e-05, |
|
"loss": 0.1188, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 47.78761061946903, |
|
"grad_norm": 5.990432262420654, |
|
"learning_rate": 3.3234126984126983e-05, |
|
"loss": 0.1596, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.8175, |
|
"eval_loss": 0.7830512523651123, |
|
"eval_runtime": 5.8001, |
|
"eval_samples_per_second": 68.964, |
|
"eval_steps_per_second": 2.241, |
|
"step": 1356 |
|
}, |
|
{ |
|
"epoch": 48.14159292035398, |
|
"grad_norm": 11.099681854248047, |
|
"learning_rate": 3.306878306878307e-05, |
|
"loss": 0.1342, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 48.49557522123894, |
|
"grad_norm": 7.706020355224609, |
|
"learning_rate": 3.290343915343915e-05, |
|
"loss": 0.1382, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 48.849557522123895, |
|
"grad_norm": 5.685822010040283, |
|
"learning_rate": 3.273809523809524e-05, |
|
"loss": 0.1237, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 48.991150442477874, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.7949317097663879, |
|
"eval_runtime": 5.6938, |
|
"eval_samples_per_second": 70.252, |
|
"eval_steps_per_second": 2.283, |
|
"step": 1384 |
|
}, |
|
{ |
|
"epoch": 49.203539823008846, |
|
"grad_norm": 10.283050537109375, |
|
"learning_rate": 3.257275132275133e-05, |
|
"loss": 0.1161, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 49.557522123893804, |
|
"grad_norm": 6.40301513671875, |
|
"learning_rate": 3.240740740740741e-05, |
|
"loss": 0.1363, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 49.91150442477876, |
|
"grad_norm": 17.212697982788086, |
|
"learning_rate": 3.2242063492063495e-05, |
|
"loss": 0.1355, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 49.982300884955755, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.781341552734375, |
|
"eval_runtime": 5.872, |
|
"eval_samples_per_second": 68.12, |
|
"eval_steps_per_second": 2.214, |
|
"step": 1412 |
|
}, |
|
{ |
|
"epoch": 50.26548672566372, |
|
"grad_norm": 4.185158729553223, |
|
"learning_rate": 3.2076719576719576e-05, |
|
"loss": 0.1294, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 50.61946902654867, |
|
"grad_norm": 10.888021469116211, |
|
"learning_rate": 3.1911375661375664e-05, |
|
"loss": 0.163, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 50.97345132743363, |
|
"grad_norm": 8.630338668823242, |
|
"learning_rate": 3.1746031746031745e-05, |
|
"loss": 0.1251, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 50.97345132743363, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.7647480964660645, |
|
"eval_runtime": 5.8333, |
|
"eval_samples_per_second": 68.571, |
|
"eval_steps_per_second": 2.229, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 51.32743362831859, |
|
"grad_norm": 19.542953491210938, |
|
"learning_rate": 3.158068783068783e-05, |
|
"loss": 0.1289, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 51.68141592920354, |
|
"grad_norm": 14.15263843536377, |
|
"learning_rate": 3.141534391534391e-05, |
|
"loss": 0.1181, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.8175, |
|
"eval_loss": 0.7552239894866943, |
|
"eval_runtime": 6.0471, |
|
"eval_samples_per_second": 66.148, |
|
"eval_steps_per_second": 2.15, |
|
"step": 1469 |
|
}, |
|
{ |
|
"epoch": 52.0353982300885, |
|
"grad_norm": 8.696867942810059, |
|
"learning_rate": 3.125e-05, |
|
"loss": 0.1232, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 52.389380530973455, |
|
"grad_norm": 15.115714073181152, |
|
"learning_rate": 3.108465608465609e-05, |
|
"loss": 0.128, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 52.743362831858406, |
|
"grad_norm": 26.04839515686035, |
|
"learning_rate": 3.091931216931217e-05, |
|
"loss": 0.1224, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 52.991150442477874, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.8345909118652344, |
|
"eval_runtime": 5.7656, |
|
"eval_samples_per_second": 69.377, |
|
"eval_steps_per_second": 2.255, |
|
"step": 1497 |
|
}, |
|
{ |
|
"epoch": 53.097345132743364, |
|
"grad_norm": 6.820947647094727, |
|
"learning_rate": 3.075396825396826e-05, |
|
"loss": 0.1088, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 53.45132743362832, |
|
"grad_norm": 10.287090301513672, |
|
"learning_rate": 3.058862433862434e-05, |
|
"loss": 0.1094, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 53.80530973451327, |
|
"grad_norm": 12.619726181030273, |
|
"learning_rate": 3.0423280423280425e-05, |
|
"loss": 0.1201, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 53.982300884955755, |
|
"eval_accuracy": 0.7975, |
|
"eval_loss": 0.7740535140037537, |
|
"eval_runtime": 5.8431, |
|
"eval_samples_per_second": 68.457, |
|
"eval_steps_per_second": 2.225, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 54.15929203539823, |
|
"grad_norm": 3.6249613761901855, |
|
"learning_rate": 3.0257936507936506e-05, |
|
"loss": 0.1278, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 54.51327433628319, |
|
"grad_norm": 8.490986824035645, |
|
"learning_rate": 3.0092592592592593e-05, |
|
"loss": 0.1359, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 54.86725663716814, |
|
"grad_norm": 8.873104095458984, |
|
"learning_rate": 2.9927248677248678e-05, |
|
"loss": 0.1109, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 54.97345132743363, |
|
"eval_accuracy": 0.785, |
|
"eval_loss": 0.772351861000061, |
|
"eval_runtime": 5.8419, |
|
"eval_samples_per_second": 68.47, |
|
"eval_steps_per_second": 2.225, |
|
"step": 1553 |
|
}, |
|
{ |
|
"epoch": 55.2212389380531, |
|
"grad_norm": 3.0876898765563965, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 0.1108, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 55.575221238938056, |
|
"grad_norm": 6.528329372406006, |
|
"learning_rate": 2.959656084656085e-05, |
|
"loss": 0.1236, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 55.92920353982301, |
|
"grad_norm": 11.930560111999512, |
|
"learning_rate": 2.943121693121693e-05, |
|
"loss": 0.1084, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.7904237508773804, |
|
"eval_runtime": 5.7739, |
|
"eval_samples_per_second": 69.277, |
|
"eval_steps_per_second": 2.252, |
|
"step": 1582 |
|
}, |
|
{ |
|
"epoch": 56.283185840707965, |
|
"grad_norm": 6.980546951293945, |
|
"learning_rate": 2.9265873015873018e-05, |
|
"loss": 0.1144, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 56.63716814159292, |
|
"grad_norm": 11.986266136169434, |
|
"learning_rate": 2.91005291005291e-05, |
|
"loss": 0.1084, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 56.991150442477874, |
|
"grad_norm": 6.527588367462158, |
|
"learning_rate": 2.8935185185185186e-05, |
|
"loss": 0.1187, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 56.991150442477874, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.7423826456069946, |
|
"eval_runtime": 5.9051, |
|
"eval_samples_per_second": 67.738, |
|
"eval_steps_per_second": 2.201, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 57.34513274336283, |
|
"grad_norm": 6.542221546173096, |
|
"learning_rate": 2.876984126984127e-05, |
|
"loss": 0.1191, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 57.69911504424779, |
|
"grad_norm": 6.406174659729004, |
|
"learning_rate": 2.8604497354497355e-05, |
|
"loss": 0.0935, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 57.982300884955755, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.7410502433776855, |
|
"eval_runtime": 6.0733, |
|
"eval_samples_per_second": 65.862, |
|
"eval_steps_per_second": 2.141, |
|
"step": 1638 |
|
}, |
|
{ |
|
"epoch": 58.05309734513274, |
|
"grad_norm": 5.907907485961914, |
|
"learning_rate": 2.8439153439153442e-05, |
|
"loss": 0.0776, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 58.4070796460177, |
|
"grad_norm": 7.151067733764648, |
|
"learning_rate": 2.8273809523809523e-05, |
|
"loss": 0.1022, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 58.76106194690266, |
|
"grad_norm": 6.808224201202393, |
|
"learning_rate": 2.810846560846561e-05, |
|
"loss": 0.1023, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 58.97345132743363, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.7475782632827759, |
|
"eval_runtime": 5.7079, |
|
"eval_samples_per_second": 70.078, |
|
"eval_steps_per_second": 2.278, |
|
"step": 1666 |
|
}, |
|
{ |
|
"epoch": 59.11504424778761, |
|
"grad_norm": 7.42250919342041, |
|
"learning_rate": 2.7943121693121695e-05, |
|
"loss": 0.1206, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 59.469026548672566, |
|
"grad_norm": 11.038393020629883, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 0.1148, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 59.823008849557525, |
|
"grad_norm": 6.563925266265869, |
|
"learning_rate": 2.7612433862433863e-05, |
|
"loss": 0.1166, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.8175, |
|
"eval_loss": 0.7742089033126831, |
|
"eval_runtime": 5.6896, |
|
"eval_samples_per_second": 70.304, |
|
"eval_steps_per_second": 2.285, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 60.176991150442475, |
|
"grad_norm": 8.62690544128418, |
|
"learning_rate": 2.7447089947089948e-05, |
|
"loss": 0.1133, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 60.530973451327434, |
|
"grad_norm": 8.701354026794434, |
|
"learning_rate": 2.7281746031746032e-05, |
|
"loss": 0.0961, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 60.88495575221239, |
|
"grad_norm": 5.880581855773926, |
|
"learning_rate": 2.7116402116402116e-05, |
|
"loss": 0.099, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 60.991150442477874, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.7696540951728821, |
|
"eval_runtime": 5.8242, |
|
"eval_samples_per_second": 68.679, |
|
"eval_steps_per_second": 2.232, |
|
"step": 1723 |
|
}, |
|
{ |
|
"epoch": 61.23893805309734, |
|
"grad_norm": 6.893290996551514, |
|
"learning_rate": 2.6951058201058204e-05, |
|
"loss": 0.0968, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 61.5929203539823, |
|
"grad_norm": 9.383642196655273, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 0.0927, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 61.94690265486726, |
|
"grad_norm": 10.903635025024414, |
|
"learning_rate": 2.6620370370370372e-05, |
|
"loss": 0.1157, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 61.982300884955755, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.8537997603416443, |
|
"eval_runtime": 5.862, |
|
"eval_samples_per_second": 68.236, |
|
"eval_steps_per_second": 2.218, |
|
"step": 1751 |
|
}, |
|
{ |
|
"epoch": 62.30088495575221, |
|
"grad_norm": 11.495246887207031, |
|
"learning_rate": 2.6455026455026456e-05, |
|
"loss": 0.1071, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 62.65486725663717, |
|
"grad_norm": 8.735861778259277, |
|
"learning_rate": 2.628968253968254e-05, |
|
"loss": 0.1137, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 62.97345132743363, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.8544909954071045, |
|
"eval_runtime": 5.719, |
|
"eval_samples_per_second": 69.942, |
|
"eval_steps_per_second": 2.273, |
|
"step": 1779 |
|
}, |
|
{ |
|
"epoch": 63.008849557522126, |
|
"grad_norm": 9.511134147644043, |
|
"learning_rate": 2.6124338624338625e-05, |
|
"loss": 0.0962, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 63.36283185840708, |
|
"grad_norm": 9.274235725402832, |
|
"learning_rate": 2.5958994708994712e-05, |
|
"loss": 0.09, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 63.716814159292035, |
|
"grad_norm": 17.82819175720215, |
|
"learning_rate": 2.5793650793650796e-05, |
|
"loss": 0.094, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.8463137745857239, |
|
"eval_runtime": 5.7366, |
|
"eval_samples_per_second": 69.727, |
|
"eval_steps_per_second": 2.266, |
|
"step": 1808 |
|
}, |
|
{ |
|
"epoch": 64.070796460177, |
|
"grad_norm": 13.545799255371094, |
|
"learning_rate": 2.562830687830688e-05, |
|
"loss": 0.1186, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 64.42477876106194, |
|
"grad_norm": 8.884902954101562, |
|
"learning_rate": 2.5462962962962965e-05, |
|
"loss": 0.1027, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 64.77876106194691, |
|
"grad_norm": 9.229199409484863, |
|
"learning_rate": 2.529761904761905e-05, |
|
"loss": 0.1161, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 64.99115044247787, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8350917100906372, |
|
"eval_runtime": 5.7053, |
|
"eval_samples_per_second": 70.11, |
|
"eval_steps_per_second": 2.279, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 65.13274336283186, |
|
"grad_norm": 4.687648296356201, |
|
"learning_rate": 2.5132275132275137e-05, |
|
"loss": 0.0954, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 65.48672566371681, |
|
"grad_norm": 8.967106819152832, |
|
"learning_rate": 2.496693121693122e-05, |
|
"loss": 0.1102, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 65.84070796460178, |
|
"grad_norm": 7.873806953430176, |
|
"learning_rate": 2.4801587301587305e-05, |
|
"loss": 0.08, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 65.98230088495575, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.8610497117042542, |
|
"eval_runtime": 5.7201, |
|
"eval_samples_per_second": 69.929, |
|
"eval_steps_per_second": 2.273, |
|
"step": 1864 |
|
}, |
|
{ |
|
"epoch": 66.19469026548673, |
|
"grad_norm": 7.836842060089111, |
|
"learning_rate": 2.463624338624339e-05, |
|
"loss": 0.1068, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 66.54867256637168, |
|
"grad_norm": 8.14975357055664, |
|
"learning_rate": 2.4470899470899473e-05, |
|
"loss": 0.0787, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 66.90265486725664, |
|
"grad_norm": 5.568120956420898, |
|
"learning_rate": 2.4305555555555558e-05, |
|
"loss": 0.0799, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 66.97345132743362, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.8592771887779236, |
|
"eval_runtime": 5.7815, |
|
"eval_samples_per_second": 69.186, |
|
"eval_steps_per_second": 2.249, |
|
"step": 1892 |
|
}, |
|
{ |
|
"epoch": 67.2566371681416, |
|
"grad_norm": 5.271720886230469, |
|
"learning_rate": 2.4140211640211642e-05, |
|
"loss": 0.0781, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 67.61061946902655, |
|
"grad_norm": 3.000176429748535, |
|
"learning_rate": 2.3974867724867726e-05, |
|
"loss": 0.0927, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 67.96460176991151, |
|
"grad_norm": 1.825054407119751, |
|
"learning_rate": 2.380952380952381e-05, |
|
"loss": 0.0783, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.8423022627830505, |
|
"eval_runtime": 5.6528, |
|
"eval_samples_per_second": 70.762, |
|
"eval_steps_per_second": 2.3, |
|
"step": 1921 |
|
}, |
|
{ |
|
"epoch": 68.31858407079646, |
|
"grad_norm": 9.14376449584961, |
|
"learning_rate": 2.3644179894179898e-05, |
|
"loss": 0.1037, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 68.67256637168141, |
|
"grad_norm": 11.533547401428223, |
|
"learning_rate": 2.3478835978835982e-05, |
|
"loss": 0.0851, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 68.99115044247787, |
|
"eval_accuracy": 0.82, |
|
"eval_loss": 0.8265037536621094, |
|
"eval_runtime": 5.9247, |
|
"eval_samples_per_second": 67.514, |
|
"eval_steps_per_second": 2.194, |
|
"step": 1949 |
|
}, |
|
{ |
|
"epoch": 69.02654867256638, |
|
"grad_norm": 8.8108549118042, |
|
"learning_rate": 2.3313492063492066e-05, |
|
"loss": 0.1005, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 69.38053097345133, |
|
"grad_norm": 7.721718788146973, |
|
"learning_rate": 2.314814814814815e-05, |
|
"loss": 0.0839, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 69.73451327433628, |
|
"grad_norm": 9.032380104064941, |
|
"learning_rate": 2.2982804232804235e-05, |
|
"loss": 0.0775, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 69.98230088495575, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.8707697987556458, |
|
"eval_runtime": 5.6882, |
|
"eval_samples_per_second": 70.321, |
|
"eval_steps_per_second": 2.285, |
|
"step": 1977 |
|
}, |
|
{ |
|
"epoch": 70.08849557522124, |
|
"grad_norm": 7.697175979614258, |
|
"learning_rate": 2.281746031746032e-05, |
|
"loss": 0.1005, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 70.4424778761062, |
|
"grad_norm": 11.258265495300293, |
|
"learning_rate": 2.2652116402116403e-05, |
|
"loss": 0.1152, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 70.79646017699115, |
|
"grad_norm": 6.715971946716309, |
|
"learning_rate": 2.2486772486772487e-05, |
|
"loss": 0.0902, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 70.97345132743362, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8180540204048157, |
|
"eval_runtime": 5.7974, |
|
"eval_samples_per_second": 68.996, |
|
"eval_steps_per_second": 2.242, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 71.15044247787611, |
|
"grad_norm": 8.447669982910156, |
|
"learning_rate": 2.2321428571428575e-05, |
|
"loss": 0.0664, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 71.50442477876106, |
|
"grad_norm": 10.469520568847656, |
|
"learning_rate": 2.215608465608466e-05, |
|
"loss": 0.0893, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 71.85840707964601, |
|
"grad_norm": 6.971242904663086, |
|
"learning_rate": 2.1990740740740743e-05, |
|
"loss": 0.0904, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.82, |
|
"eval_loss": 0.8296825885772705, |
|
"eval_runtime": 5.7586, |
|
"eval_samples_per_second": 69.462, |
|
"eval_steps_per_second": 2.258, |
|
"step": 2034 |
|
}, |
|
{ |
|
"epoch": 72.21238938053098, |
|
"grad_norm": 6.059171199798584, |
|
"learning_rate": 2.1825396825396827e-05, |
|
"loss": 0.0767, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 72.56637168141593, |
|
"grad_norm": 9.134629249572754, |
|
"learning_rate": 2.166005291005291e-05, |
|
"loss": 0.0897, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 72.92035398230088, |
|
"grad_norm": 6.5583415031433105, |
|
"learning_rate": 2.1494708994708996e-05, |
|
"loss": 0.0898, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 72.99115044247787, |
|
"eval_accuracy": 0.82, |
|
"eval_loss": 0.8464323282241821, |
|
"eval_runtime": 5.7076, |
|
"eval_samples_per_second": 70.082, |
|
"eval_steps_per_second": 2.278, |
|
"step": 2062 |
|
}, |
|
{ |
|
"epoch": 73.27433628318585, |
|
"grad_norm": 12.021257400512695, |
|
"learning_rate": 2.132936507936508e-05, |
|
"loss": 0.1061, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 73.6283185840708, |
|
"grad_norm": 4.3469367027282715, |
|
"learning_rate": 2.1164021164021164e-05, |
|
"loss": 0.0744, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 73.98230088495575, |
|
"grad_norm": 9.244330406188965, |
|
"learning_rate": 2.0998677248677252e-05, |
|
"loss": 0.1013, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 73.98230088495575, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8324652314186096, |
|
"eval_runtime": 5.8164, |
|
"eval_samples_per_second": 68.771, |
|
"eval_steps_per_second": 2.235, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 74.33628318584071, |
|
"grad_norm": 4.678669452667236, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 0.1001, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 74.69026548672566, |
|
"grad_norm": 9.78695297241211, |
|
"learning_rate": 2.066798941798942e-05, |
|
"loss": 0.0726, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 74.97345132743362, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 0.8771929144859314, |
|
"eval_runtime": 5.7555, |
|
"eval_samples_per_second": 69.499, |
|
"eval_steps_per_second": 2.259, |
|
"step": 2118 |
|
}, |
|
{ |
|
"epoch": 75.04424778761062, |
|
"grad_norm": 3.598501443862915, |
|
"learning_rate": 2.0502645502645504e-05, |
|
"loss": 0.0975, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 75.39823008849558, |
|
"grad_norm": 10.049243927001953, |
|
"learning_rate": 2.033730158730159e-05, |
|
"loss": 0.0858, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 75.75221238938053, |
|
"grad_norm": 10.887565612792969, |
|
"learning_rate": 2.0171957671957673e-05, |
|
"loss": 0.0745, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.8505265712738037, |
|
"eval_runtime": 5.6825, |
|
"eval_samples_per_second": 70.391, |
|
"eval_steps_per_second": 2.288, |
|
"step": 2147 |
|
}, |
|
{ |
|
"epoch": 76.10619469026548, |
|
"grad_norm": 9.399345397949219, |
|
"learning_rate": 2.0006613756613757e-05, |
|
"loss": 0.0909, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 76.46017699115045, |
|
"grad_norm": 3.6555662155151367, |
|
"learning_rate": 1.984126984126984e-05, |
|
"loss": 0.0799, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 76.8141592920354, |
|
"grad_norm": 5.8622894287109375, |
|
"learning_rate": 1.967592592592593e-05, |
|
"loss": 0.0891, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 76.99115044247787, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8693811297416687, |
|
"eval_runtime": 5.7342, |
|
"eval_samples_per_second": 69.757, |
|
"eval_steps_per_second": 2.267, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 77.16814159292035, |
|
"grad_norm": 3.6685290336608887, |
|
"learning_rate": 1.9510582010582013e-05, |
|
"loss": 0.0909, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 77.52212389380531, |
|
"grad_norm": 9.96608829498291, |
|
"learning_rate": 1.9345238095238097e-05, |
|
"loss": 0.0962, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 77.87610619469027, |
|
"grad_norm": 7.857000827789307, |
|
"learning_rate": 1.917989417989418e-05, |
|
"loss": 0.0791, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 77.98230088495575, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8765752911567688, |
|
"eval_runtime": 5.7248, |
|
"eval_samples_per_second": 69.872, |
|
"eval_steps_per_second": 2.271, |
|
"step": 2203 |
|
}, |
|
{ |
|
"epoch": 78.23008849557522, |
|
"grad_norm": 10.403280258178711, |
|
"learning_rate": 1.9014550264550266e-05, |
|
"loss": 0.0622, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 78.58407079646018, |
|
"grad_norm": 6.770401477813721, |
|
"learning_rate": 1.884920634920635e-05, |
|
"loss": 0.0689, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 78.93805309734513, |
|
"grad_norm": 10.228433609008789, |
|
"learning_rate": 1.8683862433862434e-05, |
|
"loss": 0.0639, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 78.97345132743362, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.8461715579032898, |
|
"eval_runtime": 6.0121, |
|
"eval_samples_per_second": 66.532, |
|
"eval_steps_per_second": 2.162, |
|
"step": 2231 |
|
}, |
|
{ |
|
"epoch": 79.29203539823008, |
|
"grad_norm": 6.610928535461426, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.0705, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 79.64601769911505, |
|
"grad_norm": 9.066596984863281, |
|
"learning_rate": 1.8353174603174602e-05, |
|
"loss": 0.0795, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 11.537944793701172, |
|
"learning_rate": 1.818783068783069e-05, |
|
"loss": 0.0676, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.8991250395774841, |
|
"eval_runtime": 5.7259, |
|
"eval_samples_per_second": 69.858, |
|
"eval_steps_per_second": 2.27, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 80.35398230088495, |
|
"grad_norm": 5.221861839294434, |
|
"learning_rate": 1.8022486772486774e-05, |
|
"loss": 0.0932, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 80.70796460176992, |
|
"grad_norm": 6.904067039489746, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.0904, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 80.99115044247787, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.8550813794136047, |
|
"eval_runtime": 5.7321, |
|
"eval_samples_per_second": 69.782, |
|
"eval_steps_per_second": 2.268, |
|
"step": 2288 |
|
}, |
|
{ |
|
"epoch": 81.06194690265487, |
|
"grad_norm": 10.940287590026855, |
|
"learning_rate": 1.7691798941798943e-05, |
|
"loss": 0.0942, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 81.41592920353982, |
|
"grad_norm": 4.109130382537842, |
|
"learning_rate": 1.7526455026455027e-05, |
|
"loss": 0.0791, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 81.76991150442478, |
|
"grad_norm": 10.844812393188477, |
|
"learning_rate": 1.736111111111111e-05, |
|
"loss": 0.0788, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 81.98230088495575, |
|
"eval_accuracy": 0.795, |
|
"eval_loss": 0.9301651120185852, |
|
"eval_runtime": 5.6474, |
|
"eval_samples_per_second": 70.829, |
|
"eval_steps_per_second": 2.302, |
|
"step": 2316 |
|
}, |
|
{ |
|
"epoch": 82.12389380530973, |
|
"grad_norm": 3.5678551197052, |
|
"learning_rate": 1.7195767195767195e-05, |
|
"loss": 0.0603, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 82.47787610619469, |
|
"grad_norm": 7.562065124511719, |
|
"learning_rate": 1.703042328042328e-05, |
|
"loss": 0.093, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 82.83185840707965, |
|
"grad_norm": 10.092254638671875, |
|
"learning_rate": 1.6865079365079367e-05, |
|
"loss": 0.0787, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 82.97345132743362, |
|
"eval_accuracy": 0.8025, |
|
"eval_loss": 0.8706057071685791, |
|
"eval_runtime": 5.8345, |
|
"eval_samples_per_second": 68.558, |
|
"eval_steps_per_second": 2.228, |
|
"step": 2344 |
|
}, |
|
{ |
|
"epoch": 83.1858407079646, |
|
"grad_norm": 4.765111923217773, |
|
"learning_rate": 1.669973544973545e-05, |
|
"loss": 0.0579, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 83.53982300884955, |
|
"grad_norm": 6.426796913146973, |
|
"learning_rate": 1.6534391534391536e-05, |
|
"loss": 0.0697, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 83.89380530973452, |
|
"grad_norm": 7.382542610168457, |
|
"learning_rate": 1.636904761904762e-05, |
|
"loss": 0.0918, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.868044912815094, |
|
"eval_runtime": 5.7723, |
|
"eval_samples_per_second": 69.297, |
|
"eval_steps_per_second": 2.252, |
|
"step": 2373 |
|
}, |
|
{ |
|
"epoch": 84.24778761061947, |
|
"grad_norm": 5.388473987579346, |
|
"learning_rate": 1.6203703703703704e-05, |
|
"loss": 0.0752, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 84.60176991150442, |
|
"grad_norm": 6.751432418823242, |
|
"learning_rate": 1.6038359788359788e-05, |
|
"loss": 0.0671, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 84.95575221238938, |
|
"grad_norm": 8.372601509094238, |
|
"learning_rate": 1.5873015873015872e-05, |
|
"loss": 0.0681, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 84.99115044247787, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.8481296300888062, |
|
"eval_runtime": 5.711, |
|
"eval_samples_per_second": 70.04, |
|
"eval_steps_per_second": 2.276, |
|
"step": 2401 |
|
}, |
|
{ |
|
"epoch": 85.30973451327434, |
|
"grad_norm": 6.79984712600708, |
|
"learning_rate": 1.5707671957671957e-05, |
|
"loss": 0.0634, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 85.66371681415929, |
|
"grad_norm": 9.60888671875, |
|
"learning_rate": 1.5542328042328044e-05, |
|
"loss": 0.115, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 85.98230088495575, |
|
"eval_accuracy": 0.8025, |
|
"eval_loss": 0.8552606105804443, |
|
"eval_runtime": 6.0227, |
|
"eval_samples_per_second": 66.415, |
|
"eval_steps_per_second": 2.158, |
|
"step": 2429 |
|
}, |
|
{ |
|
"epoch": 86.01769911504425, |
|
"grad_norm": 9.006217956542969, |
|
"learning_rate": 1.537698412698413e-05, |
|
"loss": 0.0741, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 86.3716814159292, |
|
"grad_norm": 8.767806053161621, |
|
"learning_rate": 1.5211640211640213e-05, |
|
"loss": 0.0652, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 86.72566371681415, |
|
"grad_norm": 6.8285675048828125, |
|
"learning_rate": 1.5046296296296297e-05, |
|
"loss": 0.0599, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 86.97345132743362, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.8886809349060059, |
|
"eval_runtime": 5.6679, |
|
"eval_samples_per_second": 70.573, |
|
"eval_steps_per_second": 2.294, |
|
"step": 2457 |
|
}, |
|
{ |
|
"epoch": 87.07964601769912, |
|
"grad_norm": 7.630321502685547, |
|
"learning_rate": 1.4880952380952381e-05, |
|
"loss": 0.0805, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 87.43362831858407, |
|
"grad_norm": 8.364428520202637, |
|
"learning_rate": 1.4715608465608465e-05, |
|
"loss": 0.0743, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 87.78761061946902, |
|
"grad_norm": 8.15882682800293, |
|
"learning_rate": 1.455026455026455e-05, |
|
"loss": 0.0774, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.9255210161209106, |
|
"eval_runtime": 5.684, |
|
"eval_samples_per_second": 70.374, |
|
"eval_steps_per_second": 2.287, |
|
"step": 2486 |
|
}, |
|
{ |
|
"epoch": 88.14159292035399, |
|
"grad_norm": 3.7120983600616455, |
|
"learning_rate": 1.4384920634920635e-05, |
|
"loss": 0.0635, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 88.49557522123894, |
|
"grad_norm": 3.117091655731201, |
|
"learning_rate": 1.4219576719576721e-05, |
|
"loss": 0.0722, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 88.84955752212389, |
|
"grad_norm": 4.884605407714844, |
|
"learning_rate": 1.4054232804232805e-05, |
|
"loss": 0.0701, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 88.99115044247787, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.8794758319854736, |
|
"eval_runtime": 5.697, |
|
"eval_samples_per_second": 70.213, |
|
"eval_steps_per_second": 2.282, |
|
"step": 2514 |
|
}, |
|
{ |
|
"epoch": 89.20353982300885, |
|
"grad_norm": 6.620547294616699, |
|
"learning_rate": 1.388888888888889e-05, |
|
"loss": 0.0626, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 89.5575221238938, |
|
"grad_norm": 6.801345348358154, |
|
"learning_rate": 1.3723544973544974e-05, |
|
"loss": 0.0682, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 89.91150442477876, |
|
"grad_norm": 5.619492053985596, |
|
"learning_rate": 1.3558201058201058e-05, |
|
"loss": 0.074, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 89.98230088495575, |
|
"eval_accuracy": 0.8175, |
|
"eval_loss": 0.8634124994277954, |
|
"eval_runtime": 5.7593, |
|
"eval_samples_per_second": 69.453, |
|
"eval_steps_per_second": 2.257, |
|
"step": 2542 |
|
}, |
|
{ |
|
"epoch": 90.26548672566372, |
|
"grad_norm": 10.153610229492188, |
|
"learning_rate": 1.3392857142857144e-05, |
|
"loss": 0.0794, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 90.61946902654867, |
|
"grad_norm": 5.089029312133789, |
|
"learning_rate": 1.3227513227513228e-05, |
|
"loss": 0.0611, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 90.97345132743362, |
|
"grad_norm": 6.003979206085205, |
|
"learning_rate": 1.3062169312169312e-05, |
|
"loss": 0.0497, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 90.97345132743362, |
|
"eval_accuracy": 0.82, |
|
"eval_loss": 0.8793442249298096, |
|
"eval_runtime": 6.0414, |
|
"eval_samples_per_second": 66.21, |
|
"eval_steps_per_second": 2.152, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 91.32743362831859, |
|
"grad_norm": 6.3248090744018555, |
|
"learning_rate": 1.2896825396825398e-05, |
|
"loss": 0.0557, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 91.68141592920354, |
|
"grad_norm": 4.216904640197754, |
|
"learning_rate": 1.2731481481481482e-05, |
|
"loss": 0.0569, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.7925, |
|
"eval_loss": 0.9006764888763428, |
|
"eval_runtime": 5.7399, |
|
"eval_samples_per_second": 69.688, |
|
"eval_steps_per_second": 2.265, |
|
"step": 2599 |
|
}, |
|
{ |
|
"epoch": 92.03539823008849, |
|
"grad_norm": 6.895070552825928, |
|
"learning_rate": 1.2566137566137568e-05, |
|
"loss": 0.0769, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 92.38938053097345, |
|
"grad_norm": 3.7500102519989014, |
|
"learning_rate": 1.2400793650793652e-05, |
|
"loss": 0.0543, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 92.7433628318584, |
|
"grad_norm": 10.693458557128906, |
|
"learning_rate": 1.2235449735449737e-05, |
|
"loss": 0.0722, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 92.99115044247787, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.8700942993164062, |
|
"eval_runtime": 5.6593, |
|
"eval_samples_per_second": 70.681, |
|
"eval_steps_per_second": 2.297, |
|
"step": 2627 |
|
}, |
|
{ |
|
"epoch": 93.09734513274336, |
|
"grad_norm": 5.407220363616943, |
|
"learning_rate": 1.2070105820105821e-05, |
|
"loss": 0.0822, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 93.45132743362832, |
|
"grad_norm": 2.9195971488952637, |
|
"learning_rate": 1.1904761904761905e-05, |
|
"loss": 0.0624, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 93.80530973451327, |
|
"grad_norm": 7.752827167510986, |
|
"learning_rate": 1.1739417989417991e-05, |
|
"loss": 0.0674, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 93.98230088495575, |
|
"eval_accuracy": 0.8225, |
|
"eval_loss": 0.8879609704017639, |
|
"eval_runtime": 5.7797, |
|
"eval_samples_per_second": 69.208, |
|
"eval_steps_per_second": 2.249, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 94.15929203539822, |
|
"grad_norm": 6.314310550689697, |
|
"learning_rate": 1.1574074074074075e-05, |
|
"loss": 0.0685, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 94.51327433628319, |
|
"grad_norm": 9.070263862609863, |
|
"learning_rate": 1.140873015873016e-05, |
|
"loss": 0.0649, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 94.86725663716814, |
|
"grad_norm": 4.211071968078613, |
|
"learning_rate": 1.1243386243386244e-05, |
|
"loss": 0.0643, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 94.97345132743362, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.8854994177818298, |
|
"eval_runtime": 5.6723, |
|
"eval_samples_per_second": 70.518, |
|
"eval_steps_per_second": 2.292, |
|
"step": 2683 |
|
}, |
|
{ |
|
"epoch": 95.22123893805309, |
|
"grad_norm": 6.701349258422852, |
|
"learning_rate": 1.107804232804233e-05, |
|
"loss": 0.054, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 95.57522123893806, |
|
"grad_norm": 9.44869327545166, |
|
"learning_rate": 1.0912698412698414e-05, |
|
"loss": 0.0594, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 95.929203539823, |
|
"grad_norm": 5.749889373779297, |
|
"learning_rate": 1.0747354497354498e-05, |
|
"loss": 0.0583, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.8918322920799255, |
|
"eval_runtime": 5.8266, |
|
"eval_samples_per_second": 68.651, |
|
"eval_steps_per_second": 2.231, |
|
"step": 2712 |
|
}, |
|
{ |
|
"epoch": 96.28318584070796, |
|
"grad_norm": 9.790497779846191, |
|
"learning_rate": 1.0582010582010582e-05, |
|
"loss": 0.0731, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 96.63716814159292, |
|
"grad_norm": 10.20504379272461, |
|
"learning_rate": 1.0416666666666668e-05, |
|
"loss": 0.067, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 96.99115044247787, |
|
"grad_norm": 9.304971694946289, |
|
"learning_rate": 1.0251322751322752e-05, |
|
"loss": 0.0558, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 96.99115044247787, |
|
"eval_accuracy": 0.8275, |
|
"eval_loss": 0.8735535144805908, |
|
"eval_runtime": 5.748, |
|
"eval_samples_per_second": 69.59, |
|
"eval_steps_per_second": 2.262, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 97.34513274336283, |
|
"grad_norm": 2.155658483505249, |
|
"learning_rate": 1.0085978835978836e-05, |
|
"loss": 0.0952, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 97.69911504424779, |
|
"grad_norm": 4.216080665588379, |
|
"learning_rate": 9.92063492063492e-06, |
|
"loss": 0.0622, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 97.98230088495575, |
|
"eval_accuracy": 0.815, |
|
"eval_loss": 0.905790388584137, |
|
"eval_runtime": 6.0004, |
|
"eval_samples_per_second": 66.662, |
|
"eval_steps_per_second": 2.167, |
|
"step": 2768 |
|
}, |
|
{ |
|
"epoch": 98.05309734513274, |
|
"grad_norm": 5.462803840637207, |
|
"learning_rate": 9.755291005291007e-06, |
|
"loss": 0.0576, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 98.40707964601769, |
|
"grad_norm": 6.204135894775391, |
|
"learning_rate": 9.58994708994709e-06, |
|
"loss": 0.0871, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 98.76106194690266, |
|
"grad_norm": 7.025479316711426, |
|
"learning_rate": 9.424603174603175e-06, |
|
"loss": 0.0689, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 98.97345132743362, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.9006683230400085, |
|
"eval_runtime": 5.6446, |
|
"eval_samples_per_second": 70.864, |
|
"eval_steps_per_second": 2.303, |
|
"step": 2796 |
|
}, |
|
{ |
|
"epoch": 99.11504424778761, |
|
"grad_norm": 12.175108909606934, |
|
"learning_rate": 9.259259259259259e-06, |
|
"loss": 0.0653, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 99.46902654867256, |
|
"grad_norm": 2.277102470397949, |
|
"learning_rate": 9.093915343915345e-06, |
|
"loss": 0.0533, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 99.82300884955752, |
|
"grad_norm": 3.374624252319336, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 0.0782, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.8025, |
|
"eval_loss": 0.9216282367706299, |
|
"eval_runtime": 5.6935, |
|
"eval_samples_per_second": 70.255, |
|
"eval_steps_per_second": 2.283, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 100.17699115044248, |
|
"grad_norm": 4.110525131225586, |
|
"learning_rate": 8.763227513227513e-06, |
|
"loss": 0.0678, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 100.53097345132744, |
|
"grad_norm": 1.8253668546676636, |
|
"learning_rate": 8.597883597883598e-06, |
|
"loss": 0.0506, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 100.88495575221239, |
|
"grad_norm": 5.427482604980469, |
|
"learning_rate": 8.432539682539684e-06, |
|
"loss": 0.0696, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 100.99115044247787, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.9158985614776611, |
|
"eval_runtime": 5.7392, |
|
"eval_samples_per_second": 69.696, |
|
"eval_steps_per_second": 2.265, |
|
"step": 2853 |
|
}, |
|
{ |
|
"epoch": 101.23893805309734, |
|
"grad_norm": 11.669960975646973, |
|
"learning_rate": 8.267195767195768e-06, |
|
"loss": 0.0424, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 101.59292035398231, |
|
"grad_norm": 3.504544496536255, |
|
"learning_rate": 8.101851851851852e-06, |
|
"loss": 0.0676, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 101.94690265486726, |
|
"grad_norm": 4.86131477355957, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.0554, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 101.98230088495575, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.9194761514663696, |
|
"eval_runtime": 5.7011, |
|
"eval_samples_per_second": 70.162, |
|
"eval_steps_per_second": 2.28, |
|
"step": 2881 |
|
}, |
|
{ |
|
"epoch": 102.30088495575221, |
|
"grad_norm": 10.070198059082031, |
|
"learning_rate": 7.771164021164022e-06, |
|
"loss": 0.0784, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 102.65486725663717, |
|
"grad_norm": 6.48520040512085, |
|
"learning_rate": 7.605820105820106e-06, |
|
"loss": 0.0585, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 102.97345132743362, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.9314340949058533, |
|
"eval_runtime": 5.7273, |
|
"eval_samples_per_second": 69.841, |
|
"eval_steps_per_second": 2.27, |
|
"step": 2909 |
|
}, |
|
{ |
|
"epoch": 103.00884955752213, |
|
"grad_norm": 3.9195964336395264, |
|
"learning_rate": 7.4404761904761905e-06, |
|
"loss": 0.068, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 103.36283185840708, |
|
"grad_norm": 7.709494590759277, |
|
"learning_rate": 7.275132275132275e-06, |
|
"loss": 0.0555, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 103.71681415929204, |
|
"grad_norm": 5.181375026702881, |
|
"learning_rate": 7.1097883597883606e-06, |
|
"loss": 0.0541, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.825, |
|
"eval_loss": 0.893872082233429, |
|
"eval_runtime": 5.7025, |
|
"eval_samples_per_second": 70.145, |
|
"eval_steps_per_second": 2.28, |
|
"step": 2938 |
|
}, |
|
{ |
|
"epoch": 104.070796460177, |
|
"grad_norm": 11.922988891601562, |
|
"learning_rate": 6.944444444444445e-06, |
|
"loss": 0.0656, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 104.42477876106194, |
|
"grad_norm": 6.153034687042236, |
|
"learning_rate": 6.779100529100529e-06, |
|
"loss": 0.0668, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 104.77876106194691, |
|
"grad_norm": 7.586311340332031, |
|
"learning_rate": 6.613756613756614e-06, |
|
"loss": 0.0636, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 104.99115044247787, |
|
"eval_accuracy": 0.8025, |
|
"eval_loss": 0.9045028686523438, |
|
"eval_runtime": 5.7995, |
|
"eval_samples_per_second": 68.972, |
|
"eval_steps_per_second": 2.242, |
|
"step": 2966 |
|
}, |
|
{ |
|
"epoch": 105.13274336283186, |
|
"grad_norm": 7.499364852905273, |
|
"learning_rate": 6.448412698412699e-06, |
|
"loss": 0.0578, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 105.48672566371681, |
|
"grad_norm": 5.914554595947266, |
|
"learning_rate": 6.283068783068784e-06, |
|
"loss": 0.0683, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 105.84070796460178, |
|
"grad_norm": 5.446691513061523, |
|
"learning_rate": 6.117724867724868e-06, |
|
"loss": 0.0684, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 105.98230088495575, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.8892062306404114, |
|
"eval_runtime": 5.7229, |
|
"eval_samples_per_second": 69.895, |
|
"eval_steps_per_second": 2.272, |
|
"step": 2994 |
|
}, |
|
{ |
|
"epoch": 106.19469026548673, |
|
"grad_norm": 5.555883407592773, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 0.0629, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 106.54867256637168, |
|
"grad_norm": 6.0038042068481445, |
|
"learning_rate": 5.787037037037038e-06, |
|
"loss": 0.0637, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 106.90265486725664, |
|
"grad_norm": 4.660765647888184, |
|
"learning_rate": 5.621693121693122e-06, |
|
"loss": 0.0608, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 106.97345132743362, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.8998861908912659, |
|
"eval_runtime": 5.6896, |
|
"eval_samples_per_second": 70.303, |
|
"eval_steps_per_second": 2.285, |
|
"step": 3022 |
|
}, |
|
{ |
|
"epoch": 107.2566371681416, |
|
"grad_norm": 2.3250956535339355, |
|
"learning_rate": 5.456349206349207e-06, |
|
"loss": 0.065, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 107.61061946902655, |
|
"grad_norm": 3.4695749282836914, |
|
"learning_rate": 5.291005291005291e-06, |
|
"loss": 0.0614, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 107.96460176991151, |
|
"grad_norm": 1.7842631340026855, |
|
"learning_rate": 5.125661375661376e-06, |
|
"loss": 0.0663, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.9033371210098267, |
|
"eval_runtime": 6.1003, |
|
"eval_samples_per_second": 65.571, |
|
"eval_steps_per_second": 2.131, |
|
"step": 3051 |
|
}, |
|
{ |
|
"epoch": 108.31858407079646, |
|
"grad_norm": 4.970130920410156, |
|
"learning_rate": 4.96031746031746e-06, |
|
"loss": 0.0466, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 108.67256637168141, |
|
"grad_norm": 6.935737609863281, |
|
"learning_rate": 4.794973544973545e-06, |
|
"loss": 0.054, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 108.99115044247787, |
|
"eval_accuracy": 0.805, |
|
"eval_loss": 0.9248512387275696, |
|
"eval_runtime": 5.6759, |
|
"eval_samples_per_second": 70.473, |
|
"eval_steps_per_second": 2.29, |
|
"step": 3079 |
|
}, |
|
{ |
|
"epoch": 109.02654867256638, |
|
"grad_norm": 1.8442103862762451, |
|
"learning_rate": 4.6296296296296296e-06, |
|
"loss": 0.0681, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 109.38053097345133, |
|
"grad_norm": 6.584358215332031, |
|
"learning_rate": 4.464285714285715e-06, |
|
"loss": 0.0551, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 109.73451327433628, |
|
"grad_norm": 2.5174403190612793, |
|
"learning_rate": 4.298941798941799e-06, |
|
"loss": 0.0538, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 109.98230088495575, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.9065310955047607, |
|
"eval_runtime": 5.7721, |
|
"eval_samples_per_second": 69.299, |
|
"eval_steps_per_second": 2.252, |
|
"step": 3107 |
|
}, |
|
{ |
|
"epoch": 110.08849557522124, |
|
"grad_norm": 3.406076669692993, |
|
"learning_rate": 4.133597883597884e-06, |
|
"loss": 0.0768, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 110.4424778761062, |
|
"grad_norm": 3.832432746887207, |
|
"learning_rate": 3.968253968253968e-06, |
|
"loss": 0.0674, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 110.79646017699115, |
|
"grad_norm": 2.3623971939086914, |
|
"learning_rate": 3.802910052910053e-06, |
|
"loss": 0.0696, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 110.97345132743362, |
|
"eval_accuracy": 0.8175, |
|
"eval_loss": 0.9002352952957153, |
|
"eval_runtime": 5.823, |
|
"eval_samples_per_second": 68.693, |
|
"eval_steps_per_second": 2.233, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 111.15044247787611, |
|
"grad_norm": 8.914955139160156, |
|
"learning_rate": 3.6375661375661373e-06, |
|
"loss": 0.059, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 111.50442477876106, |
|
"grad_norm": 6.295753479003906, |
|
"learning_rate": 3.4722222222222224e-06, |
|
"loss": 0.0496, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 111.85840707964601, |
|
"grad_norm": 4.604882717132568, |
|
"learning_rate": 3.306878306878307e-06, |
|
"loss": 0.0585, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.8025, |
|
"eval_loss": 0.9105786085128784, |
|
"eval_runtime": 5.6843, |
|
"eval_samples_per_second": 70.369, |
|
"eval_steps_per_second": 2.287, |
|
"step": 3164 |
|
}, |
|
{ |
|
"epoch": 112.21238938053098, |
|
"grad_norm": 6.111964702606201, |
|
"learning_rate": 3.141534391534392e-06, |
|
"loss": 0.0387, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 112.56637168141593, |
|
"grad_norm": 9.29433536529541, |
|
"learning_rate": 2.9761904761904763e-06, |
|
"loss": 0.0623, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 112.92035398230088, |
|
"grad_norm": 10.9270601272583, |
|
"learning_rate": 2.810846560846561e-06, |
|
"loss": 0.0641, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 112.99115044247787, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.9088242053985596, |
|
"eval_runtime": 5.7312, |
|
"eval_samples_per_second": 69.794, |
|
"eval_steps_per_second": 2.268, |
|
"step": 3192 |
|
}, |
|
{ |
|
"epoch": 113.27433628318585, |
|
"grad_norm": 9.549851417541504, |
|
"learning_rate": 2.6455026455026455e-06, |
|
"loss": 0.0555, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 113.6283185840708, |
|
"grad_norm": 7.800489902496338, |
|
"learning_rate": 2.48015873015873e-06, |
|
"loss": 0.0527, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 113.98230088495575, |
|
"grad_norm": 4.22268533706665, |
|
"learning_rate": 2.3148148148148148e-06, |
|
"loss": 0.0611, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 113.98230088495575, |
|
"eval_accuracy": 0.8075, |
|
"eval_loss": 0.915170431137085, |
|
"eval_runtime": 5.6521, |
|
"eval_samples_per_second": 70.77, |
|
"eval_steps_per_second": 2.3, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 114.33628318584071, |
|
"grad_norm": 6.939643383026123, |
|
"learning_rate": 2.1494708994708994e-06, |
|
"loss": 0.092, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 114.69026548672566, |
|
"grad_norm": 4.18217658996582, |
|
"learning_rate": 1.984126984126984e-06, |
|
"loss": 0.0528, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 114.97345132743362, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.9140109419822693, |
|
"eval_runtime": 5.9093, |
|
"eval_samples_per_second": 67.69, |
|
"eval_steps_per_second": 2.2, |
|
"step": 3248 |
|
}, |
|
{ |
|
"epoch": 115.04424778761062, |
|
"grad_norm": 2.470015287399292, |
|
"learning_rate": 1.8187830687830687e-06, |
|
"loss": 0.0664, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 115.39823008849558, |
|
"grad_norm": 5.625057220458984, |
|
"learning_rate": 1.6534391534391535e-06, |
|
"loss": 0.0567, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 115.75221238938053, |
|
"grad_norm": 8.182403564453125, |
|
"learning_rate": 1.4880952380952381e-06, |
|
"loss": 0.0631, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.9184489250183105, |
|
"eval_runtime": 5.7237, |
|
"eval_samples_per_second": 69.885, |
|
"eval_steps_per_second": 2.271, |
|
"step": 3277 |
|
}, |
|
{ |
|
"epoch": 116.10619469026548, |
|
"grad_norm": 3.7127795219421387, |
|
"learning_rate": 1.3227513227513228e-06, |
|
"loss": 0.0667, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 116.46017699115045, |
|
"grad_norm": 9.592253684997559, |
|
"learning_rate": 1.1574074074074074e-06, |
|
"loss": 0.0559, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 116.8141592920354, |
|
"grad_norm": 12.001859664916992, |
|
"learning_rate": 9.92063492063492e-07, |
|
"loss": 0.0744, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 116.99115044247787, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.9216000437736511, |
|
"eval_runtime": 5.7416, |
|
"eval_samples_per_second": 69.667, |
|
"eval_steps_per_second": 2.264, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 117.16814159292035, |
|
"grad_norm": 5.99811315536499, |
|
"learning_rate": 8.267195767195768e-07, |
|
"loss": 0.051, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 117.52212389380531, |
|
"grad_norm": 4.772040367126465, |
|
"learning_rate": 6.613756613756614e-07, |
|
"loss": 0.0705, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 117.87610619469027, |
|
"grad_norm": 3.2538766860961914, |
|
"learning_rate": 4.96031746031746e-07, |
|
"loss": 0.0407, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 117.98230088495575, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 0.9210975766181946, |
|
"eval_runtime": 5.6807, |
|
"eval_samples_per_second": 70.414, |
|
"eval_steps_per_second": 2.288, |
|
"step": 3333 |
|
}, |
|
{ |
|
"epoch": 118.23008849557522, |
|
"grad_norm": 5.481634140014648, |
|
"learning_rate": 3.306878306878307e-07, |
|
"loss": 0.0682, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 118.58407079646018, |
|
"grad_norm": 7.475958824157715, |
|
"learning_rate": 1.6534391534391535e-07, |
|
"loss": 0.058, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 118.93805309734513, |
|
"grad_norm": 12.294817924499512, |
|
"learning_rate": 0.0, |
|
"loss": 0.0573, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 118.93805309734513, |
|
"eval_accuracy": 0.81, |
|
"eval_loss": 0.92020583152771, |
|
"eval_runtime": 5.7978, |
|
"eval_samples_per_second": 68.992, |
|
"eval_steps_per_second": 2.242, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 118.93805309734513, |
|
"step": 3360, |
|
"total_flos": 1.0779764781475824e+19, |
|
"train_loss": 0.2818299961143306, |
|
"train_runtime": 9749.4057, |
|
"train_samples_per_second": 44.31, |
|
"train_steps_per_second": 0.345 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3360, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 120, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0779764781475824e+19, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|