vishalkatheriya18's picture
End of training
8c32461 verified
raw
history blame
87.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 118.93805309734513,
"eval_steps": 500,
"global_step": 3360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.35398230088495575,
"grad_norm": 8.875021934509277,
"learning_rate": 1.4880952380952381e-06,
"loss": 2.2984,
"step": 10
},
{
"epoch": 0.7079646017699115,
"grad_norm": 10.44489860534668,
"learning_rate": 2.9761904761904763e-06,
"loss": 2.302,
"step": 20
},
{
"epoch": 0.9911504424778761,
"eval_accuracy": 0.1575,
"eval_loss": 2.266624689102173,
"eval_runtime": 7.4116,
"eval_samples_per_second": 53.97,
"eval_steps_per_second": 1.754,
"step": 28
},
{
"epoch": 1.0619469026548674,
"grad_norm": 11.131507873535156,
"learning_rate": 4.464285714285715e-06,
"loss": 2.2827,
"step": 30
},
{
"epoch": 1.415929203539823,
"grad_norm": 11.846382141113281,
"learning_rate": 5.9523809523809525e-06,
"loss": 2.2517,
"step": 40
},
{
"epoch": 1.7699115044247788,
"grad_norm": 8.047672271728516,
"learning_rate": 7.4404761904761905e-06,
"loss": 2.2226,
"step": 50
},
{
"epoch": 1.9823008849557522,
"eval_accuracy": 0.315,
"eval_loss": 2.1654422283172607,
"eval_runtime": 5.7675,
"eval_samples_per_second": 69.354,
"eval_steps_per_second": 2.254,
"step": 56
},
{
"epoch": 2.1238938053097347,
"grad_norm": 7.9003190994262695,
"learning_rate": 8.92857142857143e-06,
"loss": 2.1848,
"step": 60
},
{
"epoch": 2.47787610619469,
"grad_norm": 14.892489433288574,
"learning_rate": 1.0416666666666668e-05,
"loss": 2.1238,
"step": 70
},
{
"epoch": 2.831858407079646,
"grad_norm": 13.308501243591309,
"learning_rate": 1.1904761904761905e-05,
"loss": 2.0639,
"step": 80
},
{
"epoch": 2.9734513274336285,
"eval_accuracy": 0.445,
"eval_loss": 1.9969619512557983,
"eval_runtime": 5.7209,
"eval_samples_per_second": 69.919,
"eval_steps_per_second": 2.272,
"step": 84
},
{
"epoch": 3.185840707964602,
"grad_norm": 16.818639755249023,
"learning_rate": 1.3392857142857144e-05,
"loss": 2.005,
"step": 90
},
{
"epoch": 3.5398230088495577,
"grad_norm": 13.497295379638672,
"learning_rate": 1.4880952380952381e-05,
"loss": 1.9232,
"step": 100
},
{
"epoch": 3.893805309734513,
"grad_norm": 16.994335174560547,
"learning_rate": 1.636904761904762e-05,
"loss": 1.8559,
"step": 110
},
{
"epoch": 4.0,
"eval_accuracy": 0.56,
"eval_loss": 1.7373383045196533,
"eval_runtime": 5.8519,
"eval_samples_per_second": 68.354,
"eval_steps_per_second": 2.222,
"step": 113
},
{
"epoch": 4.247787610619469,
"grad_norm": 25.03571891784668,
"learning_rate": 1.785714285714286e-05,
"loss": 1.7371,
"step": 120
},
{
"epoch": 4.601769911504425,
"grad_norm": 27.68413734436035,
"learning_rate": 1.9345238095238097e-05,
"loss": 1.6511,
"step": 130
},
{
"epoch": 4.95575221238938,
"grad_norm": 21.25657081604004,
"learning_rate": 2.0833333333333336e-05,
"loss": 1.5966,
"step": 140
},
{
"epoch": 4.991150442477876,
"eval_accuracy": 0.605,
"eval_loss": 1.48233962059021,
"eval_runtime": 5.791,
"eval_samples_per_second": 69.072,
"eval_steps_per_second": 2.245,
"step": 141
},
{
"epoch": 5.3097345132743365,
"grad_norm": 33.357173919677734,
"learning_rate": 2.2321428571428575e-05,
"loss": 1.4767,
"step": 150
},
{
"epoch": 5.663716814159292,
"grad_norm": 21.856212615966797,
"learning_rate": 2.380952380952381e-05,
"loss": 1.3967,
"step": 160
},
{
"epoch": 5.982300884955752,
"eval_accuracy": 0.6125,
"eval_loss": 1.2925149202346802,
"eval_runtime": 5.8558,
"eval_samples_per_second": 68.308,
"eval_steps_per_second": 2.22,
"step": 169
},
{
"epoch": 6.017699115044247,
"grad_norm": 34.79808807373047,
"learning_rate": 2.529761904761905e-05,
"loss": 1.3046,
"step": 170
},
{
"epoch": 6.371681415929204,
"grad_norm": 27.307743072509766,
"learning_rate": 2.6785714285714288e-05,
"loss": 1.2429,
"step": 180
},
{
"epoch": 6.725663716814159,
"grad_norm": 42.35990524291992,
"learning_rate": 2.8273809523809523e-05,
"loss": 1.204,
"step": 190
},
{
"epoch": 6.9734513274336285,
"eval_accuracy": 0.68,
"eval_loss": 1.0512183904647827,
"eval_runtime": 6.0412,
"eval_samples_per_second": 66.212,
"eval_steps_per_second": 2.152,
"step": 197
},
{
"epoch": 7.079646017699115,
"grad_norm": 35.177921295166016,
"learning_rate": 2.9761904761904762e-05,
"loss": 1.1471,
"step": 200
},
{
"epoch": 7.433628318584071,
"grad_norm": 42.960391998291016,
"learning_rate": 3.125e-05,
"loss": 1.1087,
"step": 210
},
{
"epoch": 7.787610619469026,
"grad_norm": 28.81920623779297,
"learning_rate": 3.273809523809524e-05,
"loss": 1.0206,
"step": 220
},
{
"epoch": 8.0,
"eval_accuracy": 0.7025,
"eval_loss": 0.930656909942627,
"eval_runtime": 5.9577,
"eval_samples_per_second": 67.14,
"eval_steps_per_second": 2.182,
"step": 226
},
{
"epoch": 8.141592920353983,
"grad_norm": 27.41872787475586,
"learning_rate": 3.422619047619048e-05,
"loss": 0.9949,
"step": 230
},
{
"epoch": 8.495575221238939,
"grad_norm": 30.292728424072266,
"learning_rate": 3.571428571428572e-05,
"loss": 0.9248,
"step": 240
},
{
"epoch": 8.849557522123893,
"grad_norm": 41.145442962646484,
"learning_rate": 3.7202380952380956e-05,
"loss": 0.9408,
"step": 250
},
{
"epoch": 8.991150442477876,
"eval_accuracy": 0.7425,
"eval_loss": 0.8286006450653076,
"eval_runtime": 6.1346,
"eval_samples_per_second": 65.204,
"eval_steps_per_second": 2.119,
"step": 254
},
{
"epoch": 9.20353982300885,
"grad_norm": 100.8248519897461,
"learning_rate": 3.8690476190476195e-05,
"loss": 0.9164,
"step": 260
},
{
"epoch": 9.557522123893806,
"grad_norm": 33.74626541137695,
"learning_rate": 4.017857142857143e-05,
"loss": 0.8392,
"step": 270
},
{
"epoch": 9.91150442477876,
"grad_norm": 23.371883392333984,
"learning_rate": 4.166666666666667e-05,
"loss": 0.8501,
"step": 280
},
{
"epoch": 9.982300884955752,
"eval_accuracy": 0.6975,
"eval_loss": 0.8589820265769958,
"eval_runtime": 5.7507,
"eval_samples_per_second": 69.557,
"eval_steps_per_second": 2.261,
"step": 282
},
{
"epoch": 10.265486725663717,
"grad_norm": 27.10701560974121,
"learning_rate": 4.315476190476191e-05,
"loss": 0.8698,
"step": 290
},
{
"epoch": 10.619469026548673,
"grad_norm": 31.57968521118164,
"learning_rate": 4.464285714285715e-05,
"loss": 0.7621,
"step": 300
},
{
"epoch": 10.973451327433628,
"grad_norm": 16.75609588623047,
"learning_rate": 4.613095238095239e-05,
"loss": 0.7545,
"step": 310
},
{
"epoch": 10.973451327433628,
"eval_accuracy": 0.7475,
"eval_loss": 0.7702187895774841,
"eval_runtime": 5.9752,
"eval_samples_per_second": 66.944,
"eval_steps_per_second": 2.176,
"step": 310
},
{
"epoch": 11.327433628318584,
"grad_norm": 38.2448616027832,
"learning_rate": 4.761904761904762e-05,
"loss": 0.7884,
"step": 320
},
{
"epoch": 11.68141592920354,
"grad_norm": 55.255489349365234,
"learning_rate": 4.910714285714286e-05,
"loss": 0.7484,
"step": 330
},
{
"epoch": 12.0,
"eval_accuracy": 0.765,
"eval_loss": 0.7738745212554932,
"eval_runtime": 5.7936,
"eval_samples_per_second": 69.041,
"eval_steps_per_second": 2.244,
"step": 339
},
{
"epoch": 12.035398230088495,
"grad_norm": 48.28477478027344,
"learning_rate": 4.993386243386244e-05,
"loss": 0.7274,
"step": 340
},
{
"epoch": 12.389380530973451,
"grad_norm": 29.47681999206543,
"learning_rate": 4.976851851851852e-05,
"loss": 0.6793,
"step": 350
},
{
"epoch": 12.743362831858407,
"grad_norm": 25.996002197265625,
"learning_rate": 4.960317460317461e-05,
"loss": 0.6909,
"step": 360
},
{
"epoch": 12.991150442477876,
"eval_accuracy": 0.75,
"eval_loss": 0.7344464063644409,
"eval_runtime": 5.7939,
"eval_samples_per_second": 69.038,
"eval_steps_per_second": 2.244,
"step": 367
},
{
"epoch": 13.097345132743364,
"grad_norm": 17.77817153930664,
"learning_rate": 4.943783068783069e-05,
"loss": 0.6684,
"step": 370
},
{
"epoch": 13.451327433628318,
"grad_norm": 42.287635803222656,
"learning_rate": 4.927248677248678e-05,
"loss": 0.6309,
"step": 380
},
{
"epoch": 13.805309734513274,
"grad_norm": 23.582502365112305,
"learning_rate": 4.910714285714286e-05,
"loss": 0.6558,
"step": 390
},
{
"epoch": 13.982300884955752,
"eval_accuracy": 0.775,
"eval_loss": 0.6874340772628784,
"eval_runtime": 5.9948,
"eval_samples_per_second": 66.725,
"eval_steps_per_second": 2.169,
"step": 395
},
{
"epoch": 14.15929203539823,
"grad_norm": 23.51905059814453,
"learning_rate": 4.894179894179895e-05,
"loss": 0.5937,
"step": 400
},
{
"epoch": 14.513274336283185,
"grad_norm": 23.072551727294922,
"learning_rate": 4.8776455026455034e-05,
"loss": 0.6672,
"step": 410
},
{
"epoch": 14.867256637168142,
"grad_norm": 20.066308975219727,
"learning_rate": 4.8611111111111115e-05,
"loss": 0.5923,
"step": 420
},
{
"epoch": 14.973451327433628,
"eval_accuracy": 0.7675,
"eval_loss": 0.6640682816505432,
"eval_runtime": 5.7834,
"eval_samples_per_second": 69.163,
"eval_steps_per_second": 2.248,
"step": 423
},
{
"epoch": 15.221238938053098,
"grad_norm": 82.714599609375,
"learning_rate": 4.84457671957672e-05,
"loss": 0.6061,
"step": 430
},
{
"epoch": 15.575221238938052,
"grad_norm": 23.3370361328125,
"learning_rate": 4.8280423280423284e-05,
"loss": 0.5595,
"step": 440
},
{
"epoch": 15.929203539823009,
"grad_norm": 39.12126159667969,
"learning_rate": 4.811507936507937e-05,
"loss": 0.5764,
"step": 450
},
{
"epoch": 16.0,
"eval_accuracy": 0.7925,
"eval_loss": 0.6109621524810791,
"eval_runtime": 5.8219,
"eval_samples_per_second": 68.706,
"eval_steps_per_second": 2.233,
"step": 452
},
{
"epoch": 16.283185840707965,
"grad_norm": 18.160350799560547,
"learning_rate": 4.794973544973545e-05,
"loss": 0.564,
"step": 460
},
{
"epoch": 16.63716814159292,
"grad_norm": 22.667816162109375,
"learning_rate": 4.778439153439154e-05,
"loss": 0.5416,
"step": 470
},
{
"epoch": 16.991150442477878,
"grad_norm": 43.60499954223633,
"learning_rate": 4.761904761904762e-05,
"loss": 0.5235,
"step": 480
},
{
"epoch": 16.991150442477878,
"eval_accuracy": 0.76,
"eval_loss": 0.680573582649231,
"eval_runtime": 6.0278,
"eval_samples_per_second": 66.359,
"eval_steps_per_second": 2.157,
"step": 480
},
{
"epoch": 17.345132743362832,
"grad_norm": 17.05644416809082,
"learning_rate": 4.745370370370371e-05,
"loss": 0.5301,
"step": 490
},
{
"epoch": 17.699115044247787,
"grad_norm": 14.023980140686035,
"learning_rate": 4.7288359788359796e-05,
"loss": 0.4883,
"step": 500
},
{
"epoch": 17.98230088495575,
"eval_accuracy": 0.76,
"eval_loss": 0.7902570962905884,
"eval_runtime": 5.8085,
"eval_samples_per_second": 68.865,
"eval_steps_per_second": 2.238,
"step": 508
},
{
"epoch": 18.053097345132745,
"grad_norm": 14.874748229980469,
"learning_rate": 4.7123015873015876e-05,
"loss": 0.529,
"step": 510
},
{
"epoch": 18.4070796460177,
"grad_norm": 17.90399742126465,
"learning_rate": 4.6957671957671964e-05,
"loss": 0.455,
"step": 520
},
{
"epoch": 18.761061946902654,
"grad_norm": 17.8863468170166,
"learning_rate": 4.6792328042328045e-05,
"loss": 0.4682,
"step": 530
},
{
"epoch": 18.97345132743363,
"eval_accuracy": 0.7825,
"eval_loss": 0.6469230055809021,
"eval_runtime": 5.8117,
"eval_samples_per_second": 68.826,
"eval_steps_per_second": 2.237,
"step": 536
},
{
"epoch": 19.115044247787612,
"grad_norm": 25.429996490478516,
"learning_rate": 4.662698412698413e-05,
"loss": 0.4268,
"step": 540
},
{
"epoch": 19.469026548672566,
"grad_norm": 28.328603744506836,
"learning_rate": 4.646164021164021e-05,
"loss": 0.4233,
"step": 550
},
{
"epoch": 19.82300884955752,
"grad_norm": 18.160737991333008,
"learning_rate": 4.62962962962963e-05,
"loss": 0.441,
"step": 560
},
{
"epoch": 20.0,
"eval_accuracy": 0.7825,
"eval_loss": 0.6693841814994812,
"eval_runtime": 5.8278,
"eval_samples_per_second": 68.637,
"eval_steps_per_second": 2.231,
"step": 565
},
{
"epoch": 20.17699115044248,
"grad_norm": 18.324892044067383,
"learning_rate": 4.613095238095239e-05,
"loss": 0.4421,
"step": 570
},
{
"epoch": 20.530973451327434,
"grad_norm": 27.71529769897461,
"learning_rate": 4.596560846560847e-05,
"loss": 0.3583,
"step": 580
},
{
"epoch": 20.884955752212388,
"grad_norm": 10.921418190002441,
"learning_rate": 4.580026455026456e-05,
"loss": 0.4201,
"step": 590
},
{
"epoch": 20.991150442477878,
"eval_accuracy": 0.7625,
"eval_loss": 0.7144609689712524,
"eval_runtime": 5.725,
"eval_samples_per_second": 69.869,
"eval_steps_per_second": 2.271,
"step": 593
},
{
"epoch": 21.238938053097346,
"grad_norm": 27.364168167114258,
"learning_rate": 4.563492063492064e-05,
"loss": 0.3886,
"step": 600
},
{
"epoch": 21.5929203539823,
"grad_norm": 22.475797653198242,
"learning_rate": 4.5469576719576725e-05,
"loss": 0.366,
"step": 610
},
{
"epoch": 21.946902654867255,
"grad_norm": 14.525550842285156,
"learning_rate": 4.5304232804232806e-05,
"loss": 0.387,
"step": 620
},
{
"epoch": 21.98230088495575,
"eval_accuracy": 0.7775,
"eval_loss": 0.6505405902862549,
"eval_runtime": 6.0629,
"eval_samples_per_second": 65.975,
"eval_steps_per_second": 2.144,
"step": 621
},
{
"epoch": 22.300884955752213,
"grad_norm": 15.278279304504395,
"learning_rate": 4.5138888888888894e-05,
"loss": 0.3681,
"step": 630
},
{
"epoch": 22.654867256637168,
"grad_norm": 23.836380004882812,
"learning_rate": 4.4973544973544974e-05,
"loss": 0.4034,
"step": 640
},
{
"epoch": 22.97345132743363,
"eval_accuracy": 0.7875,
"eval_loss": 0.6168544292449951,
"eval_runtime": 5.8701,
"eval_samples_per_second": 68.143,
"eval_steps_per_second": 2.215,
"step": 649
},
{
"epoch": 23.008849557522122,
"grad_norm": 15.120473861694336,
"learning_rate": 4.480820105820106e-05,
"loss": 0.3424,
"step": 650
},
{
"epoch": 23.36283185840708,
"grad_norm": 12.84585189819336,
"learning_rate": 4.464285714285715e-05,
"loss": 0.3669,
"step": 660
},
{
"epoch": 23.716814159292035,
"grad_norm": 24.2416934967041,
"learning_rate": 4.447751322751323e-05,
"loss": 0.3041,
"step": 670
},
{
"epoch": 24.0,
"eval_accuracy": 0.795,
"eval_loss": 0.6416059136390686,
"eval_runtime": 5.7657,
"eval_samples_per_second": 69.375,
"eval_steps_per_second": 2.255,
"step": 678
},
{
"epoch": 24.07079646017699,
"grad_norm": 17.557270050048828,
"learning_rate": 4.431216931216932e-05,
"loss": 0.3344,
"step": 680
},
{
"epoch": 24.424778761061948,
"grad_norm": 12.692173957824707,
"learning_rate": 4.41468253968254e-05,
"loss": 0.2928,
"step": 690
},
{
"epoch": 24.778761061946902,
"grad_norm": 15.073899269104004,
"learning_rate": 4.3981481481481486e-05,
"loss": 0.3021,
"step": 700
},
{
"epoch": 24.991150442477878,
"eval_accuracy": 0.775,
"eval_loss": 0.6992344856262207,
"eval_runtime": 5.8878,
"eval_samples_per_second": 67.937,
"eval_steps_per_second": 2.208,
"step": 706
},
{
"epoch": 25.13274336283186,
"grad_norm": 10.598650932312012,
"learning_rate": 4.381613756613757e-05,
"loss": 0.3147,
"step": 710
},
{
"epoch": 25.486725663716815,
"grad_norm": 17.572126388549805,
"learning_rate": 4.3650793650793655e-05,
"loss": 0.302,
"step": 720
},
{
"epoch": 25.84070796460177,
"grad_norm": 30.176406860351562,
"learning_rate": 4.3485449735449736e-05,
"loss": 0.2853,
"step": 730
},
{
"epoch": 25.98230088495575,
"eval_accuracy": 0.7975,
"eval_loss": 0.6566324830055237,
"eval_runtime": 5.7476,
"eval_samples_per_second": 69.594,
"eval_steps_per_second": 2.262,
"step": 734
},
{
"epoch": 26.194690265486727,
"grad_norm": 18.816850662231445,
"learning_rate": 4.332010582010582e-05,
"loss": 0.297,
"step": 740
},
{
"epoch": 26.548672566371682,
"grad_norm": 14.217058181762695,
"learning_rate": 4.315476190476191e-05,
"loss": 0.2849,
"step": 750
},
{
"epoch": 26.902654867256636,
"grad_norm": 17.27300453186035,
"learning_rate": 4.298941798941799e-05,
"loss": 0.27,
"step": 760
},
{
"epoch": 26.97345132743363,
"eval_accuracy": 0.7825,
"eval_loss": 0.696976900100708,
"eval_runtime": 6.0434,
"eval_samples_per_second": 66.188,
"eval_steps_per_second": 2.151,
"step": 762
},
{
"epoch": 27.256637168141594,
"grad_norm": 10.05718994140625,
"learning_rate": 4.282407407407408e-05,
"loss": 0.2331,
"step": 770
},
{
"epoch": 27.61061946902655,
"grad_norm": 12.319819450378418,
"learning_rate": 4.265873015873016e-05,
"loss": 0.2462,
"step": 780
},
{
"epoch": 27.964601769911503,
"grad_norm": 14.027650833129883,
"learning_rate": 4.249338624338625e-05,
"loss": 0.2722,
"step": 790
},
{
"epoch": 28.0,
"eval_accuracy": 0.785,
"eval_loss": 0.6862995624542236,
"eval_runtime": 5.7837,
"eval_samples_per_second": 69.16,
"eval_steps_per_second": 2.248,
"step": 791
},
{
"epoch": 28.31858407079646,
"grad_norm": 11.350334167480469,
"learning_rate": 4.232804232804233e-05,
"loss": 0.2366,
"step": 800
},
{
"epoch": 28.672566371681416,
"grad_norm": 8.22404670715332,
"learning_rate": 4.2162698412698416e-05,
"loss": 0.2143,
"step": 810
},
{
"epoch": 28.991150442477878,
"eval_accuracy": 0.795,
"eval_loss": 0.679432213306427,
"eval_runtime": 5.7657,
"eval_samples_per_second": 69.376,
"eval_steps_per_second": 2.255,
"step": 819
},
{
"epoch": 29.02654867256637,
"grad_norm": 11.07326889038086,
"learning_rate": 4.1997354497354504e-05,
"loss": 0.2443,
"step": 820
},
{
"epoch": 29.38053097345133,
"grad_norm": 28.747331619262695,
"learning_rate": 4.1832010582010584e-05,
"loss": 0.2424,
"step": 830
},
{
"epoch": 29.734513274336283,
"grad_norm": 12.262460708618164,
"learning_rate": 4.166666666666667e-05,
"loss": 0.2238,
"step": 840
},
{
"epoch": 29.98230088495575,
"eval_accuracy": 0.7975,
"eval_loss": 0.6781703233718872,
"eval_runtime": 5.7261,
"eval_samples_per_second": 69.856,
"eval_steps_per_second": 2.27,
"step": 847
},
{
"epoch": 30.088495575221238,
"grad_norm": 12.248478889465332,
"learning_rate": 4.150132275132275e-05,
"loss": 0.2414,
"step": 850
},
{
"epoch": 30.442477876106196,
"grad_norm": 26.66759490966797,
"learning_rate": 4.133597883597884e-05,
"loss": 0.222,
"step": 860
},
{
"epoch": 30.79646017699115,
"grad_norm": 17.73442268371582,
"learning_rate": 4.117063492063492e-05,
"loss": 0.2387,
"step": 870
},
{
"epoch": 30.97345132743363,
"eval_accuracy": 0.81,
"eval_loss": 0.6944553256034851,
"eval_runtime": 5.9266,
"eval_samples_per_second": 67.492,
"eval_steps_per_second": 2.193,
"step": 875
},
{
"epoch": 31.150442477876105,
"grad_norm": 10.954143524169922,
"learning_rate": 4.100529100529101e-05,
"loss": 0.1897,
"step": 880
},
{
"epoch": 31.504424778761063,
"grad_norm": 11.53641414642334,
"learning_rate": 4.083994708994709e-05,
"loss": 0.215,
"step": 890
},
{
"epoch": 31.858407079646017,
"grad_norm": 19.236658096313477,
"learning_rate": 4.067460317460318e-05,
"loss": 0.223,
"step": 900
},
{
"epoch": 32.0,
"eval_accuracy": 0.7825,
"eval_loss": 0.7377186417579651,
"eval_runtime": 5.6881,
"eval_samples_per_second": 70.322,
"eval_steps_per_second": 2.285,
"step": 904
},
{
"epoch": 32.21238938053097,
"grad_norm": 18.005434036254883,
"learning_rate": 4.0509259259259265e-05,
"loss": 0.1703,
"step": 910
},
{
"epoch": 32.56637168141593,
"grad_norm": 10.189879417419434,
"learning_rate": 4.0343915343915346e-05,
"loss": 0.1882,
"step": 920
},
{
"epoch": 32.92035398230089,
"grad_norm": 23.737546920776367,
"learning_rate": 4.017857142857143e-05,
"loss": 0.2211,
"step": 930
},
{
"epoch": 32.991150442477874,
"eval_accuracy": 0.7775,
"eval_loss": 0.7430591583251953,
"eval_runtime": 5.8045,
"eval_samples_per_second": 68.911,
"eval_steps_per_second": 2.24,
"step": 932
},
{
"epoch": 33.27433628318584,
"grad_norm": 12.777314186096191,
"learning_rate": 4.0013227513227514e-05,
"loss": 0.2187,
"step": 940
},
{
"epoch": 33.6283185840708,
"grad_norm": 11.4727783203125,
"learning_rate": 3.98478835978836e-05,
"loss": 0.1952,
"step": 950
},
{
"epoch": 33.982300884955755,
"grad_norm": 9.033596992492676,
"learning_rate": 3.968253968253968e-05,
"loss": 0.1882,
"step": 960
},
{
"epoch": 33.982300884955755,
"eval_accuracy": 0.815,
"eval_loss": 0.702938437461853,
"eval_runtime": 5.7972,
"eval_samples_per_second": 68.998,
"eval_steps_per_second": 2.242,
"step": 960
},
{
"epoch": 34.336283185840706,
"grad_norm": 10.962681770324707,
"learning_rate": 3.951719576719577e-05,
"loss": 0.1788,
"step": 970
},
{
"epoch": 34.690265486725664,
"grad_norm": 9.644322395324707,
"learning_rate": 3.935185185185186e-05,
"loss": 0.1562,
"step": 980
},
{
"epoch": 34.97345132743363,
"eval_accuracy": 0.815,
"eval_loss": 0.6886518597602844,
"eval_runtime": 5.7736,
"eval_samples_per_second": 69.281,
"eval_steps_per_second": 2.252,
"step": 988
},
{
"epoch": 35.04424778761062,
"grad_norm": 7.208992004394531,
"learning_rate": 3.918650793650794e-05,
"loss": 0.1779,
"step": 990
},
{
"epoch": 35.39823008849557,
"grad_norm": 15.789375305175781,
"learning_rate": 3.9021164021164026e-05,
"loss": 0.1652,
"step": 1000
},
{
"epoch": 35.75221238938053,
"grad_norm": 9.542279243469238,
"learning_rate": 3.885582010582011e-05,
"loss": 0.1689,
"step": 1010
},
{
"epoch": 36.0,
"eval_accuracy": 0.7975,
"eval_loss": 0.7189816236495972,
"eval_runtime": 5.7578,
"eval_samples_per_second": 69.47,
"eval_steps_per_second": 2.258,
"step": 1017
},
{
"epoch": 36.10619469026549,
"grad_norm": 12.757901191711426,
"learning_rate": 3.8690476190476195e-05,
"loss": 0.178,
"step": 1020
},
{
"epoch": 36.46017699115044,
"grad_norm": 8.646611213684082,
"learning_rate": 3.8525132275132275e-05,
"loss": 0.1613,
"step": 1030
},
{
"epoch": 36.8141592920354,
"grad_norm": 12.245649337768555,
"learning_rate": 3.835978835978836e-05,
"loss": 0.1886,
"step": 1040
},
{
"epoch": 36.991150442477874,
"eval_accuracy": 0.795,
"eval_loss": 0.7677862048149109,
"eval_runtime": 5.9107,
"eval_samples_per_second": 67.674,
"eval_steps_per_second": 2.199,
"step": 1045
},
{
"epoch": 37.16814159292036,
"grad_norm": 16.76270866394043,
"learning_rate": 3.8194444444444444e-05,
"loss": 0.1777,
"step": 1050
},
{
"epoch": 37.52212389380531,
"grad_norm": 12.296770095825195,
"learning_rate": 3.802910052910053e-05,
"loss": 0.182,
"step": 1060
},
{
"epoch": 37.876106194690266,
"grad_norm": 19.789642333984375,
"learning_rate": 3.786375661375662e-05,
"loss": 0.1887,
"step": 1070
},
{
"epoch": 37.982300884955755,
"eval_accuracy": 0.81,
"eval_loss": 0.7334153652191162,
"eval_runtime": 5.7675,
"eval_samples_per_second": 69.354,
"eval_steps_per_second": 2.254,
"step": 1073
},
{
"epoch": 38.230088495575224,
"grad_norm": 9.019023895263672,
"learning_rate": 3.76984126984127e-05,
"loss": 0.1377,
"step": 1080
},
{
"epoch": 38.584070796460175,
"grad_norm": 8.896963119506836,
"learning_rate": 3.753306878306879e-05,
"loss": 0.1751,
"step": 1090
},
{
"epoch": 38.93805309734513,
"grad_norm": 8.166861534118652,
"learning_rate": 3.736772486772487e-05,
"loss": 0.1531,
"step": 1100
},
{
"epoch": 38.97345132743363,
"eval_accuracy": 0.7925,
"eval_loss": 0.7359188199043274,
"eval_runtime": 5.8237,
"eval_samples_per_second": 68.685,
"eval_steps_per_second": 2.232,
"step": 1101
},
{
"epoch": 39.29203539823009,
"grad_norm": 11.483717918395996,
"learning_rate": 3.7202380952380956e-05,
"loss": 0.1703,
"step": 1110
},
{
"epoch": 39.64601769911504,
"grad_norm": 7.077027797698975,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.1376,
"step": 1120
},
{
"epoch": 40.0,
"grad_norm": 10.199649810791016,
"learning_rate": 3.6871693121693124e-05,
"loss": 0.1662,
"step": 1130
},
{
"epoch": 40.0,
"eval_accuracy": 0.8075,
"eval_loss": 0.7593528032302856,
"eval_runtime": 5.7627,
"eval_samples_per_second": 69.411,
"eval_steps_per_second": 2.256,
"step": 1130
},
{
"epoch": 40.35398230088496,
"grad_norm": 12.18074893951416,
"learning_rate": 3.6706349206349205e-05,
"loss": 0.1502,
"step": 1140
},
{
"epoch": 40.70796460176991,
"grad_norm": 13.122712135314941,
"learning_rate": 3.654100529100529e-05,
"loss": 0.1273,
"step": 1150
},
{
"epoch": 40.991150442477874,
"eval_accuracy": 0.81,
"eval_loss": 0.7341694831848145,
"eval_runtime": 5.7253,
"eval_samples_per_second": 69.866,
"eval_steps_per_second": 2.271,
"step": 1158
},
{
"epoch": 41.06194690265487,
"grad_norm": 12.024120330810547,
"learning_rate": 3.637566137566138e-05,
"loss": 0.1334,
"step": 1160
},
{
"epoch": 41.415929203539825,
"grad_norm": 13.965011596679688,
"learning_rate": 3.621031746031746e-05,
"loss": 0.1394,
"step": 1170
},
{
"epoch": 41.769911504424776,
"grad_norm": 17.121511459350586,
"learning_rate": 3.604497354497355e-05,
"loss": 0.1986,
"step": 1180
},
{
"epoch": 41.982300884955755,
"eval_accuracy": 0.805,
"eval_loss": 0.7780522108078003,
"eval_runtime": 5.9228,
"eval_samples_per_second": 67.536,
"eval_steps_per_second": 2.195,
"step": 1186
},
{
"epoch": 42.123893805309734,
"grad_norm": 11.281728744506836,
"learning_rate": 3.587962962962963e-05,
"loss": 0.1563,
"step": 1190
},
{
"epoch": 42.47787610619469,
"grad_norm": 6.99351167678833,
"learning_rate": 3.571428571428572e-05,
"loss": 0.1541,
"step": 1200
},
{
"epoch": 42.83185840707964,
"grad_norm": 12.189485549926758,
"learning_rate": 3.55489417989418e-05,
"loss": 0.1891,
"step": 1210
},
{
"epoch": 42.97345132743363,
"eval_accuracy": 0.8225,
"eval_loss": 0.7375677227973938,
"eval_runtime": 5.736,
"eval_samples_per_second": 69.735,
"eval_steps_per_second": 2.266,
"step": 1214
},
{
"epoch": 43.1858407079646,
"grad_norm": 7.004004955291748,
"learning_rate": 3.5383597883597885e-05,
"loss": 0.1332,
"step": 1220
},
{
"epoch": 43.53982300884956,
"grad_norm": 15.816964149475098,
"learning_rate": 3.521825396825397e-05,
"loss": 0.1231,
"step": 1230
},
{
"epoch": 43.89380530973451,
"grad_norm": 8.720746040344238,
"learning_rate": 3.5052910052910054e-05,
"loss": 0.1573,
"step": 1240
},
{
"epoch": 44.0,
"eval_accuracy": 0.815,
"eval_loss": 0.730354905128479,
"eval_runtime": 5.7537,
"eval_samples_per_second": 69.52,
"eval_steps_per_second": 2.259,
"step": 1243
},
{
"epoch": 44.24778761061947,
"grad_norm": 11.798288345336914,
"learning_rate": 3.488756613756614e-05,
"loss": 0.1337,
"step": 1250
},
{
"epoch": 44.60176991150443,
"grad_norm": 8.698972702026367,
"learning_rate": 3.472222222222222e-05,
"loss": 0.1536,
"step": 1260
},
{
"epoch": 44.95575221238938,
"grad_norm": 14.468975067138672,
"learning_rate": 3.455687830687831e-05,
"loss": 0.1536,
"step": 1270
},
{
"epoch": 44.991150442477874,
"eval_accuracy": 0.8,
"eval_loss": 0.777264416217804,
"eval_runtime": 5.7994,
"eval_samples_per_second": 68.972,
"eval_steps_per_second": 2.242,
"step": 1271
},
{
"epoch": 45.309734513274336,
"grad_norm": 5.724658966064453,
"learning_rate": 3.439153439153439e-05,
"loss": 0.1362,
"step": 1280
},
{
"epoch": 45.663716814159294,
"grad_norm": 15.177201271057129,
"learning_rate": 3.422619047619048e-05,
"loss": 0.1562,
"step": 1290
},
{
"epoch": 45.982300884955755,
"eval_accuracy": 0.8,
"eval_loss": 0.7622714042663574,
"eval_runtime": 5.6562,
"eval_samples_per_second": 70.719,
"eval_steps_per_second": 2.298,
"step": 1299
},
{
"epoch": 46.017699115044245,
"grad_norm": 14.262038230895996,
"learning_rate": 3.406084656084656e-05,
"loss": 0.1609,
"step": 1300
},
{
"epoch": 46.3716814159292,
"grad_norm": 10.63355541229248,
"learning_rate": 3.3895502645502647e-05,
"loss": 0.1297,
"step": 1310
},
{
"epoch": 46.72566371681416,
"grad_norm": 11.54215145111084,
"learning_rate": 3.3730158730158734e-05,
"loss": 0.1264,
"step": 1320
},
{
"epoch": 46.97345132743363,
"eval_accuracy": 0.7925,
"eval_loss": 0.8314040899276733,
"eval_runtime": 5.9374,
"eval_samples_per_second": 67.369,
"eval_steps_per_second": 2.189,
"step": 1327
},
{
"epoch": 47.07964601769911,
"grad_norm": 7.798260688781738,
"learning_rate": 3.3564814814814815e-05,
"loss": 0.1552,
"step": 1330
},
{
"epoch": 47.43362831858407,
"grad_norm": 10.773923873901367,
"learning_rate": 3.33994708994709e-05,
"loss": 0.1188,
"step": 1340
},
{
"epoch": 47.78761061946903,
"grad_norm": 5.990432262420654,
"learning_rate": 3.3234126984126983e-05,
"loss": 0.1596,
"step": 1350
},
{
"epoch": 48.0,
"eval_accuracy": 0.8175,
"eval_loss": 0.7830512523651123,
"eval_runtime": 5.8001,
"eval_samples_per_second": 68.964,
"eval_steps_per_second": 2.241,
"step": 1356
},
{
"epoch": 48.14159292035398,
"grad_norm": 11.099681854248047,
"learning_rate": 3.306878306878307e-05,
"loss": 0.1342,
"step": 1360
},
{
"epoch": 48.49557522123894,
"grad_norm": 7.706020355224609,
"learning_rate": 3.290343915343915e-05,
"loss": 0.1382,
"step": 1370
},
{
"epoch": 48.849557522123895,
"grad_norm": 5.685822010040283,
"learning_rate": 3.273809523809524e-05,
"loss": 0.1237,
"step": 1380
},
{
"epoch": 48.991150442477874,
"eval_accuracy": 0.8,
"eval_loss": 0.7949317097663879,
"eval_runtime": 5.6938,
"eval_samples_per_second": 70.252,
"eval_steps_per_second": 2.283,
"step": 1384
},
{
"epoch": 49.203539823008846,
"grad_norm": 10.283050537109375,
"learning_rate": 3.257275132275133e-05,
"loss": 0.1161,
"step": 1390
},
{
"epoch": 49.557522123893804,
"grad_norm": 6.40301513671875,
"learning_rate": 3.240740740740741e-05,
"loss": 0.1363,
"step": 1400
},
{
"epoch": 49.91150442477876,
"grad_norm": 17.212697982788086,
"learning_rate": 3.2242063492063495e-05,
"loss": 0.1355,
"step": 1410
},
{
"epoch": 49.982300884955755,
"eval_accuracy": 0.795,
"eval_loss": 0.781341552734375,
"eval_runtime": 5.872,
"eval_samples_per_second": 68.12,
"eval_steps_per_second": 2.214,
"step": 1412
},
{
"epoch": 50.26548672566372,
"grad_norm": 4.185158729553223,
"learning_rate": 3.2076719576719576e-05,
"loss": 0.1294,
"step": 1420
},
{
"epoch": 50.61946902654867,
"grad_norm": 10.888021469116211,
"learning_rate": 3.1911375661375664e-05,
"loss": 0.163,
"step": 1430
},
{
"epoch": 50.97345132743363,
"grad_norm": 8.630338668823242,
"learning_rate": 3.1746031746031745e-05,
"loss": 0.1251,
"step": 1440
},
{
"epoch": 50.97345132743363,
"eval_accuracy": 0.81,
"eval_loss": 0.7647480964660645,
"eval_runtime": 5.8333,
"eval_samples_per_second": 68.571,
"eval_steps_per_second": 2.229,
"step": 1440
},
{
"epoch": 51.32743362831859,
"grad_norm": 19.542953491210938,
"learning_rate": 3.158068783068783e-05,
"loss": 0.1289,
"step": 1450
},
{
"epoch": 51.68141592920354,
"grad_norm": 14.15263843536377,
"learning_rate": 3.141534391534391e-05,
"loss": 0.1181,
"step": 1460
},
{
"epoch": 52.0,
"eval_accuracy": 0.8175,
"eval_loss": 0.7552239894866943,
"eval_runtime": 6.0471,
"eval_samples_per_second": 66.148,
"eval_steps_per_second": 2.15,
"step": 1469
},
{
"epoch": 52.0353982300885,
"grad_norm": 8.696867942810059,
"learning_rate": 3.125e-05,
"loss": 0.1232,
"step": 1470
},
{
"epoch": 52.389380530973455,
"grad_norm": 15.115714073181152,
"learning_rate": 3.108465608465609e-05,
"loss": 0.128,
"step": 1480
},
{
"epoch": 52.743362831858406,
"grad_norm": 26.04839515686035,
"learning_rate": 3.091931216931217e-05,
"loss": 0.1224,
"step": 1490
},
{
"epoch": 52.991150442477874,
"eval_accuracy": 0.795,
"eval_loss": 0.8345909118652344,
"eval_runtime": 5.7656,
"eval_samples_per_second": 69.377,
"eval_steps_per_second": 2.255,
"step": 1497
},
{
"epoch": 53.097345132743364,
"grad_norm": 6.820947647094727,
"learning_rate": 3.075396825396826e-05,
"loss": 0.1088,
"step": 1500
},
{
"epoch": 53.45132743362832,
"grad_norm": 10.287090301513672,
"learning_rate": 3.058862433862434e-05,
"loss": 0.1094,
"step": 1510
},
{
"epoch": 53.80530973451327,
"grad_norm": 12.619726181030273,
"learning_rate": 3.0423280423280425e-05,
"loss": 0.1201,
"step": 1520
},
{
"epoch": 53.982300884955755,
"eval_accuracy": 0.7975,
"eval_loss": 0.7740535140037537,
"eval_runtime": 5.8431,
"eval_samples_per_second": 68.457,
"eval_steps_per_second": 2.225,
"step": 1525
},
{
"epoch": 54.15929203539823,
"grad_norm": 3.6249613761901855,
"learning_rate": 3.0257936507936506e-05,
"loss": 0.1278,
"step": 1530
},
{
"epoch": 54.51327433628319,
"grad_norm": 8.490986824035645,
"learning_rate": 3.0092592592592593e-05,
"loss": 0.1359,
"step": 1540
},
{
"epoch": 54.86725663716814,
"grad_norm": 8.873104095458984,
"learning_rate": 2.9927248677248678e-05,
"loss": 0.1109,
"step": 1550
},
{
"epoch": 54.97345132743363,
"eval_accuracy": 0.785,
"eval_loss": 0.772351861000061,
"eval_runtime": 5.8419,
"eval_samples_per_second": 68.47,
"eval_steps_per_second": 2.225,
"step": 1553
},
{
"epoch": 55.2212389380531,
"grad_norm": 3.0876898765563965,
"learning_rate": 2.9761904761904762e-05,
"loss": 0.1108,
"step": 1560
},
{
"epoch": 55.575221238938056,
"grad_norm": 6.528329372406006,
"learning_rate": 2.959656084656085e-05,
"loss": 0.1236,
"step": 1570
},
{
"epoch": 55.92920353982301,
"grad_norm": 11.930560111999512,
"learning_rate": 2.943121693121693e-05,
"loss": 0.1084,
"step": 1580
},
{
"epoch": 56.0,
"eval_accuracy": 0.805,
"eval_loss": 0.7904237508773804,
"eval_runtime": 5.7739,
"eval_samples_per_second": 69.277,
"eval_steps_per_second": 2.252,
"step": 1582
},
{
"epoch": 56.283185840707965,
"grad_norm": 6.980546951293945,
"learning_rate": 2.9265873015873018e-05,
"loss": 0.1144,
"step": 1590
},
{
"epoch": 56.63716814159292,
"grad_norm": 11.986266136169434,
"learning_rate": 2.91005291005291e-05,
"loss": 0.1084,
"step": 1600
},
{
"epoch": 56.991150442477874,
"grad_norm": 6.527588367462158,
"learning_rate": 2.8935185185185186e-05,
"loss": 0.1187,
"step": 1610
},
{
"epoch": 56.991150442477874,
"eval_accuracy": 0.8125,
"eval_loss": 0.7423826456069946,
"eval_runtime": 5.9051,
"eval_samples_per_second": 67.738,
"eval_steps_per_second": 2.201,
"step": 1610
},
{
"epoch": 57.34513274336283,
"grad_norm": 6.542221546173096,
"learning_rate": 2.876984126984127e-05,
"loss": 0.1191,
"step": 1620
},
{
"epoch": 57.69911504424779,
"grad_norm": 6.406174659729004,
"learning_rate": 2.8604497354497355e-05,
"loss": 0.0935,
"step": 1630
},
{
"epoch": 57.982300884955755,
"eval_accuracy": 0.815,
"eval_loss": 0.7410502433776855,
"eval_runtime": 6.0733,
"eval_samples_per_second": 65.862,
"eval_steps_per_second": 2.141,
"step": 1638
},
{
"epoch": 58.05309734513274,
"grad_norm": 5.907907485961914,
"learning_rate": 2.8439153439153442e-05,
"loss": 0.0776,
"step": 1640
},
{
"epoch": 58.4070796460177,
"grad_norm": 7.151067733764648,
"learning_rate": 2.8273809523809523e-05,
"loss": 0.1022,
"step": 1650
},
{
"epoch": 58.76106194690266,
"grad_norm": 6.808224201202393,
"learning_rate": 2.810846560846561e-05,
"loss": 0.1023,
"step": 1660
},
{
"epoch": 58.97345132743363,
"eval_accuracy": 0.81,
"eval_loss": 0.7475782632827759,
"eval_runtime": 5.7079,
"eval_samples_per_second": 70.078,
"eval_steps_per_second": 2.278,
"step": 1666
},
{
"epoch": 59.11504424778761,
"grad_norm": 7.42250919342041,
"learning_rate": 2.7943121693121695e-05,
"loss": 0.1206,
"step": 1670
},
{
"epoch": 59.469026548672566,
"grad_norm": 11.038393020629883,
"learning_rate": 2.777777777777778e-05,
"loss": 0.1148,
"step": 1680
},
{
"epoch": 59.823008849557525,
"grad_norm": 6.563925266265869,
"learning_rate": 2.7612433862433863e-05,
"loss": 0.1166,
"step": 1690
},
{
"epoch": 60.0,
"eval_accuracy": 0.8175,
"eval_loss": 0.7742089033126831,
"eval_runtime": 5.6896,
"eval_samples_per_second": 70.304,
"eval_steps_per_second": 2.285,
"step": 1695
},
{
"epoch": 60.176991150442475,
"grad_norm": 8.62690544128418,
"learning_rate": 2.7447089947089948e-05,
"loss": 0.1133,
"step": 1700
},
{
"epoch": 60.530973451327434,
"grad_norm": 8.701354026794434,
"learning_rate": 2.7281746031746032e-05,
"loss": 0.0961,
"step": 1710
},
{
"epoch": 60.88495575221239,
"grad_norm": 5.880581855773926,
"learning_rate": 2.7116402116402116e-05,
"loss": 0.099,
"step": 1720
},
{
"epoch": 60.991150442477874,
"eval_accuracy": 0.815,
"eval_loss": 0.7696540951728821,
"eval_runtime": 5.8242,
"eval_samples_per_second": 68.679,
"eval_steps_per_second": 2.232,
"step": 1723
},
{
"epoch": 61.23893805309734,
"grad_norm": 6.893290996551514,
"learning_rate": 2.6951058201058204e-05,
"loss": 0.0968,
"step": 1730
},
{
"epoch": 61.5929203539823,
"grad_norm": 9.383642196655273,
"learning_rate": 2.6785714285714288e-05,
"loss": 0.0927,
"step": 1740
},
{
"epoch": 61.94690265486726,
"grad_norm": 10.903635025024414,
"learning_rate": 2.6620370370370372e-05,
"loss": 0.1157,
"step": 1750
},
{
"epoch": 61.982300884955755,
"eval_accuracy": 0.8,
"eval_loss": 0.8537997603416443,
"eval_runtime": 5.862,
"eval_samples_per_second": 68.236,
"eval_steps_per_second": 2.218,
"step": 1751
},
{
"epoch": 62.30088495575221,
"grad_norm": 11.495246887207031,
"learning_rate": 2.6455026455026456e-05,
"loss": 0.1071,
"step": 1760
},
{
"epoch": 62.65486725663717,
"grad_norm": 8.735861778259277,
"learning_rate": 2.628968253968254e-05,
"loss": 0.1137,
"step": 1770
},
{
"epoch": 62.97345132743363,
"eval_accuracy": 0.8125,
"eval_loss": 0.8544909954071045,
"eval_runtime": 5.719,
"eval_samples_per_second": 69.942,
"eval_steps_per_second": 2.273,
"step": 1779
},
{
"epoch": 63.008849557522126,
"grad_norm": 9.511134147644043,
"learning_rate": 2.6124338624338625e-05,
"loss": 0.0962,
"step": 1780
},
{
"epoch": 63.36283185840708,
"grad_norm": 9.274235725402832,
"learning_rate": 2.5958994708994712e-05,
"loss": 0.09,
"step": 1790
},
{
"epoch": 63.716814159292035,
"grad_norm": 17.82819175720215,
"learning_rate": 2.5793650793650796e-05,
"loss": 0.094,
"step": 1800
},
{
"epoch": 64.0,
"eval_accuracy": 0.7925,
"eval_loss": 0.8463137745857239,
"eval_runtime": 5.7366,
"eval_samples_per_second": 69.727,
"eval_steps_per_second": 2.266,
"step": 1808
},
{
"epoch": 64.070796460177,
"grad_norm": 13.545799255371094,
"learning_rate": 2.562830687830688e-05,
"loss": 0.1186,
"step": 1810
},
{
"epoch": 64.42477876106194,
"grad_norm": 8.884902954101562,
"learning_rate": 2.5462962962962965e-05,
"loss": 0.1027,
"step": 1820
},
{
"epoch": 64.77876106194691,
"grad_norm": 9.229199409484863,
"learning_rate": 2.529761904761905e-05,
"loss": 0.1161,
"step": 1830
},
{
"epoch": 64.99115044247787,
"eval_accuracy": 0.81,
"eval_loss": 0.8350917100906372,
"eval_runtime": 5.7053,
"eval_samples_per_second": 70.11,
"eval_steps_per_second": 2.279,
"step": 1836
},
{
"epoch": 65.13274336283186,
"grad_norm": 4.687648296356201,
"learning_rate": 2.5132275132275137e-05,
"loss": 0.0954,
"step": 1840
},
{
"epoch": 65.48672566371681,
"grad_norm": 8.967106819152832,
"learning_rate": 2.496693121693122e-05,
"loss": 0.1102,
"step": 1850
},
{
"epoch": 65.84070796460178,
"grad_norm": 7.873806953430176,
"learning_rate": 2.4801587301587305e-05,
"loss": 0.08,
"step": 1860
},
{
"epoch": 65.98230088495575,
"eval_accuracy": 0.7925,
"eval_loss": 0.8610497117042542,
"eval_runtime": 5.7201,
"eval_samples_per_second": 69.929,
"eval_steps_per_second": 2.273,
"step": 1864
},
{
"epoch": 66.19469026548673,
"grad_norm": 7.836842060089111,
"learning_rate": 2.463624338624339e-05,
"loss": 0.1068,
"step": 1870
},
{
"epoch": 66.54867256637168,
"grad_norm": 8.14975357055664,
"learning_rate": 2.4470899470899473e-05,
"loss": 0.0787,
"step": 1880
},
{
"epoch": 66.90265486725664,
"grad_norm": 5.568120956420898,
"learning_rate": 2.4305555555555558e-05,
"loss": 0.0799,
"step": 1890
},
{
"epoch": 66.97345132743362,
"eval_accuracy": 0.8075,
"eval_loss": 0.8592771887779236,
"eval_runtime": 5.7815,
"eval_samples_per_second": 69.186,
"eval_steps_per_second": 2.249,
"step": 1892
},
{
"epoch": 67.2566371681416,
"grad_norm": 5.271720886230469,
"learning_rate": 2.4140211640211642e-05,
"loss": 0.0781,
"step": 1900
},
{
"epoch": 67.61061946902655,
"grad_norm": 3.000176429748535,
"learning_rate": 2.3974867724867726e-05,
"loss": 0.0927,
"step": 1910
},
{
"epoch": 67.96460176991151,
"grad_norm": 1.825054407119751,
"learning_rate": 2.380952380952381e-05,
"loss": 0.0783,
"step": 1920
},
{
"epoch": 68.0,
"eval_accuracy": 0.815,
"eval_loss": 0.8423022627830505,
"eval_runtime": 5.6528,
"eval_samples_per_second": 70.762,
"eval_steps_per_second": 2.3,
"step": 1921
},
{
"epoch": 68.31858407079646,
"grad_norm": 9.14376449584961,
"learning_rate": 2.3644179894179898e-05,
"loss": 0.1037,
"step": 1930
},
{
"epoch": 68.67256637168141,
"grad_norm": 11.533547401428223,
"learning_rate": 2.3478835978835982e-05,
"loss": 0.0851,
"step": 1940
},
{
"epoch": 68.99115044247787,
"eval_accuracy": 0.82,
"eval_loss": 0.8265037536621094,
"eval_runtime": 5.9247,
"eval_samples_per_second": 67.514,
"eval_steps_per_second": 2.194,
"step": 1949
},
{
"epoch": 69.02654867256638,
"grad_norm": 8.8108549118042,
"learning_rate": 2.3313492063492066e-05,
"loss": 0.1005,
"step": 1950
},
{
"epoch": 69.38053097345133,
"grad_norm": 7.721718788146973,
"learning_rate": 2.314814814814815e-05,
"loss": 0.0839,
"step": 1960
},
{
"epoch": 69.73451327433628,
"grad_norm": 9.032380104064941,
"learning_rate": 2.2982804232804235e-05,
"loss": 0.0775,
"step": 1970
},
{
"epoch": 69.98230088495575,
"eval_accuracy": 0.805,
"eval_loss": 0.8707697987556458,
"eval_runtime": 5.6882,
"eval_samples_per_second": 70.321,
"eval_steps_per_second": 2.285,
"step": 1977
},
{
"epoch": 70.08849557522124,
"grad_norm": 7.697175979614258,
"learning_rate": 2.281746031746032e-05,
"loss": 0.1005,
"step": 1980
},
{
"epoch": 70.4424778761062,
"grad_norm": 11.258265495300293,
"learning_rate": 2.2652116402116403e-05,
"loss": 0.1152,
"step": 1990
},
{
"epoch": 70.79646017699115,
"grad_norm": 6.715971946716309,
"learning_rate": 2.2486772486772487e-05,
"loss": 0.0902,
"step": 2000
},
{
"epoch": 70.97345132743362,
"eval_accuracy": 0.81,
"eval_loss": 0.8180540204048157,
"eval_runtime": 5.7974,
"eval_samples_per_second": 68.996,
"eval_steps_per_second": 2.242,
"step": 2005
},
{
"epoch": 71.15044247787611,
"grad_norm": 8.447669982910156,
"learning_rate": 2.2321428571428575e-05,
"loss": 0.0664,
"step": 2010
},
{
"epoch": 71.50442477876106,
"grad_norm": 10.469520568847656,
"learning_rate": 2.215608465608466e-05,
"loss": 0.0893,
"step": 2020
},
{
"epoch": 71.85840707964601,
"grad_norm": 6.971242904663086,
"learning_rate": 2.1990740740740743e-05,
"loss": 0.0904,
"step": 2030
},
{
"epoch": 72.0,
"eval_accuracy": 0.82,
"eval_loss": 0.8296825885772705,
"eval_runtime": 5.7586,
"eval_samples_per_second": 69.462,
"eval_steps_per_second": 2.258,
"step": 2034
},
{
"epoch": 72.21238938053098,
"grad_norm": 6.059171199798584,
"learning_rate": 2.1825396825396827e-05,
"loss": 0.0767,
"step": 2040
},
{
"epoch": 72.56637168141593,
"grad_norm": 9.134629249572754,
"learning_rate": 2.166005291005291e-05,
"loss": 0.0897,
"step": 2050
},
{
"epoch": 72.92035398230088,
"grad_norm": 6.5583415031433105,
"learning_rate": 2.1494708994708996e-05,
"loss": 0.0898,
"step": 2060
},
{
"epoch": 72.99115044247787,
"eval_accuracy": 0.82,
"eval_loss": 0.8464323282241821,
"eval_runtime": 5.7076,
"eval_samples_per_second": 70.082,
"eval_steps_per_second": 2.278,
"step": 2062
},
{
"epoch": 73.27433628318585,
"grad_norm": 12.021257400512695,
"learning_rate": 2.132936507936508e-05,
"loss": 0.1061,
"step": 2070
},
{
"epoch": 73.6283185840708,
"grad_norm": 4.3469367027282715,
"learning_rate": 2.1164021164021164e-05,
"loss": 0.0744,
"step": 2080
},
{
"epoch": 73.98230088495575,
"grad_norm": 9.244330406188965,
"learning_rate": 2.0998677248677252e-05,
"loss": 0.1013,
"step": 2090
},
{
"epoch": 73.98230088495575,
"eval_accuracy": 0.81,
"eval_loss": 0.8324652314186096,
"eval_runtime": 5.8164,
"eval_samples_per_second": 68.771,
"eval_steps_per_second": 2.235,
"step": 2090
},
{
"epoch": 74.33628318584071,
"grad_norm": 4.678669452667236,
"learning_rate": 2.0833333333333336e-05,
"loss": 0.1001,
"step": 2100
},
{
"epoch": 74.69026548672566,
"grad_norm": 9.78695297241211,
"learning_rate": 2.066798941798942e-05,
"loss": 0.0726,
"step": 2110
},
{
"epoch": 74.97345132743362,
"eval_accuracy": 0.8,
"eval_loss": 0.8771929144859314,
"eval_runtime": 5.7555,
"eval_samples_per_second": 69.499,
"eval_steps_per_second": 2.259,
"step": 2118
},
{
"epoch": 75.04424778761062,
"grad_norm": 3.598501443862915,
"learning_rate": 2.0502645502645504e-05,
"loss": 0.0975,
"step": 2120
},
{
"epoch": 75.39823008849558,
"grad_norm": 10.049243927001953,
"learning_rate": 2.033730158730159e-05,
"loss": 0.0858,
"step": 2130
},
{
"epoch": 75.75221238938053,
"grad_norm": 10.887565612792969,
"learning_rate": 2.0171957671957673e-05,
"loss": 0.0745,
"step": 2140
},
{
"epoch": 76.0,
"eval_accuracy": 0.8125,
"eval_loss": 0.8505265712738037,
"eval_runtime": 5.6825,
"eval_samples_per_second": 70.391,
"eval_steps_per_second": 2.288,
"step": 2147
},
{
"epoch": 76.10619469026548,
"grad_norm": 9.399345397949219,
"learning_rate": 2.0006613756613757e-05,
"loss": 0.0909,
"step": 2150
},
{
"epoch": 76.46017699115045,
"grad_norm": 3.6555662155151367,
"learning_rate": 1.984126984126984e-05,
"loss": 0.0799,
"step": 2160
},
{
"epoch": 76.8141592920354,
"grad_norm": 5.8622894287109375,
"learning_rate": 1.967592592592593e-05,
"loss": 0.0891,
"step": 2170
},
{
"epoch": 76.99115044247787,
"eval_accuracy": 0.81,
"eval_loss": 0.8693811297416687,
"eval_runtime": 5.7342,
"eval_samples_per_second": 69.757,
"eval_steps_per_second": 2.267,
"step": 2175
},
{
"epoch": 77.16814159292035,
"grad_norm": 3.6685290336608887,
"learning_rate": 1.9510582010582013e-05,
"loss": 0.0909,
"step": 2180
},
{
"epoch": 77.52212389380531,
"grad_norm": 9.96608829498291,
"learning_rate": 1.9345238095238097e-05,
"loss": 0.0962,
"step": 2190
},
{
"epoch": 77.87610619469027,
"grad_norm": 7.857000827789307,
"learning_rate": 1.917989417989418e-05,
"loss": 0.0791,
"step": 2200
},
{
"epoch": 77.98230088495575,
"eval_accuracy": 0.81,
"eval_loss": 0.8765752911567688,
"eval_runtime": 5.7248,
"eval_samples_per_second": 69.872,
"eval_steps_per_second": 2.271,
"step": 2203
},
{
"epoch": 78.23008849557522,
"grad_norm": 10.403280258178711,
"learning_rate": 1.9014550264550266e-05,
"loss": 0.0622,
"step": 2210
},
{
"epoch": 78.58407079646018,
"grad_norm": 6.770401477813721,
"learning_rate": 1.884920634920635e-05,
"loss": 0.0689,
"step": 2220
},
{
"epoch": 78.93805309734513,
"grad_norm": 10.228433609008789,
"learning_rate": 1.8683862433862434e-05,
"loss": 0.0639,
"step": 2230
},
{
"epoch": 78.97345132743362,
"eval_accuracy": 0.8125,
"eval_loss": 0.8461715579032898,
"eval_runtime": 6.0121,
"eval_samples_per_second": 66.532,
"eval_steps_per_second": 2.162,
"step": 2231
},
{
"epoch": 79.29203539823008,
"grad_norm": 6.610928535461426,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.0705,
"step": 2240
},
{
"epoch": 79.64601769911505,
"grad_norm": 9.066596984863281,
"learning_rate": 1.8353174603174602e-05,
"loss": 0.0795,
"step": 2250
},
{
"epoch": 80.0,
"grad_norm": 11.537944793701172,
"learning_rate": 1.818783068783069e-05,
"loss": 0.0676,
"step": 2260
},
{
"epoch": 80.0,
"eval_accuracy": 0.8075,
"eval_loss": 0.8991250395774841,
"eval_runtime": 5.7259,
"eval_samples_per_second": 69.858,
"eval_steps_per_second": 2.27,
"step": 2260
},
{
"epoch": 80.35398230088495,
"grad_norm": 5.221861839294434,
"learning_rate": 1.8022486772486774e-05,
"loss": 0.0932,
"step": 2270
},
{
"epoch": 80.70796460176992,
"grad_norm": 6.904067039489746,
"learning_rate": 1.785714285714286e-05,
"loss": 0.0904,
"step": 2280
},
{
"epoch": 80.99115044247787,
"eval_accuracy": 0.815,
"eval_loss": 0.8550813794136047,
"eval_runtime": 5.7321,
"eval_samples_per_second": 69.782,
"eval_steps_per_second": 2.268,
"step": 2288
},
{
"epoch": 81.06194690265487,
"grad_norm": 10.940287590026855,
"learning_rate": 1.7691798941798943e-05,
"loss": 0.0942,
"step": 2290
},
{
"epoch": 81.41592920353982,
"grad_norm": 4.109130382537842,
"learning_rate": 1.7526455026455027e-05,
"loss": 0.0791,
"step": 2300
},
{
"epoch": 81.76991150442478,
"grad_norm": 10.844812393188477,
"learning_rate": 1.736111111111111e-05,
"loss": 0.0788,
"step": 2310
},
{
"epoch": 81.98230088495575,
"eval_accuracy": 0.795,
"eval_loss": 0.9301651120185852,
"eval_runtime": 5.6474,
"eval_samples_per_second": 70.829,
"eval_steps_per_second": 2.302,
"step": 2316
},
{
"epoch": 82.12389380530973,
"grad_norm": 3.5678551197052,
"learning_rate": 1.7195767195767195e-05,
"loss": 0.0603,
"step": 2320
},
{
"epoch": 82.47787610619469,
"grad_norm": 7.562065124511719,
"learning_rate": 1.703042328042328e-05,
"loss": 0.093,
"step": 2330
},
{
"epoch": 82.83185840707965,
"grad_norm": 10.092254638671875,
"learning_rate": 1.6865079365079367e-05,
"loss": 0.0787,
"step": 2340
},
{
"epoch": 82.97345132743362,
"eval_accuracy": 0.8025,
"eval_loss": 0.8706057071685791,
"eval_runtime": 5.8345,
"eval_samples_per_second": 68.558,
"eval_steps_per_second": 2.228,
"step": 2344
},
{
"epoch": 83.1858407079646,
"grad_norm": 4.765111923217773,
"learning_rate": 1.669973544973545e-05,
"loss": 0.0579,
"step": 2350
},
{
"epoch": 83.53982300884955,
"grad_norm": 6.426796913146973,
"learning_rate": 1.6534391534391536e-05,
"loss": 0.0697,
"step": 2360
},
{
"epoch": 83.89380530973452,
"grad_norm": 7.382542610168457,
"learning_rate": 1.636904761904762e-05,
"loss": 0.0918,
"step": 2370
},
{
"epoch": 84.0,
"eval_accuracy": 0.805,
"eval_loss": 0.868044912815094,
"eval_runtime": 5.7723,
"eval_samples_per_second": 69.297,
"eval_steps_per_second": 2.252,
"step": 2373
},
{
"epoch": 84.24778761061947,
"grad_norm": 5.388473987579346,
"learning_rate": 1.6203703703703704e-05,
"loss": 0.0752,
"step": 2380
},
{
"epoch": 84.60176991150442,
"grad_norm": 6.751432418823242,
"learning_rate": 1.6038359788359788e-05,
"loss": 0.0671,
"step": 2390
},
{
"epoch": 84.95575221238938,
"grad_norm": 8.372601509094238,
"learning_rate": 1.5873015873015872e-05,
"loss": 0.0681,
"step": 2400
},
{
"epoch": 84.99115044247787,
"eval_accuracy": 0.8125,
"eval_loss": 0.8481296300888062,
"eval_runtime": 5.711,
"eval_samples_per_second": 70.04,
"eval_steps_per_second": 2.276,
"step": 2401
},
{
"epoch": 85.30973451327434,
"grad_norm": 6.79984712600708,
"learning_rate": 1.5707671957671957e-05,
"loss": 0.0634,
"step": 2410
},
{
"epoch": 85.66371681415929,
"grad_norm": 9.60888671875,
"learning_rate": 1.5542328042328044e-05,
"loss": 0.115,
"step": 2420
},
{
"epoch": 85.98230088495575,
"eval_accuracy": 0.8025,
"eval_loss": 0.8552606105804443,
"eval_runtime": 6.0227,
"eval_samples_per_second": 66.415,
"eval_steps_per_second": 2.158,
"step": 2429
},
{
"epoch": 86.01769911504425,
"grad_norm": 9.006217956542969,
"learning_rate": 1.537698412698413e-05,
"loss": 0.0741,
"step": 2430
},
{
"epoch": 86.3716814159292,
"grad_norm": 8.767806053161621,
"learning_rate": 1.5211640211640213e-05,
"loss": 0.0652,
"step": 2440
},
{
"epoch": 86.72566371681415,
"grad_norm": 6.8285675048828125,
"learning_rate": 1.5046296296296297e-05,
"loss": 0.0599,
"step": 2450
},
{
"epoch": 86.97345132743362,
"eval_accuracy": 0.805,
"eval_loss": 0.8886809349060059,
"eval_runtime": 5.6679,
"eval_samples_per_second": 70.573,
"eval_steps_per_second": 2.294,
"step": 2457
},
{
"epoch": 87.07964601769912,
"grad_norm": 7.630321502685547,
"learning_rate": 1.4880952380952381e-05,
"loss": 0.0805,
"step": 2460
},
{
"epoch": 87.43362831858407,
"grad_norm": 8.364428520202637,
"learning_rate": 1.4715608465608465e-05,
"loss": 0.0743,
"step": 2470
},
{
"epoch": 87.78761061946902,
"grad_norm": 8.15882682800293,
"learning_rate": 1.455026455026455e-05,
"loss": 0.0774,
"step": 2480
},
{
"epoch": 88.0,
"eval_accuracy": 0.81,
"eval_loss": 0.9255210161209106,
"eval_runtime": 5.684,
"eval_samples_per_second": 70.374,
"eval_steps_per_second": 2.287,
"step": 2486
},
{
"epoch": 88.14159292035399,
"grad_norm": 3.7120983600616455,
"learning_rate": 1.4384920634920635e-05,
"loss": 0.0635,
"step": 2490
},
{
"epoch": 88.49557522123894,
"grad_norm": 3.117091655731201,
"learning_rate": 1.4219576719576721e-05,
"loss": 0.0722,
"step": 2500
},
{
"epoch": 88.84955752212389,
"grad_norm": 4.884605407714844,
"learning_rate": 1.4054232804232805e-05,
"loss": 0.0701,
"step": 2510
},
{
"epoch": 88.99115044247787,
"eval_accuracy": 0.81,
"eval_loss": 0.8794758319854736,
"eval_runtime": 5.697,
"eval_samples_per_second": 70.213,
"eval_steps_per_second": 2.282,
"step": 2514
},
{
"epoch": 89.20353982300885,
"grad_norm": 6.620547294616699,
"learning_rate": 1.388888888888889e-05,
"loss": 0.0626,
"step": 2520
},
{
"epoch": 89.5575221238938,
"grad_norm": 6.801345348358154,
"learning_rate": 1.3723544973544974e-05,
"loss": 0.0682,
"step": 2530
},
{
"epoch": 89.91150442477876,
"grad_norm": 5.619492053985596,
"learning_rate": 1.3558201058201058e-05,
"loss": 0.074,
"step": 2540
},
{
"epoch": 89.98230088495575,
"eval_accuracy": 0.8175,
"eval_loss": 0.8634124994277954,
"eval_runtime": 5.7593,
"eval_samples_per_second": 69.453,
"eval_steps_per_second": 2.257,
"step": 2542
},
{
"epoch": 90.26548672566372,
"grad_norm": 10.153610229492188,
"learning_rate": 1.3392857142857144e-05,
"loss": 0.0794,
"step": 2550
},
{
"epoch": 90.61946902654867,
"grad_norm": 5.089029312133789,
"learning_rate": 1.3227513227513228e-05,
"loss": 0.0611,
"step": 2560
},
{
"epoch": 90.97345132743362,
"grad_norm": 6.003979206085205,
"learning_rate": 1.3062169312169312e-05,
"loss": 0.0497,
"step": 2570
},
{
"epoch": 90.97345132743362,
"eval_accuracy": 0.82,
"eval_loss": 0.8793442249298096,
"eval_runtime": 6.0414,
"eval_samples_per_second": 66.21,
"eval_steps_per_second": 2.152,
"step": 2570
},
{
"epoch": 91.32743362831859,
"grad_norm": 6.3248090744018555,
"learning_rate": 1.2896825396825398e-05,
"loss": 0.0557,
"step": 2580
},
{
"epoch": 91.68141592920354,
"grad_norm": 4.216904640197754,
"learning_rate": 1.2731481481481482e-05,
"loss": 0.0569,
"step": 2590
},
{
"epoch": 92.0,
"eval_accuracy": 0.7925,
"eval_loss": 0.9006764888763428,
"eval_runtime": 5.7399,
"eval_samples_per_second": 69.688,
"eval_steps_per_second": 2.265,
"step": 2599
},
{
"epoch": 92.03539823008849,
"grad_norm": 6.895070552825928,
"learning_rate": 1.2566137566137568e-05,
"loss": 0.0769,
"step": 2600
},
{
"epoch": 92.38938053097345,
"grad_norm": 3.7500102519989014,
"learning_rate": 1.2400793650793652e-05,
"loss": 0.0543,
"step": 2610
},
{
"epoch": 92.7433628318584,
"grad_norm": 10.693458557128906,
"learning_rate": 1.2235449735449737e-05,
"loss": 0.0722,
"step": 2620
},
{
"epoch": 92.99115044247787,
"eval_accuracy": 0.815,
"eval_loss": 0.8700942993164062,
"eval_runtime": 5.6593,
"eval_samples_per_second": 70.681,
"eval_steps_per_second": 2.297,
"step": 2627
},
{
"epoch": 93.09734513274336,
"grad_norm": 5.407220363616943,
"learning_rate": 1.2070105820105821e-05,
"loss": 0.0822,
"step": 2630
},
{
"epoch": 93.45132743362832,
"grad_norm": 2.9195971488952637,
"learning_rate": 1.1904761904761905e-05,
"loss": 0.0624,
"step": 2640
},
{
"epoch": 93.80530973451327,
"grad_norm": 7.752827167510986,
"learning_rate": 1.1739417989417991e-05,
"loss": 0.0674,
"step": 2650
},
{
"epoch": 93.98230088495575,
"eval_accuracy": 0.8225,
"eval_loss": 0.8879609704017639,
"eval_runtime": 5.7797,
"eval_samples_per_second": 69.208,
"eval_steps_per_second": 2.249,
"step": 2655
},
{
"epoch": 94.15929203539822,
"grad_norm": 6.314310550689697,
"learning_rate": 1.1574074074074075e-05,
"loss": 0.0685,
"step": 2660
},
{
"epoch": 94.51327433628319,
"grad_norm": 9.070263862609863,
"learning_rate": 1.140873015873016e-05,
"loss": 0.0649,
"step": 2670
},
{
"epoch": 94.86725663716814,
"grad_norm": 4.211071968078613,
"learning_rate": 1.1243386243386244e-05,
"loss": 0.0643,
"step": 2680
},
{
"epoch": 94.97345132743362,
"eval_accuracy": 0.8075,
"eval_loss": 0.8854994177818298,
"eval_runtime": 5.6723,
"eval_samples_per_second": 70.518,
"eval_steps_per_second": 2.292,
"step": 2683
},
{
"epoch": 95.22123893805309,
"grad_norm": 6.701349258422852,
"learning_rate": 1.107804232804233e-05,
"loss": 0.054,
"step": 2690
},
{
"epoch": 95.57522123893806,
"grad_norm": 9.44869327545166,
"learning_rate": 1.0912698412698414e-05,
"loss": 0.0594,
"step": 2700
},
{
"epoch": 95.929203539823,
"grad_norm": 5.749889373779297,
"learning_rate": 1.0747354497354498e-05,
"loss": 0.0583,
"step": 2710
},
{
"epoch": 96.0,
"eval_accuracy": 0.815,
"eval_loss": 0.8918322920799255,
"eval_runtime": 5.8266,
"eval_samples_per_second": 68.651,
"eval_steps_per_second": 2.231,
"step": 2712
},
{
"epoch": 96.28318584070796,
"grad_norm": 9.790497779846191,
"learning_rate": 1.0582010582010582e-05,
"loss": 0.0731,
"step": 2720
},
{
"epoch": 96.63716814159292,
"grad_norm": 10.20504379272461,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.067,
"step": 2730
},
{
"epoch": 96.99115044247787,
"grad_norm": 9.304971694946289,
"learning_rate": 1.0251322751322752e-05,
"loss": 0.0558,
"step": 2740
},
{
"epoch": 96.99115044247787,
"eval_accuracy": 0.8275,
"eval_loss": 0.8735535144805908,
"eval_runtime": 5.748,
"eval_samples_per_second": 69.59,
"eval_steps_per_second": 2.262,
"step": 2740
},
{
"epoch": 97.34513274336283,
"grad_norm": 2.155658483505249,
"learning_rate": 1.0085978835978836e-05,
"loss": 0.0952,
"step": 2750
},
{
"epoch": 97.69911504424779,
"grad_norm": 4.216080665588379,
"learning_rate": 9.92063492063492e-06,
"loss": 0.0622,
"step": 2760
},
{
"epoch": 97.98230088495575,
"eval_accuracy": 0.815,
"eval_loss": 0.905790388584137,
"eval_runtime": 6.0004,
"eval_samples_per_second": 66.662,
"eval_steps_per_second": 2.167,
"step": 2768
},
{
"epoch": 98.05309734513274,
"grad_norm": 5.462803840637207,
"learning_rate": 9.755291005291007e-06,
"loss": 0.0576,
"step": 2770
},
{
"epoch": 98.40707964601769,
"grad_norm": 6.204135894775391,
"learning_rate": 9.58994708994709e-06,
"loss": 0.0871,
"step": 2780
},
{
"epoch": 98.76106194690266,
"grad_norm": 7.025479316711426,
"learning_rate": 9.424603174603175e-06,
"loss": 0.0689,
"step": 2790
},
{
"epoch": 98.97345132743362,
"eval_accuracy": 0.8075,
"eval_loss": 0.9006683230400085,
"eval_runtime": 5.6446,
"eval_samples_per_second": 70.864,
"eval_steps_per_second": 2.303,
"step": 2796
},
{
"epoch": 99.11504424778761,
"grad_norm": 12.175108909606934,
"learning_rate": 9.259259259259259e-06,
"loss": 0.0653,
"step": 2800
},
{
"epoch": 99.46902654867256,
"grad_norm": 2.277102470397949,
"learning_rate": 9.093915343915345e-06,
"loss": 0.0533,
"step": 2810
},
{
"epoch": 99.82300884955752,
"grad_norm": 3.374624252319336,
"learning_rate": 8.92857142857143e-06,
"loss": 0.0782,
"step": 2820
},
{
"epoch": 100.0,
"eval_accuracy": 0.8025,
"eval_loss": 0.9216282367706299,
"eval_runtime": 5.6935,
"eval_samples_per_second": 70.255,
"eval_steps_per_second": 2.283,
"step": 2825
},
{
"epoch": 100.17699115044248,
"grad_norm": 4.110525131225586,
"learning_rate": 8.763227513227513e-06,
"loss": 0.0678,
"step": 2830
},
{
"epoch": 100.53097345132744,
"grad_norm": 1.8253668546676636,
"learning_rate": 8.597883597883598e-06,
"loss": 0.0506,
"step": 2840
},
{
"epoch": 100.88495575221239,
"grad_norm": 5.427482604980469,
"learning_rate": 8.432539682539684e-06,
"loss": 0.0696,
"step": 2850
},
{
"epoch": 100.99115044247787,
"eval_accuracy": 0.8075,
"eval_loss": 0.9158985614776611,
"eval_runtime": 5.7392,
"eval_samples_per_second": 69.696,
"eval_steps_per_second": 2.265,
"step": 2853
},
{
"epoch": 101.23893805309734,
"grad_norm": 11.669960975646973,
"learning_rate": 8.267195767195768e-06,
"loss": 0.0424,
"step": 2860
},
{
"epoch": 101.59292035398231,
"grad_norm": 3.504544496536255,
"learning_rate": 8.101851851851852e-06,
"loss": 0.0676,
"step": 2870
},
{
"epoch": 101.94690265486726,
"grad_norm": 4.86131477355957,
"learning_rate": 7.936507936507936e-06,
"loss": 0.0554,
"step": 2880
},
{
"epoch": 101.98230088495575,
"eval_accuracy": 0.8125,
"eval_loss": 0.9194761514663696,
"eval_runtime": 5.7011,
"eval_samples_per_second": 70.162,
"eval_steps_per_second": 2.28,
"step": 2881
},
{
"epoch": 102.30088495575221,
"grad_norm": 10.070198059082031,
"learning_rate": 7.771164021164022e-06,
"loss": 0.0784,
"step": 2890
},
{
"epoch": 102.65486725663717,
"grad_norm": 6.48520040512085,
"learning_rate": 7.605820105820106e-06,
"loss": 0.0585,
"step": 2900
},
{
"epoch": 102.97345132743362,
"eval_accuracy": 0.8125,
"eval_loss": 0.9314340949058533,
"eval_runtime": 5.7273,
"eval_samples_per_second": 69.841,
"eval_steps_per_second": 2.27,
"step": 2909
},
{
"epoch": 103.00884955752213,
"grad_norm": 3.9195964336395264,
"learning_rate": 7.4404761904761905e-06,
"loss": 0.068,
"step": 2910
},
{
"epoch": 103.36283185840708,
"grad_norm": 7.709494590759277,
"learning_rate": 7.275132275132275e-06,
"loss": 0.0555,
"step": 2920
},
{
"epoch": 103.71681415929204,
"grad_norm": 5.181375026702881,
"learning_rate": 7.1097883597883606e-06,
"loss": 0.0541,
"step": 2930
},
{
"epoch": 104.0,
"eval_accuracy": 0.825,
"eval_loss": 0.893872082233429,
"eval_runtime": 5.7025,
"eval_samples_per_second": 70.145,
"eval_steps_per_second": 2.28,
"step": 2938
},
{
"epoch": 104.070796460177,
"grad_norm": 11.922988891601562,
"learning_rate": 6.944444444444445e-06,
"loss": 0.0656,
"step": 2940
},
{
"epoch": 104.42477876106194,
"grad_norm": 6.153034687042236,
"learning_rate": 6.779100529100529e-06,
"loss": 0.0668,
"step": 2950
},
{
"epoch": 104.77876106194691,
"grad_norm": 7.586311340332031,
"learning_rate": 6.613756613756614e-06,
"loss": 0.0636,
"step": 2960
},
{
"epoch": 104.99115044247787,
"eval_accuracy": 0.8025,
"eval_loss": 0.9045028686523438,
"eval_runtime": 5.7995,
"eval_samples_per_second": 68.972,
"eval_steps_per_second": 2.242,
"step": 2966
},
{
"epoch": 105.13274336283186,
"grad_norm": 7.499364852905273,
"learning_rate": 6.448412698412699e-06,
"loss": 0.0578,
"step": 2970
},
{
"epoch": 105.48672566371681,
"grad_norm": 5.914554595947266,
"learning_rate": 6.283068783068784e-06,
"loss": 0.0683,
"step": 2980
},
{
"epoch": 105.84070796460178,
"grad_norm": 5.446691513061523,
"learning_rate": 6.117724867724868e-06,
"loss": 0.0684,
"step": 2990
},
{
"epoch": 105.98230088495575,
"eval_accuracy": 0.8075,
"eval_loss": 0.8892062306404114,
"eval_runtime": 5.7229,
"eval_samples_per_second": 69.895,
"eval_steps_per_second": 2.272,
"step": 2994
},
{
"epoch": 106.19469026548673,
"grad_norm": 5.555883407592773,
"learning_rate": 5.9523809523809525e-06,
"loss": 0.0629,
"step": 3000
},
{
"epoch": 106.54867256637168,
"grad_norm": 6.0038042068481445,
"learning_rate": 5.787037037037038e-06,
"loss": 0.0637,
"step": 3010
},
{
"epoch": 106.90265486725664,
"grad_norm": 4.660765647888184,
"learning_rate": 5.621693121693122e-06,
"loss": 0.0608,
"step": 3020
},
{
"epoch": 106.97345132743362,
"eval_accuracy": 0.8075,
"eval_loss": 0.8998861908912659,
"eval_runtime": 5.6896,
"eval_samples_per_second": 70.303,
"eval_steps_per_second": 2.285,
"step": 3022
},
{
"epoch": 107.2566371681416,
"grad_norm": 2.3250956535339355,
"learning_rate": 5.456349206349207e-06,
"loss": 0.065,
"step": 3030
},
{
"epoch": 107.61061946902655,
"grad_norm": 3.4695749282836914,
"learning_rate": 5.291005291005291e-06,
"loss": 0.0614,
"step": 3040
},
{
"epoch": 107.96460176991151,
"grad_norm": 1.7842631340026855,
"learning_rate": 5.125661375661376e-06,
"loss": 0.0663,
"step": 3050
},
{
"epoch": 108.0,
"eval_accuracy": 0.8075,
"eval_loss": 0.9033371210098267,
"eval_runtime": 6.1003,
"eval_samples_per_second": 65.571,
"eval_steps_per_second": 2.131,
"step": 3051
},
{
"epoch": 108.31858407079646,
"grad_norm": 4.970130920410156,
"learning_rate": 4.96031746031746e-06,
"loss": 0.0466,
"step": 3060
},
{
"epoch": 108.67256637168141,
"grad_norm": 6.935737609863281,
"learning_rate": 4.794973544973545e-06,
"loss": 0.054,
"step": 3070
},
{
"epoch": 108.99115044247787,
"eval_accuracy": 0.805,
"eval_loss": 0.9248512387275696,
"eval_runtime": 5.6759,
"eval_samples_per_second": 70.473,
"eval_steps_per_second": 2.29,
"step": 3079
},
{
"epoch": 109.02654867256638,
"grad_norm": 1.8442103862762451,
"learning_rate": 4.6296296296296296e-06,
"loss": 0.0681,
"step": 3080
},
{
"epoch": 109.38053097345133,
"grad_norm": 6.584358215332031,
"learning_rate": 4.464285714285715e-06,
"loss": 0.0551,
"step": 3090
},
{
"epoch": 109.73451327433628,
"grad_norm": 2.5174403190612793,
"learning_rate": 4.298941798941799e-06,
"loss": 0.0538,
"step": 3100
},
{
"epoch": 109.98230088495575,
"eval_accuracy": 0.81,
"eval_loss": 0.9065310955047607,
"eval_runtime": 5.7721,
"eval_samples_per_second": 69.299,
"eval_steps_per_second": 2.252,
"step": 3107
},
{
"epoch": 110.08849557522124,
"grad_norm": 3.406076669692993,
"learning_rate": 4.133597883597884e-06,
"loss": 0.0768,
"step": 3110
},
{
"epoch": 110.4424778761062,
"grad_norm": 3.832432746887207,
"learning_rate": 3.968253968253968e-06,
"loss": 0.0674,
"step": 3120
},
{
"epoch": 110.79646017699115,
"grad_norm": 2.3623971939086914,
"learning_rate": 3.802910052910053e-06,
"loss": 0.0696,
"step": 3130
},
{
"epoch": 110.97345132743362,
"eval_accuracy": 0.8175,
"eval_loss": 0.9002352952957153,
"eval_runtime": 5.823,
"eval_samples_per_second": 68.693,
"eval_steps_per_second": 2.233,
"step": 3135
},
{
"epoch": 111.15044247787611,
"grad_norm": 8.914955139160156,
"learning_rate": 3.6375661375661373e-06,
"loss": 0.059,
"step": 3140
},
{
"epoch": 111.50442477876106,
"grad_norm": 6.295753479003906,
"learning_rate": 3.4722222222222224e-06,
"loss": 0.0496,
"step": 3150
},
{
"epoch": 111.85840707964601,
"grad_norm": 4.604882717132568,
"learning_rate": 3.306878306878307e-06,
"loss": 0.0585,
"step": 3160
},
{
"epoch": 112.0,
"eval_accuracy": 0.8025,
"eval_loss": 0.9105786085128784,
"eval_runtime": 5.6843,
"eval_samples_per_second": 70.369,
"eval_steps_per_second": 2.287,
"step": 3164
},
{
"epoch": 112.21238938053098,
"grad_norm": 6.111964702606201,
"learning_rate": 3.141534391534392e-06,
"loss": 0.0387,
"step": 3170
},
{
"epoch": 112.56637168141593,
"grad_norm": 9.29433536529541,
"learning_rate": 2.9761904761904763e-06,
"loss": 0.0623,
"step": 3180
},
{
"epoch": 112.92035398230088,
"grad_norm": 10.9270601272583,
"learning_rate": 2.810846560846561e-06,
"loss": 0.0641,
"step": 3190
},
{
"epoch": 112.99115044247787,
"eval_accuracy": 0.81,
"eval_loss": 0.9088242053985596,
"eval_runtime": 5.7312,
"eval_samples_per_second": 69.794,
"eval_steps_per_second": 2.268,
"step": 3192
},
{
"epoch": 113.27433628318585,
"grad_norm": 9.549851417541504,
"learning_rate": 2.6455026455026455e-06,
"loss": 0.0555,
"step": 3200
},
{
"epoch": 113.6283185840708,
"grad_norm": 7.800489902496338,
"learning_rate": 2.48015873015873e-06,
"loss": 0.0527,
"step": 3210
},
{
"epoch": 113.98230088495575,
"grad_norm": 4.22268533706665,
"learning_rate": 2.3148148148148148e-06,
"loss": 0.0611,
"step": 3220
},
{
"epoch": 113.98230088495575,
"eval_accuracy": 0.8075,
"eval_loss": 0.915170431137085,
"eval_runtime": 5.6521,
"eval_samples_per_second": 70.77,
"eval_steps_per_second": 2.3,
"step": 3220
},
{
"epoch": 114.33628318584071,
"grad_norm": 6.939643383026123,
"learning_rate": 2.1494708994708994e-06,
"loss": 0.092,
"step": 3230
},
{
"epoch": 114.69026548672566,
"grad_norm": 4.18217658996582,
"learning_rate": 1.984126984126984e-06,
"loss": 0.0528,
"step": 3240
},
{
"epoch": 114.97345132743362,
"eval_accuracy": 0.8125,
"eval_loss": 0.9140109419822693,
"eval_runtime": 5.9093,
"eval_samples_per_second": 67.69,
"eval_steps_per_second": 2.2,
"step": 3248
},
{
"epoch": 115.04424778761062,
"grad_norm": 2.470015287399292,
"learning_rate": 1.8187830687830687e-06,
"loss": 0.0664,
"step": 3250
},
{
"epoch": 115.39823008849558,
"grad_norm": 5.625057220458984,
"learning_rate": 1.6534391534391535e-06,
"loss": 0.0567,
"step": 3260
},
{
"epoch": 115.75221238938053,
"grad_norm": 8.182403564453125,
"learning_rate": 1.4880952380952381e-06,
"loss": 0.0631,
"step": 3270
},
{
"epoch": 116.0,
"eval_accuracy": 0.81,
"eval_loss": 0.9184489250183105,
"eval_runtime": 5.7237,
"eval_samples_per_second": 69.885,
"eval_steps_per_second": 2.271,
"step": 3277
},
{
"epoch": 116.10619469026548,
"grad_norm": 3.7127795219421387,
"learning_rate": 1.3227513227513228e-06,
"loss": 0.0667,
"step": 3280
},
{
"epoch": 116.46017699115045,
"grad_norm": 9.592253684997559,
"learning_rate": 1.1574074074074074e-06,
"loss": 0.0559,
"step": 3290
},
{
"epoch": 116.8141592920354,
"grad_norm": 12.001859664916992,
"learning_rate": 9.92063492063492e-07,
"loss": 0.0744,
"step": 3300
},
{
"epoch": 116.99115044247787,
"eval_accuracy": 0.8125,
"eval_loss": 0.9216000437736511,
"eval_runtime": 5.7416,
"eval_samples_per_second": 69.667,
"eval_steps_per_second": 2.264,
"step": 3305
},
{
"epoch": 117.16814159292035,
"grad_norm": 5.99811315536499,
"learning_rate": 8.267195767195768e-07,
"loss": 0.051,
"step": 3310
},
{
"epoch": 117.52212389380531,
"grad_norm": 4.772040367126465,
"learning_rate": 6.613756613756614e-07,
"loss": 0.0705,
"step": 3320
},
{
"epoch": 117.87610619469027,
"grad_norm": 3.2538766860961914,
"learning_rate": 4.96031746031746e-07,
"loss": 0.0407,
"step": 3330
},
{
"epoch": 117.98230088495575,
"eval_accuracy": 0.8125,
"eval_loss": 0.9210975766181946,
"eval_runtime": 5.6807,
"eval_samples_per_second": 70.414,
"eval_steps_per_second": 2.288,
"step": 3333
},
{
"epoch": 118.23008849557522,
"grad_norm": 5.481634140014648,
"learning_rate": 3.306878306878307e-07,
"loss": 0.0682,
"step": 3340
},
{
"epoch": 118.58407079646018,
"grad_norm": 7.475958824157715,
"learning_rate": 1.6534391534391535e-07,
"loss": 0.058,
"step": 3350
},
{
"epoch": 118.93805309734513,
"grad_norm": 12.294817924499512,
"learning_rate": 0.0,
"loss": 0.0573,
"step": 3360
},
{
"epoch": 118.93805309734513,
"eval_accuracy": 0.81,
"eval_loss": 0.92020583152771,
"eval_runtime": 5.7978,
"eval_samples_per_second": 68.992,
"eval_steps_per_second": 2.242,
"step": 3360
},
{
"epoch": 118.93805309734513,
"step": 3360,
"total_flos": 1.0779764781475824e+19,
"train_loss": 0.2818299961143306,
"train_runtime": 9749.4057,
"train_samples_per_second": 44.31,
"train_steps_per_second": 0.345
}
],
"logging_steps": 10,
"max_steps": 3360,
"num_input_tokens_seen": 0,
"num_train_epochs": 120,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0779764781475824e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}