zephyr-7b-sft-qlora / trainer_state.json
Dataset-t-t-t-t-t-t-t-t's picture
Model save
98a2ca5 verified
raw
history blame
163 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.265343793262575,
"eval_steps": 500,
"global_step": 4600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.7683433317951084e-05,
"grad_norm": 0.3952319025993347,
"learning_rate": 1.1534025374855825e-07,
"loss": 1.182,
"step": 1
},
{
"epoch": 0.0002884171665897554,
"grad_norm": 0.3334461748600006,
"learning_rate": 5.767012687427913e-07,
"loss": 1.0887,
"step": 5
},
{
"epoch": 0.0005768343331795108,
"grad_norm": 0.41704559326171875,
"learning_rate": 1.1534025374855826e-06,
"loss": 1.2132,
"step": 10
},
{
"epoch": 0.0008652514997692663,
"grad_norm": 0.4982852637767792,
"learning_rate": 1.7301038062283738e-06,
"loss": 1.1888,
"step": 15
},
{
"epoch": 0.0011536686663590216,
"grad_norm": 0.3702298104763031,
"learning_rate": 2.3068050749711653e-06,
"loss": 1.2105,
"step": 20
},
{
"epoch": 0.001442085832948777,
"grad_norm": 0.3640645444393158,
"learning_rate": 2.8835063437139563e-06,
"loss": 1.1714,
"step": 25
},
{
"epoch": 0.0017305029995385325,
"grad_norm": 0.31508558988571167,
"learning_rate": 3.4602076124567477e-06,
"loss": 1.0438,
"step": 30
},
{
"epoch": 0.0020189201661282878,
"grad_norm": 0.3910152018070221,
"learning_rate": 4.036908881199539e-06,
"loss": 1.212,
"step": 35
},
{
"epoch": 0.0023073373327180432,
"grad_norm": 0.32711583375930786,
"learning_rate": 4.6136101499423305e-06,
"loss": 1.1552,
"step": 40
},
{
"epoch": 0.0025957544993077987,
"grad_norm": 0.37455540895462036,
"learning_rate": 5.190311418685121e-06,
"loss": 1.1355,
"step": 45
},
{
"epoch": 0.002884171665897554,
"grad_norm": 0.32155269384384155,
"learning_rate": 5.7670126874279126e-06,
"loss": 1.1375,
"step": 50
},
{
"epoch": 0.0031725888324873096,
"grad_norm": 0.29815641045570374,
"learning_rate": 6.3437139561707036e-06,
"loss": 1.1193,
"step": 55
},
{
"epoch": 0.003461005999077065,
"grad_norm": 0.39492201805114746,
"learning_rate": 6.920415224913495e-06,
"loss": 1.1053,
"step": 60
},
{
"epoch": 0.0037494231656668205,
"grad_norm": 0.3298701345920563,
"learning_rate": 7.497116493656286e-06,
"loss": 1.107,
"step": 65
},
{
"epoch": 0.0040378403322565756,
"grad_norm": 0.3114672005176544,
"learning_rate": 8.073817762399077e-06,
"loss": 1.0677,
"step": 70
},
{
"epoch": 0.0043262574988463314,
"grad_norm": 0.3159383535385132,
"learning_rate": 8.650519031141868e-06,
"loss": 1.0959,
"step": 75
},
{
"epoch": 0.0046146746654360865,
"grad_norm": 0.2858622074127197,
"learning_rate": 9.227220299884661e-06,
"loss": 1.0435,
"step": 80
},
{
"epoch": 0.004903091832025842,
"grad_norm": 0.3337515890598297,
"learning_rate": 9.803921568627451e-06,
"loss": 0.9889,
"step": 85
},
{
"epoch": 0.005191508998615597,
"grad_norm": 0.3027825951576233,
"learning_rate": 1.0380622837370241e-05,
"loss": 1.1145,
"step": 90
},
{
"epoch": 0.005479926165205353,
"grad_norm": 0.34131115674972534,
"learning_rate": 1.0957324106113035e-05,
"loss": 1.0596,
"step": 95
},
{
"epoch": 0.005768343331795108,
"grad_norm": 0.3263566792011261,
"learning_rate": 1.1534025374855825e-05,
"loss": 0.9887,
"step": 100
},
{
"epoch": 0.006056760498384864,
"grad_norm": 0.325528085231781,
"learning_rate": 1.2110726643598615e-05,
"loss": 1.0143,
"step": 105
},
{
"epoch": 0.006345177664974619,
"grad_norm": 0.3773256242275238,
"learning_rate": 1.2687427912341407e-05,
"loss": 1.0,
"step": 110
},
{
"epoch": 0.006633594831564375,
"grad_norm": 0.2968287765979767,
"learning_rate": 1.3264129181084197e-05,
"loss": 0.9572,
"step": 115
},
{
"epoch": 0.00692201199815413,
"grad_norm": 0.29874077439308167,
"learning_rate": 1.384083044982699e-05,
"loss": 1.0344,
"step": 120
},
{
"epoch": 0.007210429164743885,
"grad_norm": 0.3251142203807831,
"learning_rate": 1.4417531718569783e-05,
"loss": 1.0183,
"step": 125
},
{
"epoch": 0.007498846331333641,
"grad_norm": 0.29589974880218506,
"learning_rate": 1.4994232987312573e-05,
"loss": 1.047,
"step": 130
},
{
"epoch": 0.007787263497923396,
"grad_norm": 0.3242173194885254,
"learning_rate": 1.5570934256055363e-05,
"loss": 1.0461,
"step": 135
},
{
"epoch": 0.008075680664513151,
"grad_norm": 0.31147414445877075,
"learning_rate": 1.6147635524798155e-05,
"loss": 1.047,
"step": 140
},
{
"epoch": 0.008364097831102908,
"grad_norm": 0.31779709458351135,
"learning_rate": 1.6724336793540947e-05,
"loss": 1.0784,
"step": 145
},
{
"epoch": 0.008652514997692663,
"grad_norm": 0.3391679525375366,
"learning_rate": 1.7301038062283735e-05,
"loss": 1.0576,
"step": 150
},
{
"epoch": 0.008940932164282418,
"grad_norm": 0.3228215277194977,
"learning_rate": 1.787773933102653e-05,
"loss": 1.0145,
"step": 155
},
{
"epoch": 0.009229349330872173,
"grad_norm": 0.30271971225738525,
"learning_rate": 1.8454440599769322e-05,
"loss": 0.9874,
"step": 160
},
{
"epoch": 0.00951776649746193,
"grad_norm": 0.30643004179000854,
"learning_rate": 1.903114186851211e-05,
"loss": 0.9733,
"step": 165
},
{
"epoch": 0.009806183664051685,
"grad_norm": 0.36777183413505554,
"learning_rate": 1.9607843137254903e-05,
"loss": 1.0242,
"step": 170
},
{
"epoch": 0.01009460083064144,
"grad_norm": 0.3419516086578369,
"learning_rate": 2.0184544405997694e-05,
"loss": 1.1211,
"step": 175
},
{
"epoch": 0.010383017997231195,
"grad_norm": 0.3591030538082123,
"learning_rate": 2.0761245674740483e-05,
"loss": 1.0323,
"step": 180
},
{
"epoch": 0.01067143516382095,
"grad_norm": 0.38365352153778076,
"learning_rate": 2.1337946943483278e-05,
"loss": 0.9613,
"step": 185
},
{
"epoch": 0.010959852330410707,
"grad_norm": 0.3436645269393921,
"learning_rate": 2.191464821222607e-05,
"loss": 1.0753,
"step": 190
},
{
"epoch": 0.011248269497000462,
"grad_norm": 0.341776967048645,
"learning_rate": 2.249134948096886e-05,
"loss": 1.064,
"step": 195
},
{
"epoch": 0.011536686663590217,
"grad_norm": 0.38297685980796814,
"learning_rate": 2.306805074971165e-05,
"loss": 1.0105,
"step": 200
},
{
"epoch": 0.011825103830179972,
"grad_norm": 0.3430030643939972,
"learning_rate": 2.3644752018454442e-05,
"loss": 1.0103,
"step": 205
},
{
"epoch": 0.012113520996769728,
"grad_norm": 0.3319534361362457,
"learning_rate": 2.422145328719723e-05,
"loss": 1.0671,
"step": 210
},
{
"epoch": 0.012401938163359483,
"grad_norm": 0.3615305423736572,
"learning_rate": 2.4798154555940022e-05,
"loss": 0.9236,
"step": 215
},
{
"epoch": 0.012690355329949238,
"grad_norm": 0.4457886517047882,
"learning_rate": 2.5374855824682814e-05,
"loss": 1.0461,
"step": 220
},
{
"epoch": 0.012978772496538993,
"grad_norm": 0.7715578675270081,
"learning_rate": 2.5951557093425606e-05,
"loss": 1.0131,
"step": 225
},
{
"epoch": 0.01326718966312875,
"grad_norm": 0.4368738830089569,
"learning_rate": 2.6528258362168395e-05,
"loss": 1.0255,
"step": 230
},
{
"epoch": 0.013555606829718505,
"grad_norm": 0.38978299498558044,
"learning_rate": 2.7104959630911193e-05,
"loss": 0.9773,
"step": 235
},
{
"epoch": 0.01384402399630826,
"grad_norm": 0.35930851101875305,
"learning_rate": 2.768166089965398e-05,
"loss": 1.0043,
"step": 240
},
{
"epoch": 0.014132441162898015,
"grad_norm": 0.37871646881103516,
"learning_rate": 2.8258362168396773e-05,
"loss": 1.0082,
"step": 245
},
{
"epoch": 0.01442085832948777,
"grad_norm": 0.3493201732635498,
"learning_rate": 2.8835063437139565e-05,
"loss": 0.9856,
"step": 250
},
{
"epoch": 0.014709275496077527,
"grad_norm": 0.364734947681427,
"learning_rate": 2.9411764705882354e-05,
"loss": 1.0379,
"step": 255
},
{
"epoch": 0.014997692662667282,
"grad_norm": 0.3644263446331024,
"learning_rate": 2.9988465974625146e-05,
"loss": 1.006,
"step": 260
},
{
"epoch": 0.015286109829257037,
"grad_norm": 0.3671714961528778,
"learning_rate": 3.0565167243367934e-05,
"loss": 0.9499,
"step": 265
},
{
"epoch": 0.015574526995846792,
"grad_norm": 0.384804904460907,
"learning_rate": 3.1141868512110726e-05,
"loss": 1.0438,
"step": 270
},
{
"epoch": 0.015862944162436547,
"grad_norm": 0.36940938234329224,
"learning_rate": 3.171856978085352e-05,
"loss": 0.9476,
"step": 275
},
{
"epoch": 0.016151361329026302,
"grad_norm": 0.38267725706100464,
"learning_rate": 3.229527104959631e-05,
"loss": 0.9689,
"step": 280
},
{
"epoch": 0.01643977849561606,
"grad_norm": 0.3497903347015381,
"learning_rate": 3.28719723183391e-05,
"loss": 0.9143,
"step": 285
},
{
"epoch": 0.016728195662205816,
"grad_norm": 0.3465529978275299,
"learning_rate": 3.344867358708189e-05,
"loss": 0.9616,
"step": 290
},
{
"epoch": 0.01701661282879557,
"grad_norm": 0.3548210859298706,
"learning_rate": 3.4025374855824685e-05,
"loss": 0.9695,
"step": 295
},
{
"epoch": 0.017305029995385326,
"grad_norm": 0.3769378662109375,
"learning_rate": 3.460207612456747e-05,
"loss": 0.963,
"step": 300
},
{
"epoch": 0.01759344716197508,
"grad_norm": 0.3663967549800873,
"learning_rate": 3.517877739331027e-05,
"loss": 1.0924,
"step": 305
},
{
"epoch": 0.017881864328564836,
"grad_norm": 0.38498544692993164,
"learning_rate": 3.575547866205306e-05,
"loss": 1.0481,
"step": 310
},
{
"epoch": 0.01817028149515459,
"grad_norm": 0.3465900123119354,
"learning_rate": 3.633217993079585e-05,
"loss": 1.0396,
"step": 315
},
{
"epoch": 0.018458698661744346,
"grad_norm": 0.3498382270336151,
"learning_rate": 3.6908881199538644e-05,
"loss": 1.0005,
"step": 320
},
{
"epoch": 0.0187471158283341,
"grad_norm": 0.3397336006164551,
"learning_rate": 3.748558246828143e-05,
"loss": 0.9682,
"step": 325
},
{
"epoch": 0.01903553299492386,
"grad_norm": 0.33760690689086914,
"learning_rate": 3.806228373702422e-05,
"loss": 0.9975,
"step": 330
},
{
"epoch": 0.019323950161513614,
"grad_norm": 0.32710301876068115,
"learning_rate": 3.863898500576701e-05,
"loss": 0.985,
"step": 335
},
{
"epoch": 0.01961236732810337,
"grad_norm": 0.40678462386131287,
"learning_rate": 3.9215686274509805e-05,
"loss": 0.9664,
"step": 340
},
{
"epoch": 0.019900784494693124,
"grad_norm": 0.38339948654174805,
"learning_rate": 3.97923875432526e-05,
"loss": 0.9962,
"step": 345
},
{
"epoch": 0.02018920166128288,
"grad_norm": 0.3516389727592468,
"learning_rate": 4.036908881199539e-05,
"loss": 0.9385,
"step": 350
},
{
"epoch": 0.020477618827872635,
"grad_norm": 0.3469911515712738,
"learning_rate": 4.094579008073818e-05,
"loss": 0.9795,
"step": 355
},
{
"epoch": 0.02076603599446239,
"grad_norm": 0.351566344499588,
"learning_rate": 4.1522491349480966e-05,
"loss": 1.0131,
"step": 360
},
{
"epoch": 0.021054453161052145,
"grad_norm": 0.3254294991493225,
"learning_rate": 4.209919261822376e-05,
"loss": 0.9784,
"step": 365
},
{
"epoch": 0.0213428703276419,
"grad_norm": 0.352115660905838,
"learning_rate": 4.2675893886966556e-05,
"loss": 1.0013,
"step": 370
},
{
"epoch": 0.021631287494231658,
"grad_norm": 0.35616523027420044,
"learning_rate": 4.325259515570935e-05,
"loss": 1.0209,
"step": 375
},
{
"epoch": 0.021919704660821413,
"grad_norm": 0.3402170240879059,
"learning_rate": 4.382929642445214e-05,
"loss": 0.976,
"step": 380
},
{
"epoch": 0.022208121827411168,
"grad_norm": 0.30762144923210144,
"learning_rate": 4.440599769319493e-05,
"loss": 0.8757,
"step": 385
},
{
"epoch": 0.022496538994000923,
"grad_norm": 0.33472269773483276,
"learning_rate": 4.498269896193772e-05,
"loss": 1.0687,
"step": 390
},
{
"epoch": 0.022784956160590678,
"grad_norm": 0.3568858802318573,
"learning_rate": 4.555940023068051e-05,
"loss": 1.0279,
"step": 395
},
{
"epoch": 0.023073373327180433,
"grad_norm": 0.3303862512111664,
"learning_rate": 4.61361014994233e-05,
"loss": 1.0061,
"step": 400
},
{
"epoch": 0.023361790493770188,
"grad_norm": 0.3586498498916626,
"learning_rate": 4.671280276816609e-05,
"loss": 1.0007,
"step": 405
},
{
"epoch": 0.023650207660359943,
"grad_norm": 0.34804537892341614,
"learning_rate": 4.7289504036908884e-05,
"loss": 0.9913,
"step": 410
},
{
"epoch": 0.0239386248269497,
"grad_norm": 0.33361154794692993,
"learning_rate": 4.7866205305651676e-05,
"loss": 0.9615,
"step": 415
},
{
"epoch": 0.024227041993539457,
"grad_norm": 0.30743229389190674,
"learning_rate": 4.844290657439446e-05,
"loss": 1.0062,
"step": 420
},
{
"epoch": 0.024515459160129212,
"grad_norm": 0.3414464294910431,
"learning_rate": 4.901960784313725e-05,
"loss": 1.0266,
"step": 425
},
{
"epoch": 0.024803876326718967,
"grad_norm": 0.311254620552063,
"learning_rate": 4.9596309111880045e-05,
"loss": 0.9525,
"step": 430
},
{
"epoch": 0.025092293493308722,
"grad_norm": 0.3211973011493683,
"learning_rate": 5.017301038062284e-05,
"loss": 1.0204,
"step": 435
},
{
"epoch": 0.025380710659898477,
"grad_norm": 0.32264503836631775,
"learning_rate": 5.074971164936563e-05,
"loss": 0.9187,
"step": 440
},
{
"epoch": 0.025669127826488232,
"grad_norm": 0.3149093985557556,
"learning_rate": 5.132641291810843e-05,
"loss": 1.0324,
"step": 445
},
{
"epoch": 0.025957544993077987,
"grad_norm": 0.31910112500190735,
"learning_rate": 5.190311418685121e-05,
"loss": 0.9924,
"step": 450
},
{
"epoch": 0.026245962159667742,
"grad_norm": 0.329057514667511,
"learning_rate": 5.2479815455594004e-05,
"loss": 1.0235,
"step": 455
},
{
"epoch": 0.0265343793262575,
"grad_norm": 0.32927969098091125,
"learning_rate": 5.305651672433679e-05,
"loss": 0.9986,
"step": 460
},
{
"epoch": 0.026822796492847256,
"grad_norm": 0.30113425850868225,
"learning_rate": 5.363321799307959e-05,
"loss": 0.9996,
"step": 465
},
{
"epoch": 0.02711121365943701,
"grad_norm": 0.31802427768707275,
"learning_rate": 5.4209919261822386e-05,
"loss": 0.903,
"step": 470
},
{
"epoch": 0.027399630826026766,
"grad_norm": 0.31492453813552856,
"learning_rate": 5.478662053056517e-05,
"loss": 0.9627,
"step": 475
},
{
"epoch": 0.02768804799261652,
"grad_norm": 0.32527875900268555,
"learning_rate": 5.536332179930796e-05,
"loss": 0.9842,
"step": 480
},
{
"epoch": 0.027976465159206276,
"grad_norm": 0.3000083267688751,
"learning_rate": 5.594002306805075e-05,
"loss": 0.9275,
"step": 485
},
{
"epoch": 0.02826488232579603,
"grad_norm": 0.30580878257751465,
"learning_rate": 5.651672433679355e-05,
"loss": 1.0111,
"step": 490
},
{
"epoch": 0.028553299492385786,
"grad_norm": 0.3029692769050598,
"learning_rate": 5.709342560553633e-05,
"loss": 0.9997,
"step": 495
},
{
"epoch": 0.02884171665897554,
"grad_norm": 0.29320913553237915,
"learning_rate": 5.767012687427913e-05,
"loss": 0.9728,
"step": 500
},
{
"epoch": 0.0291301338255653,
"grad_norm": 0.27277612686157227,
"learning_rate": 5.8246828143021916e-05,
"loss": 0.9481,
"step": 505
},
{
"epoch": 0.029418550992155054,
"grad_norm": 0.3065517544746399,
"learning_rate": 5.882352941176471e-05,
"loss": 1.0068,
"step": 510
},
{
"epoch": 0.02970696815874481,
"grad_norm": 0.30595871806144714,
"learning_rate": 5.940023068050749e-05,
"loss": 1.0394,
"step": 515
},
{
"epoch": 0.029995385325334564,
"grad_norm": 0.2905437648296356,
"learning_rate": 5.997693194925029e-05,
"loss": 0.8914,
"step": 520
},
{
"epoch": 0.03028380249192432,
"grad_norm": 0.30169710516929626,
"learning_rate": 6.0553633217993076e-05,
"loss": 1.0714,
"step": 525
},
{
"epoch": 0.030572219658514074,
"grad_norm": 0.30245259404182434,
"learning_rate": 6.113033448673587e-05,
"loss": 0.9748,
"step": 530
},
{
"epoch": 0.03086063682510383,
"grad_norm": 0.31071239709854126,
"learning_rate": 6.170703575547867e-05,
"loss": 1.0307,
"step": 535
},
{
"epoch": 0.031149053991693584,
"grad_norm": 0.301554799079895,
"learning_rate": 6.228373702422145e-05,
"loss": 0.9904,
"step": 540
},
{
"epoch": 0.03143747115828334,
"grad_norm": 0.29832157492637634,
"learning_rate": 6.286043829296425e-05,
"loss": 0.965,
"step": 545
},
{
"epoch": 0.031725888324873094,
"grad_norm": 0.2960033118724823,
"learning_rate": 6.343713956170704e-05,
"loss": 0.9661,
"step": 550
},
{
"epoch": 0.03201430549146285,
"grad_norm": 0.2793910503387451,
"learning_rate": 6.401384083044983e-05,
"loss": 0.9691,
"step": 555
},
{
"epoch": 0.032302722658052604,
"grad_norm": 0.2931232750415802,
"learning_rate": 6.459054209919262e-05,
"loss": 1.0152,
"step": 560
},
{
"epoch": 0.03259113982464236,
"grad_norm": 0.29276397824287415,
"learning_rate": 6.516724336793542e-05,
"loss": 0.9644,
"step": 565
},
{
"epoch": 0.03287955699123212,
"grad_norm": 0.2859160304069519,
"learning_rate": 6.57439446366782e-05,
"loss": 0.8926,
"step": 570
},
{
"epoch": 0.033167974157821876,
"grad_norm": 0.2981337308883667,
"learning_rate": 6.6320645905421e-05,
"loss": 0.9805,
"step": 575
},
{
"epoch": 0.03345639132441163,
"grad_norm": 0.28318145871162415,
"learning_rate": 6.689734717416379e-05,
"loss": 0.9828,
"step": 580
},
{
"epoch": 0.033744808491001387,
"grad_norm": 0.2922738194465637,
"learning_rate": 6.747404844290659e-05,
"loss": 0.9495,
"step": 585
},
{
"epoch": 0.03403322565759114,
"grad_norm": 0.3307567536830902,
"learning_rate": 6.805074971164937e-05,
"loss": 0.975,
"step": 590
},
{
"epoch": 0.0343216428241809,
"grad_norm": 0.2792339622974396,
"learning_rate": 6.862745098039216e-05,
"loss": 1.0021,
"step": 595
},
{
"epoch": 0.03461005999077065,
"grad_norm": 0.26365357637405396,
"learning_rate": 6.920415224913494e-05,
"loss": 1.0316,
"step": 600
},
{
"epoch": 0.03489847715736041,
"grad_norm": 0.285918265581131,
"learning_rate": 6.978085351787774e-05,
"loss": 1.0025,
"step": 605
},
{
"epoch": 0.03518689432395016,
"grad_norm": 0.290382444858551,
"learning_rate": 7.035755478662054e-05,
"loss": 1.0198,
"step": 610
},
{
"epoch": 0.03547531149053992,
"grad_norm": 0.2909998595714569,
"learning_rate": 7.093425605536332e-05,
"loss": 1.0522,
"step": 615
},
{
"epoch": 0.03576372865712967,
"grad_norm": 0.2691628038883209,
"learning_rate": 7.151095732410612e-05,
"loss": 1.0285,
"step": 620
},
{
"epoch": 0.03605214582371943,
"grad_norm": 0.2793739140033722,
"learning_rate": 7.20876585928489e-05,
"loss": 0.9431,
"step": 625
},
{
"epoch": 0.03634056299030918,
"grad_norm": 0.28252139687538147,
"learning_rate": 7.26643598615917e-05,
"loss": 0.954,
"step": 630
},
{
"epoch": 0.03662898015689894,
"grad_norm": 0.2551520764827728,
"learning_rate": 7.324106113033449e-05,
"loss": 0.9477,
"step": 635
},
{
"epoch": 0.03691739732348869,
"grad_norm": 0.2769528925418854,
"learning_rate": 7.381776239907729e-05,
"loss": 1.0228,
"step": 640
},
{
"epoch": 0.03720581449007845,
"grad_norm": 0.26769739389419556,
"learning_rate": 7.439446366782007e-05,
"loss": 0.9844,
"step": 645
},
{
"epoch": 0.0374942316566682,
"grad_norm": 0.2822119891643524,
"learning_rate": 7.497116493656286e-05,
"loss": 1.0532,
"step": 650
},
{
"epoch": 0.03778264882325796,
"grad_norm": 0.2787601053714752,
"learning_rate": 7.554786620530564e-05,
"loss": 1.0154,
"step": 655
},
{
"epoch": 0.03807106598984772,
"grad_norm": 0.27694109082221985,
"learning_rate": 7.612456747404844e-05,
"loss": 0.9775,
"step": 660
},
{
"epoch": 0.038359483156437474,
"grad_norm": 0.4112897217273712,
"learning_rate": 7.670126874279123e-05,
"loss": 1.0071,
"step": 665
},
{
"epoch": 0.03864790032302723,
"grad_norm": 0.26005199551582336,
"learning_rate": 7.727797001153403e-05,
"loss": 0.9632,
"step": 670
},
{
"epoch": 0.038936317489616984,
"grad_norm": 0.25056615471839905,
"learning_rate": 7.785467128027682e-05,
"loss": 0.9773,
"step": 675
},
{
"epoch": 0.03922473465620674,
"grad_norm": 0.27164942026138306,
"learning_rate": 7.843137254901961e-05,
"loss": 0.9927,
"step": 680
},
{
"epoch": 0.039513151822796494,
"grad_norm": 0.26238757371902466,
"learning_rate": 7.900807381776241e-05,
"loss": 0.9612,
"step": 685
},
{
"epoch": 0.03980156898938625,
"grad_norm": 0.28629186749458313,
"learning_rate": 7.95847750865052e-05,
"loss": 0.9579,
"step": 690
},
{
"epoch": 0.040089986155976004,
"grad_norm": 0.2650497555732727,
"learning_rate": 8.016147635524799e-05,
"loss": 0.9667,
"step": 695
},
{
"epoch": 0.04037840332256576,
"grad_norm": 0.26934972405433655,
"learning_rate": 8.073817762399078e-05,
"loss": 0.9257,
"step": 700
},
{
"epoch": 0.040666820489155514,
"grad_norm": 0.27391955256462097,
"learning_rate": 8.131487889273358e-05,
"loss": 1.0725,
"step": 705
},
{
"epoch": 0.04095523765574527,
"grad_norm": 0.2905539274215698,
"learning_rate": 8.189158016147636e-05,
"loss": 0.9979,
"step": 710
},
{
"epoch": 0.041243654822335024,
"grad_norm": 0.26050031185150146,
"learning_rate": 8.246828143021915e-05,
"loss": 0.9901,
"step": 715
},
{
"epoch": 0.04153207198892478,
"grad_norm": 0.4822568893432617,
"learning_rate": 8.304498269896193e-05,
"loss": 0.9753,
"step": 720
},
{
"epoch": 0.041820489155514534,
"grad_norm": 0.27065780758857727,
"learning_rate": 8.362168396770473e-05,
"loss": 0.961,
"step": 725
},
{
"epoch": 0.04210890632210429,
"grad_norm": 0.27039390802383423,
"learning_rate": 8.419838523644751e-05,
"loss": 1.0218,
"step": 730
},
{
"epoch": 0.042397323488694044,
"grad_norm": 0.267991304397583,
"learning_rate": 8.477508650519031e-05,
"loss": 0.8937,
"step": 735
},
{
"epoch": 0.0426857406552838,
"grad_norm": 0.2698671519756317,
"learning_rate": 8.535178777393311e-05,
"loss": 1.0203,
"step": 740
},
{
"epoch": 0.04297415782187356,
"grad_norm": 0.25605538487434387,
"learning_rate": 8.59284890426759e-05,
"loss": 1.0398,
"step": 745
},
{
"epoch": 0.043262574988463316,
"grad_norm": 0.26644793152809143,
"learning_rate": 8.65051903114187e-05,
"loss": 1.0212,
"step": 750
},
{
"epoch": 0.04355099215505307,
"grad_norm": 0.2879778742790222,
"learning_rate": 8.708189158016148e-05,
"loss": 0.9854,
"step": 755
},
{
"epoch": 0.043839409321642826,
"grad_norm": 0.26750192046165466,
"learning_rate": 8.765859284890428e-05,
"loss": 1.0168,
"step": 760
},
{
"epoch": 0.04412782648823258,
"grad_norm": 0.2743099331855774,
"learning_rate": 8.823529411764706e-05,
"loss": 0.9447,
"step": 765
},
{
"epoch": 0.044416243654822336,
"grad_norm": 0.27284887433052063,
"learning_rate": 8.881199538638986e-05,
"loss": 1.016,
"step": 770
},
{
"epoch": 0.04470466082141209,
"grad_norm": 0.26251500844955444,
"learning_rate": 8.938869665513265e-05,
"loss": 0.9275,
"step": 775
},
{
"epoch": 0.044993077988001846,
"grad_norm": 0.26898619532585144,
"learning_rate": 8.996539792387543e-05,
"loss": 0.9258,
"step": 780
},
{
"epoch": 0.0452814951545916,
"grad_norm": 0.2636859118938446,
"learning_rate": 9.054209919261822e-05,
"loss": 1.1368,
"step": 785
},
{
"epoch": 0.045569912321181356,
"grad_norm": 0.25750333070755005,
"learning_rate": 9.111880046136102e-05,
"loss": 0.9829,
"step": 790
},
{
"epoch": 0.04585832948777111,
"grad_norm": 0.26251962780952454,
"learning_rate": 9.16955017301038e-05,
"loss": 1.0722,
"step": 795
},
{
"epoch": 0.046146746654360866,
"grad_norm": 0.24186044931411743,
"learning_rate": 9.22722029988466e-05,
"loss": 0.9681,
"step": 800
},
{
"epoch": 0.04643516382095062,
"grad_norm": 0.2631891965866089,
"learning_rate": 9.28489042675894e-05,
"loss": 1.0082,
"step": 805
},
{
"epoch": 0.046723580987540377,
"grad_norm": 0.25769105553627014,
"learning_rate": 9.342560553633218e-05,
"loss": 0.9419,
"step": 810
},
{
"epoch": 0.04701199815413013,
"grad_norm": 0.26983222365379333,
"learning_rate": 9.400230680507498e-05,
"loss": 0.9698,
"step": 815
},
{
"epoch": 0.04730041532071989,
"grad_norm": 0.268951952457428,
"learning_rate": 9.457900807381777e-05,
"loss": 1.0199,
"step": 820
},
{
"epoch": 0.04758883248730964,
"grad_norm": 0.2618368864059448,
"learning_rate": 9.515570934256057e-05,
"loss": 1.0474,
"step": 825
},
{
"epoch": 0.0478772496538994,
"grad_norm": 0.2535788118839264,
"learning_rate": 9.573241061130335e-05,
"loss": 1.051,
"step": 830
},
{
"epoch": 0.04816566682048916,
"grad_norm": 0.24797338247299194,
"learning_rate": 9.630911188004614e-05,
"loss": 0.9787,
"step": 835
},
{
"epoch": 0.048454083987078914,
"grad_norm": 0.2542094886302948,
"learning_rate": 9.688581314878892e-05,
"loss": 1.0301,
"step": 840
},
{
"epoch": 0.04874250115366867,
"grad_norm": 0.34137168526649475,
"learning_rate": 9.746251441753172e-05,
"loss": 0.8916,
"step": 845
},
{
"epoch": 0.049030918320258424,
"grad_norm": 0.25905948877334595,
"learning_rate": 9.80392156862745e-05,
"loss": 1.0086,
"step": 850
},
{
"epoch": 0.04931933548684818,
"grad_norm": 0.24208292365074158,
"learning_rate": 9.86159169550173e-05,
"loss": 0.962,
"step": 855
},
{
"epoch": 0.049607752653437934,
"grad_norm": 0.2500937879085541,
"learning_rate": 9.919261822376009e-05,
"loss": 0.983,
"step": 860
},
{
"epoch": 0.04989616982002769,
"grad_norm": 0.2481968104839325,
"learning_rate": 9.976931949250289e-05,
"loss": 0.9798,
"step": 865
},
{
"epoch": 0.050184586986617444,
"grad_norm": 0.25975415110588074,
"learning_rate": 0.00010034602076124569,
"loss": 0.9621,
"step": 870
},
{
"epoch": 0.0504730041532072,
"grad_norm": 0.25389575958251953,
"learning_rate": 0.00010092272202998847,
"loss": 0.9959,
"step": 875
},
{
"epoch": 0.050761421319796954,
"grad_norm": 0.26200932264328003,
"learning_rate": 0.00010149942329873126,
"loss": 0.9432,
"step": 880
},
{
"epoch": 0.05104983848638671,
"grad_norm": 0.25433865189552307,
"learning_rate": 0.00010207612456747407,
"loss": 1.0272,
"step": 885
},
{
"epoch": 0.051338255652976464,
"grad_norm": 0.29402443766593933,
"learning_rate": 0.00010265282583621685,
"loss": 1.018,
"step": 890
},
{
"epoch": 0.05162667281956622,
"grad_norm": 0.2625313699245453,
"learning_rate": 0.00010322952710495964,
"loss": 1.0326,
"step": 895
},
{
"epoch": 0.051915089986155974,
"grad_norm": 0.2682657241821289,
"learning_rate": 0.00010380622837370242,
"loss": 1.0215,
"step": 900
},
{
"epoch": 0.05220350715274573,
"grad_norm": 0.27114447951316833,
"learning_rate": 0.00010438292964244522,
"loss": 0.9736,
"step": 905
},
{
"epoch": 0.052491924319335484,
"grad_norm": 0.2469518631696701,
"learning_rate": 0.00010495963091118801,
"loss": 0.93,
"step": 910
},
{
"epoch": 0.05278034148592524,
"grad_norm": 0.262253999710083,
"learning_rate": 0.00010553633217993079,
"loss": 0.9477,
"step": 915
},
{
"epoch": 0.053068758652515,
"grad_norm": 0.25354915857315063,
"learning_rate": 0.00010611303344867358,
"loss": 0.9926,
"step": 920
},
{
"epoch": 0.053357175819104756,
"grad_norm": 0.24856913089752197,
"learning_rate": 0.00010668973471741639,
"loss": 0.9726,
"step": 925
},
{
"epoch": 0.05364559298569451,
"grad_norm": 0.24939557909965515,
"learning_rate": 0.00010726643598615918,
"loss": 0.9575,
"step": 930
},
{
"epoch": 0.053934010152284266,
"grad_norm": 0.2722608745098114,
"learning_rate": 0.00010784313725490196,
"loss": 1.0017,
"step": 935
},
{
"epoch": 0.05422242731887402,
"grad_norm": 0.25203198194503784,
"learning_rate": 0.00010841983852364477,
"loss": 0.9141,
"step": 940
},
{
"epoch": 0.054510844485463776,
"grad_norm": 0.2586802840232849,
"learning_rate": 0.00010899653979238756,
"loss": 1.0066,
"step": 945
},
{
"epoch": 0.05479926165205353,
"grad_norm": 0.24033570289611816,
"learning_rate": 0.00010957324106113034,
"loss": 1.0113,
"step": 950
},
{
"epoch": 0.055087678818643286,
"grad_norm": 0.2373732328414917,
"learning_rate": 0.00011014994232987313,
"loss": 1.0172,
"step": 955
},
{
"epoch": 0.05537609598523304,
"grad_norm": 0.25045233964920044,
"learning_rate": 0.00011072664359861593,
"loss": 0.9548,
"step": 960
},
{
"epoch": 0.055664513151822796,
"grad_norm": 0.25307127833366394,
"learning_rate": 0.00011130334486735871,
"loss": 0.8803,
"step": 965
},
{
"epoch": 0.05595293031841255,
"grad_norm": 0.2580971121788025,
"learning_rate": 0.0001118800461361015,
"loss": 1.0257,
"step": 970
},
{
"epoch": 0.056241347485002306,
"grad_norm": 0.3492274284362793,
"learning_rate": 0.00011245674740484428,
"loss": 0.9915,
"step": 975
},
{
"epoch": 0.05652976465159206,
"grad_norm": 0.3969261944293976,
"learning_rate": 0.0001130334486735871,
"loss": 0.9871,
"step": 980
},
{
"epoch": 0.056818181818181816,
"grad_norm": 0.2512189447879791,
"learning_rate": 0.00011361014994232988,
"loss": 0.9999,
"step": 985
},
{
"epoch": 0.05710659898477157,
"grad_norm": 0.24583379924297333,
"learning_rate": 0.00011418685121107266,
"loss": 1.019,
"step": 990
},
{
"epoch": 0.057395016151361326,
"grad_norm": 0.23418952524662018,
"learning_rate": 0.00011476355247981545,
"loss": 0.9976,
"step": 995
},
{
"epoch": 0.05768343331795108,
"grad_norm": 0.24816179275512695,
"learning_rate": 0.00011534025374855826,
"loss": 0.9787,
"step": 1000
},
{
"epoch": 0.05797185048454084,
"grad_norm": 0.238878071308136,
"learning_rate": 0.00011591695501730105,
"loss": 0.9831,
"step": 1005
},
{
"epoch": 0.0582602676511306,
"grad_norm": 0.240176260471344,
"learning_rate": 0.00011649365628604383,
"loss": 0.9604,
"step": 1010
},
{
"epoch": 0.05854868481772035,
"grad_norm": 0.24366143345832825,
"learning_rate": 0.00011707035755478663,
"loss": 1.0633,
"step": 1015
},
{
"epoch": 0.05883710198431011,
"grad_norm": 0.24254244565963745,
"learning_rate": 0.00011764705882352942,
"loss": 1.0299,
"step": 1020
},
{
"epoch": 0.05912551915089986,
"grad_norm": 0.2483944445848465,
"learning_rate": 0.0001182237600922722,
"loss": 1.0325,
"step": 1025
},
{
"epoch": 0.05941393631748962,
"grad_norm": 0.23639345169067383,
"learning_rate": 0.00011880046136101499,
"loss": 0.9192,
"step": 1030
},
{
"epoch": 0.059702353484079373,
"grad_norm": 0.26320794224739075,
"learning_rate": 0.0001193771626297578,
"loss": 0.973,
"step": 1035
},
{
"epoch": 0.05999077065066913,
"grad_norm": 0.26271867752075195,
"learning_rate": 0.00011995386389850058,
"loss": 1.0339,
"step": 1040
},
{
"epoch": 0.060279187817258884,
"grad_norm": 0.2515929043292999,
"learning_rate": 0.00012053056516724337,
"loss": 0.9777,
"step": 1045
},
{
"epoch": 0.06056760498384864,
"grad_norm": 0.24450047314167023,
"learning_rate": 0.00012110726643598615,
"loss": 0.9781,
"step": 1050
},
{
"epoch": 0.060856022150438394,
"grad_norm": 0.247002974152565,
"learning_rate": 0.00012168396770472896,
"loss": 0.9742,
"step": 1055
},
{
"epoch": 0.06114443931702815,
"grad_norm": 0.22039633989334106,
"learning_rate": 0.00012226066897347174,
"loss": 0.9602,
"step": 1060
},
{
"epoch": 0.061432856483617904,
"grad_norm": 0.25299662351608276,
"learning_rate": 0.00012283737024221453,
"loss": 0.9429,
"step": 1065
},
{
"epoch": 0.06172127365020766,
"grad_norm": 0.24021919071674347,
"learning_rate": 0.00012341407151095733,
"loss": 1.0543,
"step": 1070
},
{
"epoch": 0.062009690816797414,
"grad_norm": 0.2851802408695221,
"learning_rate": 0.00012399077277970013,
"loss": 1.0169,
"step": 1075
},
{
"epoch": 0.06229810798338717,
"grad_norm": 0.2532206177711487,
"learning_rate": 0.0001245674740484429,
"loss": 0.9388,
"step": 1080
},
{
"epoch": 0.06258652514997692,
"grad_norm": 0.2355235517024994,
"learning_rate": 0.0001251441753171857,
"loss": 0.9283,
"step": 1085
},
{
"epoch": 0.06287494231656668,
"grad_norm": 0.2673757076263428,
"learning_rate": 0.0001257208765859285,
"loss": 1.0022,
"step": 1090
},
{
"epoch": 0.06316335948315643,
"grad_norm": 0.22847038507461548,
"learning_rate": 0.0001262975778546713,
"loss": 0.9481,
"step": 1095
},
{
"epoch": 0.06345177664974619,
"grad_norm": 0.25772714614868164,
"learning_rate": 0.00012687427912341407,
"loss": 0.9909,
"step": 1100
},
{
"epoch": 0.06374019381633594,
"grad_norm": 0.238713800907135,
"learning_rate": 0.00012745098039215687,
"loss": 0.9379,
"step": 1105
},
{
"epoch": 0.0640286109829257,
"grad_norm": 0.24460141360759735,
"learning_rate": 0.00012802768166089967,
"loss": 0.9398,
"step": 1110
},
{
"epoch": 0.06431702814951545,
"grad_norm": 0.23570501804351807,
"learning_rate": 0.00012860438292964244,
"loss": 0.9292,
"step": 1115
},
{
"epoch": 0.06460544531610521,
"grad_norm": 0.26408931612968445,
"learning_rate": 0.00012918108419838524,
"loss": 1.026,
"step": 1120
},
{
"epoch": 0.06489386248269496,
"grad_norm": 0.2372530698776245,
"learning_rate": 0.00012975778546712804,
"loss": 0.9906,
"step": 1125
},
{
"epoch": 0.06518227964928472,
"grad_norm": 0.2314678579568863,
"learning_rate": 0.00013033448673587084,
"loss": 0.9447,
"step": 1130
},
{
"epoch": 0.06547069681587447,
"grad_norm": 0.25254136323928833,
"learning_rate": 0.0001309111880046136,
"loss": 1.0364,
"step": 1135
},
{
"epoch": 0.06575911398246424,
"grad_norm": 0.23922473192214966,
"learning_rate": 0.0001314878892733564,
"loss": 1.0091,
"step": 1140
},
{
"epoch": 0.066047531149054,
"grad_norm": 0.24500273168087006,
"learning_rate": 0.0001320645905420992,
"loss": 0.9951,
"step": 1145
},
{
"epoch": 0.06633594831564375,
"grad_norm": 0.23815661668777466,
"learning_rate": 0.000132641291810842,
"loss": 1.0065,
"step": 1150
},
{
"epoch": 0.06662436548223351,
"grad_norm": 0.26173415780067444,
"learning_rate": 0.00013321799307958477,
"loss": 1.0159,
"step": 1155
},
{
"epoch": 0.06691278264882326,
"grad_norm": 0.22709496319293976,
"learning_rate": 0.00013379469434832757,
"loss": 0.9121,
"step": 1160
},
{
"epoch": 0.06720119981541302,
"grad_norm": 0.2595439553260803,
"learning_rate": 0.00013437139561707037,
"loss": 1.0136,
"step": 1165
},
{
"epoch": 0.06748961698200277,
"grad_norm": 0.23945558071136475,
"learning_rate": 0.00013494809688581317,
"loss": 0.9508,
"step": 1170
},
{
"epoch": 0.06777803414859253,
"grad_norm": 0.2526959478855133,
"learning_rate": 0.00013552479815455594,
"loss": 0.9304,
"step": 1175
},
{
"epoch": 0.06806645131518228,
"grad_norm": 0.2385508418083191,
"learning_rate": 0.00013610149942329874,
"loss": 1.012,
"step": 1180
},
{
"epoch": 0.06835486848177204,
"grad_norm": 0.25558724999427795,
"learning_rate": 0.00013667820069204154,
"loss": 1.0289,
"step": 1185
},
{
"epoch": 0.0686432856483618,
"grad_norm": 0.26076334714889526,
"learning_rate": 0.0001372549019607843,
"loss": 0.9564,
"step": 1190
},
{
"epoch": 0.06893170281495155,
"grad_norm": 0.24157829582691193,
"learning_rate": 0.0001378316032295271,
"loss": 1.0265,
"step": 1195
},
{
"epoch": 0.0692201199815413,
"grad_norm": 0.2505204379558563,
"learning_rate": 0.00013840830449826988,
"loss": 0.965,
"step": 1200
},
{
"epoch": 0.06950853714813106,
"grad_norm": 0.2583898603916168,
"learning_rate": 0.0001389850057670127,
"loss": 1.0161,
"step": 1205
},
{
"epoch": 0.06979695431472081,
"grad_norm": 0.24660265445709229,
"learning_rate": 0.00013956170703575548,
"loss": 1.0086,
"step": 1210
},
{
"epoch": 0.07008537148131057,
"grad_norm": 0.2303483486175537,
"learning_rate": 0.00014013840830449828,
"loss": 1.0004,
"step": 1215
},
{
"epoch": 0.07037378864790032,
"grad_norm": 0.25441575050354004,
"learning_rate": 0.00014071510957324108,
"loss": 1.0218,
"step": 1220
},
{
"epoch": 0.07066220581449008,
"grad_norm": 0.2441866099834442,
"learning_rate": 0.00014129181084198387,
"loss": 0.9947,
"step": 1225
},
{
"epoch": 0.07095062298107983,
"grad_norm": 0.2431473582983017,
"learning_rate": 0.00014186851211072665,
"loss": 0.977,
"step": 1230
},
{
"epoch": 0.07123904014766959,
"grad_norm": 0.22348998486995697,
"learning_rate": 0.00014244521337946944,
"loss": 0.9626,
"step": 1235
},
{
"epoch": 0.07152745731425934,
"grad_norm": 0.25038719177246094,
"learning_rate": 0.00014302191464821224,
"loss": 1.0234,
"step": 1240
},
{
"epoch": 0.0718158744808491,
"grad_norm": 0.24543331563472748,
"learning_rate": 0.00014359861591695501,
"loss": 0.9782,
"step": 1245
},
{
"epoch": 0.07210429164743885,
"grad_norm": 0.2646369934082031,
"learning_rate": 0.0001441753171856978,
"loss": 1.0049,
"step": 1250
},
{
"epoch": 0.07239270881402861,
"grad_norm": 0.24707183241844177,
"learning_rate": 0.00014475201845444058,
"loss": 1.0426,
"step": 1255
},
{
"epoch": 0.07268112598061836,
"grad_norm": 0.24609191715717316,
"learning_rate": 0.0001453287197231834,
"loss": 0.9978,
"step": 1260
},
{
"epoch": 0.07296954314720812,
"grad_norm": 0.2498229593038559,
"learning_rate": 0.00014590542099192618,
"loss": 1.0299,
"step": 1265
},
{
"epoch": 0.07325796031379787,
"grad_norm": 0.24294817447662354,
"learning_rate": 0.00014648212226066898,
"loss": 0.9387,
"step": 1270
},
{
"epoch": 0.07354637748038763,
"grad_norm": 0.22789110243320465,
"learning_rate": 0.00014705882352941178,
"loss": 0.9859,
"step": 1275
},
{
"epoch": 0.07383479464697738,
"grad_norm": 0.2392035871744156,
"learning_rate": 0.00014763552479815458,
"loss": 0.9821,
"step": 1280
},
{
"epoch": 0.07412321181356714,
"grad_norm": 0.24138358235359192,
"learning_rate": 0.00014821222606689735,
"loss": 0.9644,
"step": 1285
},
{
"epoch": 0.0744116289801569,
"grad_norm": 0.2574746012687683,
"learning_rate": 0.00014878892733564015,
"loss": 0.9894,
"step": 1290
},
{
"epoch": 0.07470004614674665,
"grad_norm": 0.2577558755874634,
"learning_rate": 0.00014936562860438295,
"loss": 1.0049,
"step": 1295
},
{
"epoch": 0.0749884633133364,
"grad_norm": 0.2638446092605591,
"learning_rate": 0.00014994232987312572,
"loss": 0.9866,
"step": 1300
},
{
"epoch": 0.07527688047992616,
"grad_norm": 0.2279583364725113,
"learning_rate": 0.00015051903114186852,
"loss": 0.9697,
"step": 1305
},
{
"epoch": 0.07556529764651591,
"grad_norm": 0.25132206082344055,
"learning_rate": 0.0001510957324106113,
"loss": 0.9654,
"step": 1310
},
{
"epoch": 0.07585371481310568,
"grad_norm": 0.24250829219818115,
"learning_rate": 0.00015167243367935411,
"loss": 0.9594,
"step": 1315
},
{
"epoch": 0.07614213197969544,
"grad_norm": 0.24679099023342133,
"learning_rate": 0.00015224913494809689,
"loss": 0.9514,
"step": 1320
},
{
"epoch": 0.07643054914628519,
"grad_norm": 0.26517555117607117,
"learning_rate": 0.00015282583621683968,
"loss": 0.9575,
"step": 1325
},
{
"epoch": 0.07671896631287495,
"grad_norm": 0.23794426023960114,
"learning_rate": 0.00015340253748558246,
"loss": 0.9982,
"step": 1330
},
{
"epoch": 0.0770073834794647,
"grad_norm": 0.2488831728696823,
"learning_rate": 0.00015397923875432528,
"loss": 0.9454,
"step": 1335
},
{
"epoch": 0.07729580064605446,
"grad_norm": 0.26782914996147156,
"learning_rate": 0.00015455594002306805,
"loss": 1.0235,
"step": 1340
},
{
"epoch": 0.07758421781264421,
"grad_norm": 0.25021234154701233,
"learning_rate": 0.00015513264129181085,
"loss": 0.9243,
"step": 1345
},
{
"epoch": 0.07787263497923397,
"grad_norm": 0.2522822618484497,
"learning_rate": 0.00015570934256055365,
"loss": 1.0428,
"step": 1350
},
{
"epoch": 0.07816105214582372,
"grad_norm": 0.27001574635505676,
"learning_rate": 0.00015628604382929645,
"loss": 0.9755,
"step": 1355
},
{
"epoch": 0.07844946931241348,
"grad_norm": 0.24071645736694336,
"learning_rate": 0.00015686274509803922,
"loss": 1.013,
"step": 1360
},
{
"epoch": 0.07873788647900323,
"grad_norm": 0.24303098022937775,
"learning_rate": 0.00015743944636678202,
"loss": 0.9862,
"step": 1365
},
{
"epoch": 0.07902630364559299,
"grad_norm": 0.2542005479335785,
"learning_rate": 0.00015801614763552482,
"loss": 0.9709,
"step": 1370
},
{
"epoch": 0.07931472081218274,
"grad_norm": 0.2585870325565338,
"learning_rate": 0.0001585928489042676,
"loss": 1.0085,
"step": 1375
},
{
"epoch": 0.0796031379787725,
"grad_norm": 0.2629243731498718,
"learning_rate": 0.0001591695501730104,
"loss": 0.985,
"step": 1380
},
{
"epoch": 0.07989155514536225,
"grad_norm": 0.24008338153362274,
"learning_rate": 0.00015974625144175316,
"loss": 0.9839,
"step": 1385
},
{
"epoch": 0.08017997231195201,
"grad_norm": 0.2442033439874649,
"learning_rate": 0.00016032295271049598,
"loss": 0.8798,
"step": 1390
},
{
"epoch": 0.08046838947854176,
"grad_norm": 0.250362366437912,
"learning_rate": 0.00016089965397923876,
"loss": 0.9301,
"step": 1395
},
{
"epoch": 0.08075680664513152,
"grad_norm": 0.2477293759584427,
"learning_rate": 0.00016147635524798155,
"loss": 0.9561,
"step": 1400
},
{
"epoch": 0.08104522381172127,
"grad_norm": 0.23329582810401917,
"learning_rate": 0.00016205305651672435,
"loss": 0.9505,
"step": 1405
},
{
"epoch": 0.08133364097831103,
"grad_norm": 0.24549901485443115,
"learning_rate": 0.00016262975778546715,
"loss": 1.0284,
"step": 1410
},
{
"epoch": 0.08162205814490078,
"grad_norm": 0.24419653415679932,
"learning_rate": 0.00016320645905420992,
"loss": 0.9114,
"step": 1415
},
{
"epoch": 0.08191047531149054,
"grad_norm": 0.24551044404506683,
"learning_rate": 0.00016378316032295272,
"loss": 0.9574,
"step": 1420
},
{
"epoch": 0.0821988924780803,
"grad_norm": 0.29641515016555786,
"learning_rate": 0.00016435986159169552,
"loss": 0.9821,
"step": 1425
},
{
"epoch": 0.08248730964467005,
"grad_norm": 0.24953129887580872,
"learning_rate": 0.0001649365628604383,
"loss": 0.9966,
"step": 1430
},
{
"epoch": 0.0827757268112598,
"grad_norm": 0.25181591510772705,
"learning_rate": 0.0001655132641291811,
"loss": 1.023,
"step": 1435
},
{
"epoch": 0.08306414397784956,
"grad_norm": 0.2478877305984497,
"learning_rate": 0.00016608996539792386,
"loss": 0.9762,
"step": 1440
},
{
"epoch": 0.08335256114443931,
"grad_norm": 0.24414442479610443,
"learning_rate": 0.0001666666666666667,
"loss": 0.9339,
"step": 1445
},
{
"epoch": 0.08364097831102907,
"grad_norm": 0.24295495450496674,
"learning_rate": 0.00016724336793540946,
"loss": 1.0144,
"step": 1450
},
{
"epoch": 0.08392939547761882,
"grad_norm": 0.25291165709495544,
"learning_rate": 0.00016782006920415226,
"loss": 0.916,
"step": 1455
},
{
"epoch": 0.08421781264420858,
"grad_norm": 0.23744194209575653,
"learning_rate": 0.00016839677047289503,
"loss": 0.952,
"step": 1460
},
{
"epoch": 0.08450622981079833,
"grad_norm": 0.24316394329071045,
"learning_rate": 0.00016897347174163786,
"loss": 0.9725,
"step": 1465
},
{
"epoch": 0.08479464697738809,
"grad_norm": 0.23748493194580078,
"learning_rate": 0.00016955017301038063,
"loss": 0.9831,
"step": 1470
},
{
"epoch": 0.08508306414397784,
"grad_norm": 0.25356602668762207,
"learning_rate": 0.00017012687427912343,
"loss": 0.9632,
"step": 1475
},
{
"epoch": 0.0853714813105676,
"grad_norm": 0.24660415947437286,
"learning_rate": 0.00017070357554786622,
"loss": 0.9319,
"step": 1480
},
{
"epoch": 0.08565989847715735,
"grad_norm": 0.25426214933395386,
"learning_rate": 0.000171280276816609,
"loss": 1.0245,
"step": 1485
},
{
"epoch": 0.08594831564374712,
"grad_norm": 0.23765899240970612,
"learning_rate": 0.0001718569780853518,
"loss": 0.9202,
"step": 1490
},
{
"epoch": 0.08623673281033688,
"grad_norm": 0.24204228818416595,
"learning_rate": 0.00017243367935409457,
"loss": 0.9974,
"step": 1495
},
{
"epoch": 0.08652514997692663,
"grad_norm": 0.23034018278121948,
"learning_rate": 0.0001730103806228374,
"loss": 0.9251,
"step": 1500
},
{
"epoch": 0.08681356714351639,
"grad_norm": 0.24768561124801636,
"learning_rate": 0.00017358708189158016,
"loss": 0.957,
"step": 1505
},
{
"epoch": 0.08710198431010614,
"grad_norm": 0.24252378940582275,
"learning_rate": 0.00017416378316032296,
"loss": 0.9347,
"step": 1510
},
{
"epoch": 0.0873904014766959,
"grad_norm": 0.24422116577625275,
"learning_rate": 0.00017474048442906573,
"loss": 0.956,
"step": 1515
},
{
"epoch": 0.08767881864328565,
"grad_norm": 0.25470009446144104,
"learning_rate": 0.00017531718569780856,
"loss": 0.9355,
"step": 1520
},
{
"epoch": 0.08796723580987541,
"grad_norm": 0.240427628159523,
"learning_rate": 0.00017589388696655133,
"loss": 1.0345,
"step": 1525
},
{
"epoch": 0.08825565297646516,
"grad_norm": 0.2679055631160736,
"learning_rate": 0.00017647058823529413,
"loss": 1.0215,
"step": 1530
},
{
"epoch": 0.08854407014305492,
"grad_norm": 0.2706778943538666,
"learning_rate": 0.00017704728950403693,
"loss": 0.9951,
"step": 1535
},
{
"epoch": 0.08883248730964467,
"grad_norm": 0.24882011115550995,
"learning_rate": 0.00017762399077277973,
"loss": 1.0267,
"step": 1540
},
{
"epoch": 0.08912090447623443,
"grad_norm": 0.24369126558303833,
"learning_rate": 0.0001782006920415225,
"loss": 1.046,
"step": 1545
},
{
"epoch": 0.08940932164282418,
"grad_norm": 0.27035751938819885,
"learning_rate": 0.0001787773933102653,
"loss": 1.0522,
"step": 1550
},
{
"epoch": 0.08969773880941394,
"grad_norm": 0.25707873702049255,
"learning_rate": 0.0001793540945790081,
"loss": 0.9507,
"step": 1555
},
{
"epoch": 0.08998615597600369,
"grad_norm": 0.26456013321876526,
"learning_rate": 0.00017993079584775087,
"loss": 0.9941,
"step": 1560
},
{
"epoch": 0.09027457314259345,
"grad_norm": 0.26937803626060486,
"learning_rate": 0.00018050749711649367,
"loss": 1.0267,
"step": 1565
},
{
"epoch": 0.0905629903091832,
"grad_norm": 0.2615615725517273,
"learning_rate": 0.00018108419838523644,
"loss": 0.984,
"step": 1570
},
{
"epoch": 0.09085140747577296,
"grad_norm": 0.23720060288906097,
"learning_rate": 0.00018166089965397926,
"loss": 0.9401,
"step": 1575
},
{
"epoch": 0.09113982464236271,
"grad_norm": 0.24640457332134247,
"learning_rate": 0.00018223760092272203,
"loss": 1.086,
"step": 1580
},
{
"epoch": 0.09142824180895247,
"grad_norm": 0.2521013915538788,
"learning_rate": 0.00018281430219146483,
"loss": 0.9619,
"step": 1585
},
{
"epoch": 0.09171665897554222,
"grad_norm": 0.23948408663272858,
"learning_rate": 0.0001833910034602076,
"loss": 0.9835,
"step": 1590
},
{
"epoch": 0.09200507614213198,
"grad_norm": 0.25325456261634827,
"learning_rate": 0.00018396770472895043,
"loss": 1.0552,
"step": 1595
},
{
"epoch": 0.09229349330872173,
"grad_norm": 0.24731087684631348,
"learning_rate": 0.0001845444059976932,
"loss": 0.9253,
"step": 1600
},
{
"epoch": 0.09258191047531149,
"grad_norm": 0.26164206862449646,
"learning_rate": 0.000185121107266436,
"loss": 0.9396,
"step": 1605
},
{
"epoch": 0.09287032764190124,
"grad_norm": 0.25318196415901184,
"learning_rate": 0.0001856978085351788,
"loss": 0.9431,
"step": 1610
},
{
"epoch": 0.093158744808491,
"grad_norm": 0.2592536211013794,
"learning_rate": 0.00018627450980392157,
"loss": 0.9955,
"step": 1615
},
{
"epoch": 0.09344716197508075,
"grad_norm": 0.2497592270374298,
"learning_rate": 0.00018685121107266437,
"loss": 0.9844,
"step": 1620
},
{
"epoch": 0.09373557914167051,
"grad_norm": 0.2648375630378723,
"learning_rate": 0.00018742791234140714,
"loss": 0.9655,
"step": 1625
},
{
"epoch": 0.09402399630826026,
"grad_norm": 0.25172188878059387,
"learning_rate": 0.00018800461361014997,
"loss": 1.0322,
"step": 1630
},
{
"epoch": 0.09431241347485002,
"grad_norm": 0.24844340980052948,
"learning_rate": 0.00018858131487889274,
"loss": 0.9636,
"step": 1635
},
{
"epoch": 0.09460083064143977,
"grad_norm": 0.25023674964904785,
"learning_rate": 0.00018915801614763554,
"loss": 0.9601,
"step": 1640
},
{
"epoch": 0.09488924780802953,
"grad_norm": 0.2417484074831009,
"learning_rate": 0.0001897347174163783,
"loss": 0.9748,
"step": 1645
},
{
"epoch": 0.09517766497461928,
"grad_norm": 0.2597021162509918,
"learning_rate": 0.00019031141868512113,
"loss": 0.9672,
"step": 1650
},
{
"epoch": 0.09546608214120904,
"grad_norm": 0.25209182500839233,
"learning_rate": 0.0001908881199538639,
"loss": 0.9766,
"step": 1655
},
{
"epoch": 0.0957544993077988,
"grad_norm": 0.2704354226589203,
"learning_rate": 0.0001914648212226067,
"loss": 0.9658,
"step": 1660
},
{
"epoch": 0.09604291647438856,
"grad_norm": 0.2553963363170624,
"learning_rate": 0.00019204152249134948,
"loss": 0.972,
"step": 1665
},
{
"epoch": 0.09633133364097832,
"grad_norm": 0.25183454155921936,
"learning_rate": 0.00019261822376009227,
"loss": 0.9312,
"step": 1670
},
{
"epoch": 0.09661975080756807,
"grad_norm": 0.27272742986679077,
"learning_rate": 0.00019319492502883507,
"loss": 1.0585,
"step": 1675
},
{
"epoch": 0.09690816797415783,
"grad_norm": 0.25347381830215454,
"learning_rate": 0.00019377162629757784,
"loss": 1.0013,
"step": 1680
},
{
"epoch": 0.09719658514074758,
"grad_norm": 0.26412150263786316,
"learning_rate": 0.00019434832756632067,
"loss": 0.9175,
"step": 1685
},
{
"epoch": 0.09748500230733734,
"grad_norm": 0.2841266393661499,
"learning_rate": 0.00019492502883506344,
"loss": 0.8907,
"step": 1690
},
{
"epoch": 0.09777341947392709,
"grad_norm": 0.2843879163265228,
"learning_rate": 0.00019550173010380624,
"loss": 0.9952,
"step": 1695
},
{
"epoch": 0.09806183664051685,
"grad_norm": 0.24573901295661926,
"learning_rate": 0.000196078431372549,
"loss": 1.0093,
"step": 1700
},
{
"epoch": 0.0983502538071066,
"grad_norm": 0.25996410846710205,
"learning_rate": 0.00019665513264129184,
"loss": 1.0403,
"step": 1705
},
{
"epoch": 0.09863867097369636,
"grad_norm": 0.26386144757270813,
"learning_rate": 0.0001972318339100346,
"loss": 1.0211,
"step": 1710
},
{
"epoch": 0.09892708814028611,
"grad_norm": 0.26584669947624207,
"learning_rate": 0.0001978085351787774,
"loss": 0.9985,
"step": 1715
},
{
"epoch": 0.09921550530687587,
"grad_norm": 0.25835517048835754,
"learning_rate": 0.00019838523644752018,
"loss": 0.9615,
"step": 1720
},
{
"epoch": 0.09950392247346562,
"grad_norm": 0.2537446618080139,
"learning_rate": 0.000198961937716263,
"loss": 0.9851,
"step": 1725
},
{
"epoch": 0.09979233964005538,
"grad_norm": 0.2637675702571869,
"learning_rate": 0.00019953863898500578,
"loss": 0.9991,
"step": 1730
},
{
"epoch": 0.10008075680664513,
"grad_norm": 0.2486466020345688,
"learning_rate": 0.00019999999797274117,
"loss": 0.928,
"step": 1735
},
{
"epoch": 0.10036917397323489,
"grad_norm": 0.31705260276794434,
"learning_rate": 0.0001999999270186907,
"loss": 0.9909,
"step": 1740
},
{
"epoch": 0.10065759113982464,
"grad_norm": 0.2822314500808716,
"learning_rate": 0.0001999997547017808,
"loss": 0.9688,
"step": 1745
},
{
"epoch": 0.1009460083064144,
"grad_norm": 0.2564781606197357,
"learning_rate": 0.0001999994810221862,
"loss": 0.9515,
"step": 1750
},
{
"epoch": 0.10123442547300415,
"grad_norm": 0.2958817183971405,
"learning_rate": 0.00019999910598018426,
"loss": 0.9859,
"step": 1755
},
{
"epoch": 0.10152284263959391,
"grad_norm": 0.25060567259788513,
"learning_rate": 0.00019999862957615513,
"loss": 1.0043,
"step": 1760
},
{
"epoch": 0.10181125980618366,
"grad_norm": 0.2674092650413513,
"learning_rate": 0.00019999805181058176,
"loss": 0.9626,
"step": 1765
},
{
"epoch": 0.10209967697277342,
"grad_norm": 0.2575248181819916,
"learning_rate": 0.00019999737268404973,
"loss": 1.0265,
"step": 1770
},
{
"epoch": 0.10238809413936317,
"grad_norm": 0.2554805278778076,
"learning_rate": 0.00019999659219724749,
"loss": 0.9661,
"step": 1775
},
{
"epoch": 0.10267651130595293,
"grad_norm": 0.26680126786231995,
"learning_rate": 0.00019999571035096608,
"loss": 1.0231,
"step": 1780
},
{
"epoch": 0.10296492847254268,
"grad_norm": 0.25776219367980957,
"learning_rate": 0.00019999472714609943,
"loss": 0.9058,
"step": 1785
},
{
"epoch": 0.10325334563913244,
"grad_norm": 0.2542843818664551,
"learning_rate": 0.00019999364258364413,
"loss": 0.9773,
"step": 1790
},
{
"epoch": 0.10354176280572219,
"grad_norm": 0.2621992826461792,
"learning_rate": 0.0001999924566646995,
"loss": 0.9559,
"step": 1795
},
{
"epoch": 0.10383017997231195,
"grad_norm": 0.2683923840522766,
"learning_rate": 0.00019999116939046764,
"loss": 1.0355,
"step": 1800
},
{
"epoch": 0.1041185971389017,
"grad_norm": 0.24701032042503357,
"learning_rate": 0.0001999897807622534,
"loss": 1.0906,
"step": 1805
},
{
"epoch": 0.10440701430549146,
"grad_norm": 0.25396963953971863,
"learning_rate": 0.0001999882907814643,
"loss": 1.0226,
"step": 1810
},
{
"epoch": 0.10469543147208121,
"grad_norm": 0.28205832839012146,
"learning_rate": 0.00019998669944961062,
"loss": 0.9224,
"step": 1815
},
{
"epoch": 0.10498384863867097,
"grad_norm": 0.26078683137893677,
"learning_rate": 0.0001999850067683054,
"loss": 0.9427,
"step": 1820
},
{
"epoch": 0.10527226580526072,
"grad_norm": 0.25481727719306946,
"learning_rate": 0.00019998321273926437,
"loss": 1.0042,
"step": 1825
},
{
"epoch": 0.10556068297185048,
"grad_norm": 0.25570574402809143,
"learning_rate": 0.00019998131736430604,
"loss": 0.9722,
"step": 1830
},
{
"epoch": 0.10584910013844025,
"grad_norm": 0.2734397351741791,
"learning_rate": 0.00019997932064535158,
"loss": 1.001,
"step": 1835
},
{
"epoch": 0.10613751730503,
"grad_norm": 0.27242162823677063,
"learning_rate": 0.00019997722258442499,
"loss": 0.9647,
"step": 1840
},
{
"epoch": 0.10642593447161976,
"grad_norm": 0.2732183635234833,
"learning_rate": 0.00019997502318365286,
"loss": 0.9697,
"step": 1845
},
{
"epoch": 0.10671435163820951,
"grad_norm": 0.26898330450057983,
"learning_rate": 0.00019997272244526456,
"loss": 0.9284,
"step": 1850
},
{
"epoch": 0.10700276880479927,
"grad_norm": 0.2656812071800232,
"learning_rate": 0.00019997032037159224,
"loss": 1.0368,
"step": 1855
},
{
"epoch": 0.10729118597138902,
"grad_norm": 0.2728678584098816,
"learning_rate": 0.00019996781696507069,
"loss": 1.0147,
"step": 1860
},
{
"epoch": 0.10757960313797878,
"grad_norm": 0.2543455958366394,
"learning_rate": 0.00019996521222823743,
"loss": 0.954,
"step": 1865
},
{
"epoch": 0.10786802030456853,
"grad_norm": 0.27658751606941223,
"learning_rate": 0.00019996250616373268,
"loss": 0.9796,
"step": 1870
},
{
"epoch": 0.10815643747115829,
"grad_norm": 0.27136722207069397,
"learning_rate": 0.00019995969877429945,
"loss": 0.9125,
"step": 1875
},
{
"epoch": 0.10844485463774804,
"grad_norm": 0.2712014317512512,
"learning_rate": 0.0001999567900627833,
"loss": 1.0053,
"step": 1880
},
{
"epoch": 0.1087332718043378,
"grad_norm": 0.2740635573863983,
"learning_rate": 0.0001999537800321327,
"loss": 0.9951,
"step": 1885
},
{
"epoch": 0.10902168897092755,
"grad_norm": 0.26667481660842896,
"learning_rate": 0.0001999506686853986,
"loss": 1.0062,
"step": 1890
},
{
"epoch": 0.10931010613751731,
"grad_norm": 0.2604423463344574,
"learning_rate": 0.0001999474560257348,
"loss": 0.9852,
"step": 1895
},
{
"epoch": 0.10959852330410706,
"grad_norm": 0.27640554308891296,
"learning_rate": 0.00019994414205639775,
"loss": 0.959,
"step": 1900
},
{
"epoch": 0.10988694047069682,
"grad_norm": 0.25489839911460876,
"learning_rate": 0.00019994072678074655,
"loss": 0.9957,
"step": 1905
},
{
"epoch": 0.11017535763728657,
"grad_norm": 0.2796529233455658,
"learning_rate": 0.00019993721020224308,
"loss": 0.9418,
"step": 1910
},
{
"epoch": 0.11046377480387633,
"grad_norm": 0.2622373402118683,
"learning_rate": 0.00019993359232445176,
"loss": 0.9573,
"step": 1915
},
{
"epoch": 0.11075219197046608,
"grad_norm": 0.2514156997203827,
"learning_rate": 0.0001999298731510399,
"loss": 0.9373,
"step": 1920
},
{
"epoch": 0.11104060913705584,
"grad_norm": 0.2672327160835266,
"learning_rate": 0.00019992605268577727,
"loss": 0.9097,
"step": 1925
},
{
"epoch": 0.11132902630364559,
"grad_norm": 0.26772674918174744,
"learning_rate": 0.00019992213093253643,
"loss": 1.0108,
"step": 1930
},
{
"epoch": 0.11161744347023535,
"grad_norm": 0.2462950050830841,
"learning_rate": 0.00019991810789529257,
"loss": 1.0006,
"step": 1935
},
{
"epoch": 0.1119058606368251,
"grad_norm": 0.26759883761405945,
"learning_rate": 0.0001999139835781236,
"loss": 0.9758,
"step": 1940
},
{
"epoch": 0.11219427780341486,
"grad_norm": 0.2841535806655884,
"learning_rate": 0.00019990975798521,
"loss": 1.0408,
"step": 1945
},
{
"epoch": 0.11248269497000461,
"grad_norm": 0.2822214365005493,
"learning_rate": 0.00019990543112083503,
"loss": 0.9317,
"step": 1950
},
{
"epoch": 0.11277111213659437,
"grad_norm": 0.2670351564884186,
"learning_rate": 0.00019990100298938442,
"loss": 0.9536,
"step": 1955
},
{
"epoch": 0.11305952930318412,
"grad_norm": 0.27470991015434265,
"learning_rate": 0.00019989647359534672,
"loss": 1.0404,
"step": 1960
},
{
"epoch": 0.11334794646977388,
"grad_norm": 0.2892574071884155,
"learning_rate": 0.00019989184294331308,
"loss": 0.9912,
"step": 1965
},
{
"epoch": 0.11363636363636363,
"grad_norm": 0.28786224126815796,
"learning_rate": 0.0001998871110379772,
"loss": 1.048,
"step": 1970
},
{
"epoch": 0.11392478080295339,
"grad_norm": 0.2730783522129059,
"learning_rate": 0.0001998822778841355,
"loss": 1.0148,
"step": 1975
},
{
"epoch": 0.11421319796954314,
"grad_norm": 0.25908493995666504,
"learning_rate": 0.00019987734348668706,
"loss": 0.9237,
"step": 1980
},
{
"epoch": 0.1145016151361329,
"grad_norm": 0.2924931049346924,
"learning_rate": 0.00019987230785063344,
"loss": 1.0084,
"step": 1985
},
{
"epoch": 0.11479003230272265,
"grad_norm": 0.2685001790523529,
"learning_rate": 0.00019986717098107896,
"loss": 0.977,
"step": 1990
},
{
"epoch": 0.11507844946931241,
"grad_norm": 0.26407670974731445,
"learning_rate": 0.0001998619328832305,
"loss": 1.0132,
"step": 1995
},
{
"epoch": 0.11536686663590216,
"grad_norm": 0.2581160366535187,
"learning_rate": 0.00019985659356239758,
"loss": 1.0553,
"step": 2000
},
{
"epoch": 0.11565528380249192,
"grad_norm": 0.2579261064529419,
"learning_rate": 0.0001998511530239922,
"loss": 0.992,
"step": 2005
},
{
"epoch": 0.11594370096908169,
"grad_norm": 0.27874529361724854,
"learning_rate": 0.00019984561127352914,
"loss": 1.0208,
"step": 2010
},
{
"epoch": 0.11623211813567144,
"grad_norm": 0.2448752522468567,
"learning_rate": 0.00019983996831662566,
"loss": 1.0272,
"step": 2015
},
{
"epoch": 0.1165205353022612,
"grad_norm": 0.2515913248062134,
"learning_rate": 0.00019983422415900158,
"loss": 1.0251,
"step": 2020
},
{
"epoch": 0.11680895246885095,
"grad_norm": 0.2612157464027405,
"learning_rate": 0.0001998283788064794,
"loss": 0.9298,
"step": 2025
},
{
"epoch": 0.1170973696354407,
"grad_norm": 0.2781950533390045,
"learning_rate": 0.00019982243226498411,
"loss": 1.0191,
"step": 2030
},
{
"epoch": 0.11738578680203046,
"grad_norm": 0.27393776178359985,
"learning_rate": 0.00019981638454054333,
"loss": 0.8712,
"step": 2035
},
{
"epoch": 0.11767420396862022,
"grad_norm": 0.271932452917099,
"learning_rate": 0.00019981023563928716,
"loss": 0.9644,
"step": 2040
},
{
"epoch": 0.11796262113520997,
"grad_norm": 0.2659457325935364,
"learning_rate": 0.00019980398556744837,
"loss": 0.9295,
"step": 2045
},
{
"epoch": 0.11825103830179973,
"grad_norm": 0.2813827395439148,
"learning_rate": 0.00019979763433136216,
"loss": 0.975,
"step": 2050
},
{
"epoch": 0.11853945546838948,
"grad_norm": 0.24046528339385986,
"learning_rate": 0.00019979118193746637,
"loss": 0.9836,
"step": 2055
},
{
"epoch": 0.11882787263497924,
"grad_norm": 0.27069780230522156,
"learning_rate": 0.00019978462839230133,
"loss": 1.0503,
"step": 2060
},
{
"epoch": 0.11911628980156899,
"grad_norm": 0.2609676718711853,
"learning_rate": 0.00019977797370250986,
"loss": 0.959,
"step": 2065
},
{
"epoch": 0.11940470696815875,
"grad_norm": 0.2760465145111084,
"learning_rate": 0.0001997712178748374,
"loss": 1.0014,
"step": 2070
},
{
"epoch": 0.1196931241347485,
"grad_norm": 0.2539708614349365,
"learning_rate": 0.00019976436091613184,
"loss": 1.0215,
"step": 2075
},
{
"epoch": 0.11998154130133826,
"grad_norm": 0.27062153816223145,
"learning_rate": 0.0001997574028333436,
"loss": 0.964,
"step": 2080
},
{
"epoch": 0.12026995846792801,
"grad_norm": 0.26900675892829895,
"learning_rate": 0.00019975034363352556,
"loss": 0.935,
"step": 2085
},
{
"epoch": 0.12055837563451777,
"grad_norm": 0.27462172508239746,
"learning_rate": 0.0001997431833238332,
"loss": 0.974,
"step": 2090
},
{
"epoch": 0.12084679280110752,
"grad_norm": 0.3665010333061218,
"learning_rate": 0.00019973592191152437,
"loss": 1.0159,
"step": 2095
},
{
"epoch": 0.12113520996769728,
"grad_norm": 0.28900420665740967,
"learning_rate": 0.00019972855940395947,
"loss": 1.0202,
"step": 2100
},
{
"epoch": 0.12142362713428703,
"grad_norm": 0.2706412374973297,
"learning_rate": 0.00019972109580860132,
"loss": 0.9766,
"step": 2105
},
{
"epoch": 0.12171204430087679,
"grad_norm": 0.28748854994773865,
"learning_rate": 0.00019971353113301527,
"loss": 1.095,
"step": 2110
},
{
"epoch": 0.12200046146746654,
"grad_norm": 0.2745112180709839,
"learning_rate": 0.0001997058653848691,
"loss": 0.9995,
"step": 2115
},
{
"epoch": 0.1222888786340563,
"grad_norm": 0.27372869849205017,
"learning_rate": 0.00019969809857193306,
"loss": 0.9582,
"step": 2120
},
{
"epoch": 0.12257729580064605,
"grad_norm": 0.2714395821094513,
"learning_rate": 0.00019969023070207973,
"loss": 0.9423,
"step": 2125
},
{
"epoch": 0.12286571296723581,
"grad_norm": 0.26695722341537476,
"learning_rate": 0.0001996822617832843,
"loss": 0.9192,
"step": 2130
},
{
"epoch": 0.12315413013382556,
"grad_norm": 0.2779480814933777,
"learning_rate": 0.00019967419182362429,
"loss": 0.9577,
"step": 2135
},
{
"epoch": 0.12344254730041532,
"grad_norm": 0.279851496219635,
"learning_rate": 0.0001996660208312796,
"loss": 0.9946,
"step": 2140
},
{
"epoch": 0.12373096446700507,
"grad_norm": 0.2676329016685486,
"learning_rate": 0.00019965774881453263,
"loss": 1.0293,
"step": 2145
},
{
"epoch": 0.12401938163359483,
"grad_norm": 0.2577393054962158,
"learning_rate": 0.00019964937578176816,
"loss": 0.9845,
"step": 2150
},
{
"epoch": 0.12430779880018458,
"grad_norm": 0.2870205342769623,
"learning_rate": 0.00019964090174147327,
"loss": 0.9747,
"step": 2155
},
{
"epoch": 0.12459621596677434,
"grad_norm": 0.2597945034503937,
"learning_rate": 0.00019963232670223752,
"loss": 0.9896,
"step": 2160
},
{
"epoch": 0.12488463313336409,
"grad_norm": 0.3189765512943268,
"learning_rate": 0.00019962365067275286,
"loss": 0.9538,
"step": 2165
},
{
"epoch": 0.12517305029995385,
"grad_norm": 0.27205929160118103,
"learning_rate": 0.00019961487366181355,
"loss": 0.9626,
"step": 2170
},
{
"epoch": 0.1254614674665436,
"grad_norm": 0.26647019386291504,
"learning_rate": 0.0001996059956783162,
"loss": 1.0142,
"step": 2175
},
{
"epoch": 0.12574988463313336,
"grad_norm": 0.2724989652633667,
"learning_rate": 0.00019959701673125983,
"loss": 1.0228,
"step": 2180
},
{
"epoch": 0.1260383017997231,
"grad_norm": 0.27627307176589966,
"learning_rate": 0.00019958793682974574,
"loss": 0.9744,
"step": 2185
},
{
"epoch": 0.12632671896631287,
"grad_norm": 0.2836136221885681,
"learning_rate": 0.00019957875598297759,
"loss": 1.0011,
"step": 2190
},
{
"epoch": 0.12661513613290262,
"grad_norm": 0.26454490423202515,
"learning_rate": 0.00019956947420026136,
"loss": 1.0463,
"step": 2195
},
{
"epoch": 0.12690355329949238,
"grad_norm": 0.29074445366859436,
"learning_rate": 0.00019956009149100533,
"loss": 0.9643,
"step": 2200
},
{
"epoch": 0.12719197046608213,
"grad_norm": 0.2764613926410675,
"learning_rate": 0.00019955060786472012,
"loss": 0.9245,
"step": 2205
},
{
"epoch": 0.1274803876326719,
"grad_norm": 0.2702649235725403,
"learning_rate": 0.00019954102333101856,
"loss": 0.9734,
"step": 2210
},
{
"epoch": 0.12776880479926164,
"grad_norm": 0.28136304020881653,
"learning_rate": 0.00019953133789961584,
"loss": 0.9782,
"step": 2215
},
{
"epoch": 0.1280572219658514,
"grad_norm": 0.29559558629989624,
"learning_rate": 0.0001995215515803294,
"loss": 0.9708,
"step": 2220
},
{
"epoch": 0.12834563913244115,
"grad_norm": 0.2811656892299652,
"learning_rate": 0.00019951166438307894,
"loss": 0.9839,
"step": 2225
},
{
"epoch": 0.1286340562990309,
"grad_norm": 0.27432867884635925,
"learning_rate": 0.00019950167631788642,
"loss": 0.9697,
"step": 2230
},
{
"epoch": 0.12892247346562066,
"grad_norm": 0.28106796741485596,
"learning_rate": 0.000199491587394876,
"loss": 0.9526,
"step": 2235
},
{
"epoch": 0.12921089063221042,
"grad_norm": 0.2755594253540039,
"learning_rate": 0.00019948139762427416,
"loss": 0.9943,
"step": 2240
},
{
"epoch": 0.12949930779880017,
"grad_norm": 0.27341076731681824,
"learning_rate": 0.00019947110701640952,
"loss": 0.9661,
"step": 2245
},
{
"epoch": 0.12978772496538993,
"grad_norm": 0.2582038938999176,
"learning_rate": 0.000199460715581713,
"loss": 0.9083,
"step": 2250
},
{
"epoch": 0.13007614213197968,
"grad_norm": 0.2739073932170868,
"learning_rate": 0.00019945022333071752,
"loss": 1.0518,
"step": 2255
},
{
"epoch": 0.13036455929856944,
"grad_norm": 0.2646303176879883,
"learning_rate": 0.0001994396302740585,
"loss": 0.9709,
"step": 2260
},
{
"epoch": 0.1306529764651592,
"grad_norm": 0.2723826766014099,
"learning_rate": 0.00019942893642247326,
"loss": 0.9845,
"step": 2265
},
{
"epoch": 0.13094139363174895,
"grad_norm": 0.27351605892181396,
"learning_rate": 0.00019941814178680144,
"loss": 1.0138,
"step": 2270
},
{
"epoch": 0.13122981079833873,
"grad_norm": 0.2802083492279053,
"learning_rate": 0.00019940724637798477,
"loss": 0.9364,
"step": 2275
},
{
"epoch": 0.13151822796492849,
"grad_norm": 0.27607461810112,
"learning_rate": 0.00019939625020706724,
"loss": 0.9931,
"step": 2280
},
{
"epoch": 0.13180664513151824,
"grad_norm": 0.270385205745697,
"learning_rate": 0.0001993851532851948,
"loss": 0.9763,
"step": 2285
},
{
"epoch": 0.132095062298108,
"grad_norm": 0.2873282730579376,
"learning_rate": 0.00019937395562361564,
"loss": 1.0417,
"step": 2290
},
{
"epoch": 0.13238347946469775,
"grad_norm": 0.2726912796497345,
"learning_rate": 0.0001993626572336801,
"loss": 0.9555,
"step": 2295
},
{
"epoch": 0.1326718966312875,
"grad_norm": 0.2793363332748413,
"learning_rate": 0.00019935125812684047,
"loss": 0.9883,
"step": 2300
},
{
"epoch": 0.13296031379787726,
"grad_norm": 0.2792257070541382,
"learning_rate": 0.0001993397583146513,
"loss": 1.0003,
"step": 2305
},
{
"epoch": 0.13324873096446702,
"grad_norm": 0.27051353454589844,
"learning_rate": 0.00019932815780876904,
"loss": 0.9726,
"step": 2310
},
{
"epoch": 0.13353714813105677,
"grad_norm": 0.28619712591171265,
"learning_rate": 0.00019931645662095237,
"loss": 0.9621,
"step": 2315
},
{
"epoch": 0.13382556529764653,
"grad_norm": 0.27812543511390686,
"learning_rate": 0.00019930465476306197,
"loss": 0.9909,
"step": 2320
},
{
"epoch": 0.13411398246423628,
"grad_norm": 0.27520883083343506,
"learning_rate": 0.0001992927522470605,
"loss": 1.0185,
"step": 2325
},
{
"epoch": 0.13440239963082604,
"grad_norm": 0.27513301372528076,
"learning_rate": 0.00019928074908501272,
"loss": 0.9595,
"step": 2330
},
{
"epoch": 0.1346908167974158,
"grad_norm": 0.29639777541160583,
"learning_rate": 0.0001992686452890854,
"loss": 0.9819,
"step": 2335
},
{
"epoch": 0.13497923396400555,
"grad_norm": 0.2893521189689636,
"learning_rate": 0.00019925644087154734,
"loss": 0.9894,
"step": 2340
},
{
"epoch": 0.1352676511305953,
"grad_norm": 0.267421156167984,
"learning_rate": 0.0001992441358447692,
"loss": 0.9882,
"step": 2345
},
{
"epoch": 0.13555606829718506,
"grad_norm": 0.2774795591831207,
"learning_rate": 0.00019923173022122378,
"loss": 0.9404,
"step": 2350
},
{
"epoch": 0.1358444854637748,
"grad_norm": 0.30167555809020996,
"learning_rate": 0.00019921922401348576,
"loss": 0.9631,
"step": 2355
},
{
"epoch": 0.13613290263036457,
"grad_norm": 0.2823658287525177,
"learning_rate": 0.00019920661723423183,
"loss": 0.9271,
"step": 2360
},
{
"epoch": 0.13642131979695432,
"grad_norm": 0.2752264142036438,
"learning_rate": 0.00019919390989624054,
"loss": 0.981,
"step": 2365
},
{
"epoch": 0.13670973696354408,
"grad_norm": 0.284186989068985,
"learning_rate": 0.00019918110201239247,
"loss": 1.0279,
"step": 2370
},
{
"epoch": 0.13699815413013383,
"grad_norm": 0.2601034343242645,
"learning_rate": 0.00019916819359567001,
"loss": 1.0219,
"step": 2375
},
{
"epoch": 0.1372865712967236,
"grad_norm": 0.3391975164413452,
"learning_rate": 0.00019915518465915758,
"loss": 0.9432,
"step": 2380
},
{
"epoch": 0.13757498846331334,
"grad_norm": 0.3057229816913605,
"learning_rate": 0.0001991420752160414,
"loss": 1.0415,
"step": 2385
},
{
"epoch": 0.1378634056299031,
"grad_norm": 0.2857256829738617,
"learning_rate": 0.00019912886527960954,
"loss": 0.9896,
"step": 2390
},
{
"epoch": 0.13815182279649285,
"grad_norm": 0.4211989641189575,
"learning_rate": 0.00019911555486325203,
"loss": 1.0471,
"step": 2395
},
{
"epoch": 0.1384402399630826,
"grad_norm": 0.26847025752067566,
"learning_rate": 0.0001991021439804607,
"loss": 1.0071,
"step": 2400
},
{
"epoch": 0.13872865712967236,
"grad_norm": 0.27097341418266296,
"learning_rate": 0.00019908863264482917,
"loss": 0.9493,
"step": 2405
},
{
"epoch": 0.13901707429626212,
"grad_norm": 0.2873136103153229,
"learning_rate": 0.00019907502087005297,
"loss": 1.0064,
"step": 2410
},
{
"epoch": 0.13930549146285187,
"grad_norm": 0.2804831564426422,
"learning_rate": 0.00019906130866992935,
"loss": 0.9483,
"step": 2415
},
{
"epoch": 0.13959390862944163,
"grad_norm": 0.27144983410835266,
"learning_rate": 0.00019904749605835742,
"loss": 0.9541,
"step": 2420
},
{
"epoch": 0.13988232579603138,
"grad_norm": 0.2791461944580078,
"learning_rate": 0.00019903358304933805,
"loss": 1.0228,
"step": 2425
},
{
"epoch": 0.14017074296262114,
"grad_norm": 0.2839184105396271,
"learning_rate": 0.00019901956965697387,
"loss": 0.9853,
"step": 2430
},
{
"epoch": 0.1404591601292109,
"grad_norm": 0.2938236594200134,
"learning_rate": 0.0001990054558954693,
"loss": 1.0175,
"step": 2435
},
{
"epoch": 0.14074757729580065,
"grad_norm": 0.26195093989372253,
"learning_rate": 0.00019899124177913041,
"loss": 0.9927,
"step": 2440
},
{
"epoch": 0.1410359944623904,
"grad_norm": 0.282997727394104,
"learning_rate": 0.0001989769273223651,
"loss": 0.9148,
"step": 2445
},
{
"epoch": 0.14132441162898016,
"grad_norm": 0.2869815230369568,
"learning_rate": 0.00019896251253968288,
"loss": 0.9978,
"step": 2450
},
{
"epoch": 0.1416128287955699,
"grad_norm": 0.30306002497673035,
"learning_rate": 0.000198947997445695,
"loss": 0.9793,
"step": 2455
},
{
"epoch": 0.14190124596215967,
"grad_norm": 0.2726587951183319,
"learning_rate": 0.0001989333820551144,
"loss": 0.8918,
"step": 2460
},
{
"epoch": 0.14218966312874942,
"grad_norm": 0.3028129041194916,
"learning_rate": 0.00019891866638275564,
"loss": 1.0184,
"step": 2465
},
{
"epoch": 0.14247808029533918,
"grad_norm": 0.27245384454727173,
"learning_rate": 0.00019890385044353501,
"loss": 0.9187,
"step": 2470
},
{
"epoch": 0.14276649746192893,
"grad_norm": 0.26684272289276123,
"learning_rate": 0.00019888893425247032,
"loss": 0.94,
"step": 2475
},
{
"epoch": 0.1430549146285187,
"grad_norm": 0.26761725544929504,
"learning_rate": 0.00019887391782468113,
"loss": 0.9606,
"step": 2480
},
{
"epoch": 0.14334333179510844,
"grad_norm": 0.2789659798145294,
"learning_rate": 0.00019885880117538846,
"loss": 0.9361,
"step": 2485
},
{
"epoch": 0.1436317489616982,
"grad_norm": 0.2568376362323761,
"learning_rate": 0.000198843584319915,
"loss": 1.0155,
"step": 2490
},
{
"epoch": 0.14392016612828795,
"grad_norm": 0.29699787497520447,
"learning_rate": 0.00019882826727368508,
"loss": 1.0136,
"step": 2495
},
{
"epoch": 0.1442085832948777,
"grad_norm": 0.3011142313480377,
"learning_rate": 0.0001988128500522244,
"loss": 0.9967,
"step": 2500
},
{
"epoch": 0.14449700046146746,
"grad_norm": 0.27386248111724854,
"learning_rate": 0.00019879733267116035,
"loss": 1.0263,
"step": 2505
},
{
"epoch": 0.14478541762805722,
"grad_norm": 0.31453463435173035,
"learning_rate": 0.00019878171514622187,
"loss": 0.9307,
"step": 2510
},
{
"epoch": 0.14507383479464697,
"grad_norm": 0.2672314941883087,
"learning_rate": 0.0001987659974932392,
"loss": 0.9441,
"step": 2515
},
{
"epoch": 0.14536225196123673,
"grad_norm": 0.2847091257572174,
"learning_rate": 0.00019875017972814435,
"loss": 0.9868,
"step": 2520
},
{
"epoch": 0.14565066912782648,
"grad_norm": 0.28868651390075684,
"learning_rate": 0.0001987342618669706,
"loss": 0.9296,
"step": 2525
},
{
"epoch": 0.14593908629441624,
"grad_norm": 0.29168251156806946,
"learning_rate": 0.00019871824392585276,
"loss": 0.9317,
"step": 2530
},
{
"epoch": 0.146227503461006,
"grad_norm": 0.2743743062019348,
"learning_rate": 0.00019870212592102711,
"loss": 1.0277,
"step": 2535
},
{
"epoch": 0.14651592062759575,
"grad_norm": 0.2812393605709076,
"learning_rate": 0.00019868590786883134,
"loss": 1.0553,
"step": 2540
},
{
"epoch": 0.1468043377941855,
"grad_norm": 0.2678181231021881,
"learning_rate": 0.00019866958978570452,
"loss": 0.8821,
"step": 2545
},
{
"epoch": 0.14709275496077526,
"grad_norm": 0.3037974238395691,
"learning_rate": 0.00019865317168818713,
"loss": 0.9625,
"step": 2550
},
{
"epoch": 0.147381172127365,
"grad_norm": 0.2820071578025818,
"learning_rate": 0.00019863665359292108,
"loss": 1.0259,
"step": 2555
},
{
"epoch": 0.14766958929395477,
"grad_norm": 0.2591807544231415,
"learning_rate": 0.0001986200355166495,
"loss": 0.9521,
"step": 2560
},
{
"epoch": 0.14795800646054452,
"grad_norm": 0.26036834716796875,
"learning_rate": 0.0001986033174762171,
"loss": 0.94,
"step": 2565
},
{
"epoch": 0.14824642362713428,
"grad_norm": 0.27297431230545044,
"learning_rate": 0.0001985864994885697,
"loss": 0.9859,
"step": 2570
},
{
"epoch": 0.14853484079372403,
"grad_norm": 0.27806761860847473,
"learning_rate": 0.00019856958157075445,
"loss": 1.0,
"step": 2575
},
{
"epoch": 0.1488232579603138,
"grad_norm": 0.2749041020870209,
"learning_rate": 0.00019855256373991993,
"loss": 0.9111,
"step": 2580
},
{
"epoch": 0.14911167512690354,
"grad_norm": 0.28046393394470215,
"learning_rate": 0.0001985354460133159,
"loss": 0.9089,
"step": 2585
},
{
"epoch": 0.1494000922934933,
"grad_norm": 0.2683013379573822,
"learning_rate": 0.00019851822840829338,
"loss": 0.9122,
"step": 2590
},
{
"epoch": 0.14968850946008305,
"grad_norm": 0.28444692492485046,
"learning_rate": 0.0001985009109423046,
"loss": 0.9987,
"step": 2595
},
{
"epoch": 0.1499769266266728,
"grad_norm": 0.28526070713996887,
"learning_rate": 0.0001984834936329031,
"loss": 1.0177,
"step": 2600
},
{
"epoch": 0.15026534379326256,
"grad_norm": 0.2751544415950775,
"learning_rate": 0.00019846597649774358,
"loss": 1.0602,
"step": 2605
},
{
"epoch": 0.15055376095985232,
"grad_norm": 0.29558390378952026,
"learning_rate": 0.00019844835955458193,
"loss": 1.0015,
"step": 2610
},
{
"epoch": 0.15084217812644207,
"grad_norm": 0.27498286962509155,
"learning_rate": 0.00019843064282127511,
"loss": 0.9561,
"step": 2615
},
{
"epoch": 0.15113059529303183,
"grad_norm": 0.292961061000824,
"learning_rate": 0.00019841282631578145,
"loss": 0.9914,
"step": 2620
},
{
"epoch": 0.1514190124596216,
"grad_norm": 0.3029356896877289,
"learning_rate": 0.0001983949100561602,
"loss": 0.9801,
"step": 2625
},
{
"epoch": 0.15170742962621137,
"grad_norm": 0.2864689230918884,
"learning_rate": 0.00019837689406057183,
"loss": 0.9578,
"step": 2630
},
{
"epoch": 0.15199584679280112,
"grad_norm": 0.2750813961029053,
"learning_rate": 0.00019835877834727787,
"loss": 0.9483,
"step": 2635
},
{
"epoch": 0.15228426395939088,
"grad_norm": 0.27926185727119446,
"learning_rate": 0.00019834056293464093,
"loss": 1.0165,
"step": 2640
},
{
"epoch": 0.15257268112598063,
"grad_norm": 0.27533864974975586,
"learning_rate": 0.00019832224784112473,
"loss": 1.0241,
"step": 2645
},
{
"epoch": 0.15286109829257039,
"grad_norm": 0.276993989944458,
"learning_rate": 0.00019830383308529393,
"loss": 1.0444,
"step": 2650
},
{
"epoch": 0.15314951545916014,
"grad_norm": 0.2960858643054962,
"learning_rate": 0.0001982853186858143,
"loss": 0.9928,
"step": 2655
},
{
"epoch": 0.1534379326257499,
"grad_norm": 0.29162392020225525,
"learning_rate": 0.00019826670466145262,
"loss": 0.8887,
"step": 2660
},
{
"epoch": 0.15372634979233965,
"grad_norm": 0.2606879472732544,
"learning_rate": 0.0001982479910310765,
"loss": 0.9832,
"step": 2665
},
{
"epoch": 0.1540147669589294,
"grad_norm": 0.29048001766204834,
"learning_rate": 0.00019822917781365474,
"loss": 1.01,
"step": 2670
},
{
"epoch": 0.15430318412551916,
"grad_norm": 0.2942920923233032,
"learning_rate": 0.00019821026502825687,
"loss": 1.0289,
"step": 2675
},
{
"epoch": 0.15459160129210892,
"grad_norm": 0.2862975597381592,
"learning_rate": 0.00019819125269405352,
"loss": 0.9961,
"step": 2680
},
{
"epoch": 0.15488001845869867,
"grad_norm": 0.2896837890148163,
"learning_rate": 0.00019817214083031614,
"loss": 1.0002,
"step": 2685
},
{
"epoch": 0.15516843562528843,
"grad_norm": 0.26825401186943054,
"learning_rate": 0.00019815292945641705,
"loss": 0.9874,
"step": 2690
},
{
"epoch": 0.15545685279187818,
"grad_norm": 0.2813914120197296,
"learning_rate": 0.00019813361859182945,
"loss": 0.9919,
"step": 2695
},
{
"epoch": 0.15574526995846794,
"grad_norm": 0.284069687128067,
"learning_rate": 0.0001981142082561274,
"loss": 0.8997,
"step": 2700
},
{
"epoch": 0.1560336871250577,
"grad_norm": 0.2858209013938904,
"learning_rate": 0.00019809469846898586,
"loss": 0.9546,
"step": 2705
},
{
"epoch": 0.15632210429164745,
"grad_norm": 0.2836093604564667,
"learning_rate": 0.0001980750892501804,
"loss": 0.9254,
"step": 2710
},
{
"epoch": 0.1566105214582372,
"grad_norm": 0.32628414034843445,
"learning_rate": 0.00019805538061958765,
"loss": 0.94,
"step": 2715
},
{
"epoch": 0.15689893862482696,
"grad_norm": 0.2873879373073578,
"learning_rate": 0.0001980355725971847,
"loss": 0.9598,
"step": 2720
},
{
"epoch": 0.1571873557914167,
"grad_norm": 0.27270689606666565,
"learning_rate": 0.00019801566520304963,
"loss": 0.9622,
"step": 2725
},
{
"epoch": 0.15747577295800647,
"grad_norm": 0.25972458720207214,
"learning_rate": 0.0001979956584573612,
"loss": 0.9895,
"step": 2730
},
{
"epoch": 0.15776419012459622,
"grad_norm": 0.2917114198207855,
"learning_rate": 0.00019797555238039872,
"loss": 0.9528,
"step": 2735
},
{
"epoch": 0.15805260729118598,
"grad_norm": 0.26294592022895813,
"learning_rate": 0.00019795534699254238,
"loss": 0.9309,
"step": 2740
},
{
"epoch": 0.15834102445777573,
"grad_norm": 0.28122779726982117,
"learning_rate": 0.0001979350423142729,
"loss": 0.9853,
"step": 2745
},
{
"epoch": 0.15862944162436549,
"grad_norm": 0.29183605313301086,
"learning_rate": 0.00019791463836617176,
"loss": 0.9382,
"step": 2750
},
{
"epoch": 0.15891785879095524,
"grad_norm": 0.28074556589126587,
"learning_rate": 0.00019789413516892098,
"loss": 1.01,
"step": 2755
},
{
"epoch": 0.159206275957545,
"grad_norm": 0.2814944088459015,
"learning_rate": 0.00019787353274330313,
"loss": 1.0161,
"step": 2760
},
{
"epoch": 0.15949469312413475,
"grad_norm": 0.2898254990577698,
"learning_rate": 0.00019785283111020156,
"loss": 1.0388,
"step": 2765
},
{
"epoch": 0.1597831102907245,
"grad_norm": 0.2777402400970459,
"learning_rate": 0.00019783203029059997,
"loss": 0.9589,
"step": 2770
},
{
"epoch": 0.16007152745731426,
"grad_norm": 0.2646116316318512,
"learning_rate": 0.00019781113030558267,
"loss": 0.9569,
"step": 2775
},
{
"epoch": 0.16035994462390402,
"grad_norm": 0.3243483304977417,
"learning_rate": 0.00019779013117633454,
"loss": 0.9622,
"step": 2780
},
{
"epoch": 0.16064836179049377,
"grad_norm": 0.2765612304210663,
"learning_rate": 0.0001977690329241409,
"loss": 1.0068,
"step": 2785
},
{
"epoch": 0.16093677895708353,
"grad_norm": 0.30408522486686707,
"learning_rate": 0.00019774783557038755,
"loss": 0.969,
"step": 2790
},
{
"epoch": 0.16122519612367328,
"grad_norm": 0.26990190148353577,
"learning_rate": 0.00019772653913656076,
"loss": 1.025,
"step": 2795
},
{
"epoch": 0.16151361329026304,
"grad_norm": 0.31291985511779785,
"learning_rate": 0.00019770514364424725,
"loss": 1.0174,
"step": 2800
},
{
"epoch": 0.1618020304568528,
"grad_norm": 0.31198903918266296,
"learning_rate": 0.00019768364911513405,
"loss": 0.9603,
"step": 2805
},
{
"epoch": 0.16209044762344255,
"grad_norm": 0.28119274973869324,
"learning_rate": 0.00019766205557100868,
"loss": 0.9689,
"step": 2810
},
{
"epoch": 0.1623788647900323,
"grad_norm": 0.27684643864631653,
"learning_rate": 0.000197640363033759,
"loss": 0.9272,
"step": 2815
},
{
"epoch": 0.16266728195662206,
"grad_norm": 0.2740548253059387,
"learning_rate": 0.0001976185715253732,
"loss": 1.0165,
"step": 2820
},
{
"epoch": 0.1629556991232118,
"grad_norm": 0.3126582205295563,
"learning_rate": 0.00019759668106793975,
"loss": 0.9915,
"step": 2825
},
{
"epoch": 0.16324411628980157,
"grad_norm": 0.27744656801223755,
"learning_rate": 0.0001975746916836475,
"loss": 0.9971,
"step": 2830
},
{
"epoch": 0.16353253345639132,
"grad_norm": 0.280280202627182,
"learning_rate": 0.00019755260339478556,
"loss": 0.9637,
"step": 2835
},
{
"epoch": 0.16382095062298108,
"grad_norm": 0.2840816378593445,
"learning_rate": 0.0001975304162237432,
"loss": 0.9603,
"step": 2840
},
{
"epoch": 0.16410936778957083,
"grad_norm": 0.2826577126979828,
"learning_rate": 0.00019750813019301004,
"loss": 1.0331,
"step": 2845
},
{
"epoch": 0.1643977849561606,
"grad_norm": 0.2963692545890808,
"learning_rate": 0.00019748574532517586,
"loss": 0.999,
"step": 2850
},
{
"epoch": 0.16468620212275034,
"grad_norm": 0.2895634174346924,
"learning_rate": 0.00019746326164293056,
"loss": 0.9637,
"step": 2855
},
{
"epoch": 0.1649746192893401,
"grad_norm": 0.287422776222229,
"learning_rate": 0.0001974406791690643,
"loss": 0.9696,
"step": 2860
},
{
"epoch": 0.16526303645592985,
"grad_norm": 0.31378328800201416,
"learning_rate": 0.00019741799792646734,
"loss": 1.0066,
"step": 2865
},
{
"epoch": 0.1655514536225196,
"grad_norm": 0.28587618470191956,
"learning_rate": 0.00019739521793813006,
"loss": 0.9224,
"step": 2870
},
{
"epoch": 0.16583987078910936,
"grad_norm": 0.28385454416275024,
"learning_rate": 0.0001973723392271429,
"loss": 0.9961,
"step": 2875
},
{
"epoch": 0.16612828795569912,
"grad_norm": 0.27586954832077026,
"learning_rate": 0.00019734936181669638,
"loss": 1.065,
"step": 2880
},
{
"epoch": 0.16641670512228887,
"grad_norm": 0.30055347084999084,
"learning_rate": 0.00019732628573008114,
"loss": 1.0089,
"step": 2885
},
{
"epoch": 0.16670512228887863,
"grad_norm": 0.30119630694389343,
"learning_rate": 0.00019730311099068771,
"loss": 1.017,
"step": 2890
},
{
"epoch": 0.16699353945546838,
"grad_norm": 0.29206573963165283,
"learning_rate": 0.00019727983762200677,
"loss": 0.9635,
"step": 2895
},
{
"epoch": 0.16728195662205814,
"grad_norm": 0.2570163905620575,
"learning_rate": 0.00019725646564762878,
"loss": 0.9791,
"step": 2900
},
{
"epoch": 0.1675703737886479,
"grad_norm": 0.3360570967197418,
"learning_rate": 0.00019723299509124433,
"loss": 0.9498,
"step": 2905
},
{
"epoch": 0.16785879095523765,
"grad_norm": 0.29323843121528625,
"learning_rate": 0.00019720942597664385,
"loss": 0.986,
"step": 2910
},
{
"epoch": 0.1681472081218274,
"grad_norm": 0.30418166518211365,
"learning_rate": 0.00019718575832771768,
"loss": 0.9756,
"step": 2915
},
{
"epoch": 0.16843562528841716,
"grad_norm": 0.31183257699012756,
"learning_rate": 0.00019716199216845604,
"loss": 0.9997,
"step": 2920
},
{
"epoch": 0.1687240424550069,
"grad_norm": 0.26834046840667725,
"learning_rate": 0.000197138127522949,
"loss": 0.9315,
"step": 2925
},
{
"epoch": 0.16901245962159667,
"grad_norm": 0.27434879541397095,
"learning_rate": 0.00019711416441538652,
"loss": 1.0105,
"step": 2930
},
{
"epoch": 0.16930087678818642,
"grad_norm": 0.28828758001327515,
"learning_rate": 0.00019709010287005825,
"loss": 1.0128,
"step": 2935
},
{
"epoch": 0.16958929395477618,
"grad_norm": 0.2850480079650879,
"learning_rate": 0.00019706594291135366,
"loss": 0.9618,
"step": 2940
},
{
"epoch": 0.16987771112136593,
"grad_norm": 0.2937301993370056,
"learning_rate": 0.00019704168456376205,
"loss": 1.0175,
"step": 2945
},
{
"epoch": 0.1701661282879557,
"grad_norm": 0.28153088688850403,
"learning_rate": 0.0001970173278518724,
"loss": 0.9541,
"step": 2950
},
{
"epoch": 0.17045454545454544,
"grad_norm": 0.2839425802230835,
"learning_rate": 0.00019699287280037332,
"loss": 1.0139,
"step": 2955
},
{
"epoch": 0.1707429626211352,
"grad_norm": 0.28864094614982605,
"learning_rate": 0.00019696831943405324,
"loss": 1.0833,
"step": 2960
},
{
"epoch": 0.17103137978772495,
"grad_norm": 0.2697494626045227,
"learning_rate": 0.0001969436677778001,
"loss": 0.9827,
"step": 2965
},
{
"epoch": 0.1713197969543147,
"grad_norm": 0.2844550907611847,
"learning_rate": 0.0001969189178566016,
"loss": 1.005,
"step": 2970
},
{
"epoch": 0.1716082141209045,
"grad_norm": 0.30949264764785767,
"learning_rate": 0.000196894069695545,
"loss": 0.9696,
"step": 2975
},
{
"epoch": 0.17189663128749424,
"grad_norm": 0.2768407464027405,
"learning_rate": 0.00019686912331981702,
"loss": 0.9931,
"step": 2980
},
{
"epoch": 0.172185048454084,
"grad_norm": 0.28683245182037354,
"learning_rate": 0.00019684407875470415,
"loss": 1.0018,
"step": 2985
},
{
"epoch": 0.17247346562067375,
"grad_norm": 0.3155616223812103,
"learning_rate": 0.00019681893602559224,
"loss": 0.9813,
"step": 2990
},
{
"epoch": 0.1727618827872635,
"grad_norm": 0.3154447376728058,
"learning_rate": 0.0001967936951579667,
"loss": 0.9915,
"step": 2995
},
{
"epoch": 0.17305029995385326,
"grad_norm": 0.277576744556427,
"learning_rate": 0.00019676835617741249,
"loss": 0.9668,
"step": 3000
},
{
"epoch": 0.17333871712044302,
"grad_norm": 0.28618210554122925,
"learning_rate": 0.0001967429191096138,
"loss": 0.9745,
"step": 3005
},
{
"epoch": 0.17362713428703277,
"grad_norm": 0.27911707758903503,
"learning_rate": 0.0001967173839803545,
"loss": 0.9732,
"step": 3010
},
{
"epoch": 0.17391555145362253,
"grad_norm": 0.28373172879219055,
"learning_rate": 0.00019669175081551773,
"loss": 0.9797,
"step": 3015
},
{
"epoch": 0.17420396862021229,
"grad_norm": 0.29749229550361633,
"learning_rate": 0.00019666601964108598,
"loss": 0.94,
"step": 3020
},
{
"epoch": 0.17449238578680204,
"grad_norm": 0.31651487946510315,
"learning_rate": 0.00019664019048314116,
"loss": 0.9829,
"step": 3025
},
{
"epoch": 0.1747808029533918,
"grad_norm": 0.2834007740020752,
"learning_rate": 0.00019661426336786445,
"loss": 0.9336,
"step": 3030
},
{
"epoch": 0.17506922011998155,
"grad_norm": 0.2876712381839752,
"learning_rate": 0.00019658823832153632,
"loss": 0.9174,
"step": 3035
},
{
"epoch": 0.1753576372865713,
"grad_norm": 0.3259499669075012,
"learning_rate": 0.00019656211537053654,
"loss": 1.0362,
"step": 3040
},
{
"epoch": 0.17564605445316106,
"grad_norm": 0.26136502623558044,
"learning_rate": 0.00019653589454134406,
"loss": 0.9399,
"step": 3045
},
{
"epoch": 0.17593447161975082,
"grad_norm": 0.28630778193473816,
"learning_rate": 0.00019650957586053716,
"loss": 0.9861,
"step": 3050
},
{
"epoch": 0.17622288878634057,
"grad_norm": 0.2615172266960144,
"learning_rate": 0.00019648315935479315,
"loss": 1.0378,
"step": 3055
},
{
"epoch": 0.17651130595293033,
"grad_norm": 0.28133901953697205,
"learning_rate": 0.00019645664505088864,
"loss": 0.9746,
"step": 3060
},
{
"epoch": 0.17679972311952008,
"grad_norm": 0.3203901946544647,
"learning_rate": 0.00019643003297569923,
"loss": 0.9894,
"step": 3065
},
{
"epoch": 0.17708814028610984,
"grad_norm": 0.2845044434070587,
"learning_rate": 0.00019640332315619977,
"loss": 1.0024,
"step": 3070
},
{
"epoch": 0.1773765574526996,
"grad_norm": 0.28776776790618896,
"learning_rate": 0.0001963765156194641,
"loss": 1.0035,
"step": 3075
},
{
"epoch": 0.17766497461928935,
"grad_norm": 0.2923831343650818,
"learning_rate": 0.00019634961039266506,
"loss": 1.0253,
"step": 3080
},
{
"epoch": 0.1779533917858791,
"grad_norm": 0.29954782128334045,
"learning_rate": 0.00019632260750307467,
"loss": 0.9984,
"step": 3085
},
{
"epoch": 0.17824180895246886,
"grad_norm": 0.30335840582847595,
"learning_rate": 0.0001962955069780638,
"loss": 0.9339,
"step": 3090
},
{
"epoch": 0.1785302261190586,
"grad_norm": 0.28872916102409363,
"learning_rate": 0.00019626830884510236,
"loss": 1.0417,
"step": 3095
},
{
"epoch": 0.17881864328564837,
"grad_norm": 0.3210926949977875,
"learning_rate": 0.00019624101313175918,
"loss": 1.0293,
"step": 3100
},
{
"epoch": 0.17910706045223812,
"grad_norm": 0.29229721426963806,
"learning_rate": 0.00019621361986570194,
"loss": 0.9386,
"step": 3105
},
{
"epoch": 0.17939547761882788,
"grad_norm": 0.3137836754322052,
"learning_rate": 0.00019618612907469732,
"loss": 0.9874,
"step": 3110
},
{
"epoch": 0.17968389478541763,
"grad_norm": 0.27663466334342957,
"learning_rate": 0.00019615854078661077,
"loss": 0.9902,
"step": 3115
},
{
"epoch": 0.17997231195200739,
"grad_norm": 0.30164676904678345,
"learning_rate": 0.00019613085502940658,
"loss": 1.1187,
"step": 3120
},
{
"epoch": 0.18026072911859714,
"grad_norm": 0.2817506790161133,
"learning_rate": 0.00019610307183114787,
"loss": 0.9643,
"step": 3125
},
{
"epoch": 0.1805491462851869,
"grad_norm": 0.28451189398765564,
"learning_rate": 0.00019607519121999647,
"loss": 0.9553,
"step": 3130
},
{
"epoch": 0.18083756345177665,
"grad_norm": 0.3148361146450043,
"learning_rate": 0.00019604721322421303,
"loss": 0.9596,
"step": 3135
},
{
"epoch": 0.1811259806183664,
"grad_norm": 0.3131537437438965,
"learning_rate": 0.00019601913787215683,
"loss": 0.9841,
"step": 3140
},
{
"epoch": 0.18141439778495616,
"grad_norm": 0.301500141620636,
"learning_rate": 0.00019599096519228585,
"loss": 0.9387,
"step": 3145
},
{
"epoch": 0.18170281495154592,
"grad_norm": 0.2999275028705597,
"learning_rate": 0.0001959626952131568,
"loss": 0.8649,
"step": 3150
},
{
"epoch": 0.18199123211813567,
"grad_norm": 0.3055667281150818,
"learning_rate": 0.00019593432796342496,
"loss": 1.0364,
"step": 3155
},
{
"epoch": 0.18227964928472543,
"grad_norm": 0.30451443791389465,
"learning_rate": 0.00019590586347184417,
"loss": 1.0552,
"step": 3160
},
{
"epoch": 0.18256806645131518,
"grad_norm": 0.3046397566795349,
"learning_rate": 0.00019587730176726686,
"loss": 0.9897,
"step": 3165
},
{
"epoch": 0.18285648361790494,
"grad_norm": 0.3132875859737396,
"learning_rate": 0.00019584864287864408,
"loss": 0.953,
"step": 3170
},
{
"epoch": 0.1831449007844947,
"grad_norm": 0.2684531807899475,
"learning_rate": 0.00019581988683502525,
"loss": 1.0479,
"step": 3175
},
{
"epoch": 0.18343331795108445,
"grad_norm": 0.3220478594303131,
"learning_rate": 0.0001957910336655584,
"loss": 0.9818,
"step": 3180
},
{
"epoch": 0.1837217351176742,
"grad_norm": 0.29744499921798706,
"learning_rate": 0.00019576208339948988,
"loss": 0.985,
"step": 3185
},
{
"epoch": 0.18401015228426396,
"grad_norm": 0.26757848262786865,
"learning_rate": 0.00019573303606616459,
"loss": 0.9966,
"step": 3190
},
{
"epoch": 0.1842985694508537,
"grad_norm": 0.2966987192630768,
"learning_rate": 0.00019570389169502569,
"loss": 0.9853,
"step": 3195
},
{
"epoch": 0.18458698661744347,
"grad_norm": 0.2907325327396393,
"learning_rate": 0.00019567465031561487,
"loss": 1.0468,
"step": 3200
},
{
"epoch": 0.18487540378403322,
"grad_norm": 0.2841055989265442,
"learning_rate": 0.00019564531195757193,
"loss": 0.9837,
"step": 3205
},
{
"epoch": 0.18516382095062298,
"grad_norm": 0.2998584806919098,
"learning_rate": 0.0001956158766506352,
"loss": 1.0282,
"step": 3210
},
{
"epoch": 0.18545223811721273,
"grad_norm": 0.3043042719364166,
"learning_rate": 0.00019558634442464113,
"loss": 0.911,
"step": 3215
},
{
"epoch": 0.18574065528380249,
"grad_norm": 0.30067190527915955,
"learning_rate": 0.00019555671530952445,
"loss": 0.9701,
"step": 3220
},
{
"epoch": 0.18602907245039224,
"grad_norm": 0.297343373298645,
"learning_rate": 0.00019552698933531808,
"loss": 0.9935,
"step": 3225
},
{
"epoch": 0.186317489616982,
"grad_norm": 0.2842741310596466,
"learning_rate": 0.00019549716653215318,
"loss": 0.999,
"step": 3230
},
{
"epoch": 0.18660590678357175,
"grad_norm": 0.27844905853271484,
"learning_rate": 0.00019546724693025896,
"loss": 0.9668,
"step": 3235
},
{
"epoch": 0.1868943239501615,
"grad_norm": 0.29974377155303955,
"learning_rate": 0.00019543723055996282,
"loss": 0.9864,
"step": 3240
},
{
"epoch": 0.18718274111675126,
"grad_norm": 0.2982295751571655,
"learning_rate": 0.0001954071174516903,
"loss": 0.9902,
"step": 3245
},
{
"epoch": 0.18747115828334102,
"grad_norm": 0.3086935579776764,
"learning_rate": 0.00019537690763596487,
"loss": 0.9954,
"step": 3250
},
{
"epoch": 0.18775957544993077,
"grad_norm": 0.28824785351753235,
"learning_rate": 0.0001953466011434081,
"loss": 0.9979,
"step": 3255
},
{
"epoch": 0.18804799261652053,
"grad_norm": 0.2743071913719177,
"learning_rate": 0.00019531619800473952,
"loss": 0.9299,
"step": 3260
},
{
"epoch": 0.18833640978311028,
"grad_norm": 0.2896062433719635,
"learning_rate": 0.00019528569825077668,
"loss": 0.9861,
"step": 3265
},
{
"epoch": 0.18862482694970004,
"grad_norm": 0.29393669962882996,
"learning_rate": 0.00019525510191243498,
"loss": 1.0792,
"step": 3270
},
{
"epoch": 0.1889132441162898,
"grad_norm": 0.3489181399345398,
"learning_rate": 0.00019522440902072782,
"loss": 1.0056,
"step": 3275
},
{
"epoch": 0.18920166128287955,
"grad_norm": 0.31945231556892395,
"learning_rate": 0.0001951936196067664,
"loss": 1.0386,
"step": 3280
},
{
"epoch": 0.1894900784494693,
"grad_norm": 0.30114686489105225,
"learning_rate": 0.00019516273370175972,
"loss": 0.9667,
"step": 3285
},
{
"epoch": 0.18977849561605906,
"grad_norm": 0.3653857409954071,
"learning_rate": 0.00019513175133701474,
"loss": 0.9465,
"step": 3290
},
{
"epoch": 0.1900669127826488,
"grad_norm": 0.2919418513774872,
"learning_rate": 0.000195100672543936,
"loss": 0.9252,
"step": 3295
},
{
"epoch": 0.19035532994923857,
"grad_norm": 0.29241377115249634,
"learning_rate": 0.00019506949735402588,
"loss": 0.929,
"step": 3300
},
{
"epoch": 0.19064374711582832,
"grad_norm": 0.30068260431289673,
"learning_rate": 0.00019503822579888453,
"loss": 1.0254,
"step": 3305
},
{
"epoch": 0.19093216428241808,
"grad_norm": 0.2954903542995453,
"learning_rate": 0.00019500685791020968,
"loss": 0.9485,
"step": 3310
},
{
"epoch": 0.19122058144900783,
"grad_norm": 0.2899206876754761,
"learning_rate": 0.00019497539371979674,
"loss": 1.036,
"step": 3315
},
{
"epoch": 0.1915089986155976,
"grad_norm": 0.3165214955806732,
"learning_rate": 0.00019494383325953875,
"loss": 0.9616,
"step": 3320
},
{
"epoch": 0.19179741578218737,
"grad_norm": 0.3250178396701813,
"learning_rate": 0.0001949121765614263,
"loss": 0.9648,
"step": 3325
},
{
"epoch": 0.19208583294877712,
"grad_norm": 0.2635006904602051,
"learning_rate": 0.00019488042365754758,
"loss": 0.9789,
"step": 3330
},
{
"epoch": 0.19237425011536688,
"grad_norm": 0.2964721620082855,
"learning_rate": 0.0001948485745800882,
"loss": 0.9432,
"step": 3335
},
{
"epoch": 0.19266266728195663,
"grad_norm": 0.2993474006652832,
"learning_rate": 0.0001948166293613314,
"loss": 0.9556,
"step": 3340
},
{
"epoch": 0.1929510844485464,
"grad_norm": 0.28304216265678406,
"learning_rate": 0.00019478458803365772,
"loss": 0.9445,
"step": 3345
},
{
"epoch": 0.19323950161513614,
"grad_norm": 0.2697024941444397,
"learning_rate": 0.00019475245062954523,
"loss": 1.0552,
"step": 3350
},
{
"epoch": 0.1935279187817259,
"grad_norm": 0.2875863015651703,
"learning_rate": 0.00019472021718156937,
"loss": 0.9319,
"step": 3355
},
{
"epoch": 0.19381633594831565,
"grad_norm": 0.3006811738014221,
"learning_rate": 0.00019468788772240286,
"loss": 1.0049,
"step": 3360
},
{
"epoch": 0.1941047531149054,
"grad_norm": 0.30004388093948364,
"learning_rate": 0.0001946554622848158,
"loss": 1.0181,
"step": 3365
},
{
"epoch": 0.19439317028149516,
"grad_norm": 0.3029836118221283,
"learning_rate": 0.00019462294090167554,
"loss": 1.045,
"step": 3370
},
{
"epoch": 0.19468158744808492,
"grad_norm": 0.2854270339012146,
"learning_rate": 0.00019459032360594677,
"loss": 0.9876,
"step": 3375
},
{
"epoch": 0.19497000461467467,
"grad_norm": 0.3001527786254883,
"learning_rate": 0.0001945576104306913,
"loss": 0.9083,
"step": 3380
},
{
"epoch": 0.19525842178126443,
"grad_norm": 0.2907600700855255,
"learning_rate": 0.00019452480140906819,
"loss": 0.9734,
"step": 3385
},
{
"epoch": 0.19554683894785418,
"grad_norm": 0.2804548442363739,
"learning_rate": 0.00019449189657433358,
"loss": 1.0032,
"step": 3390
},
{
"epoch": 0.19583525611444394,
"grad_norm": 0.29847756028175354,
"learning_rate": 0.0001944588959598408,
"loss": 0.9485,
"step": 3395
},
{
"epoch": 0.1961236732810337,
"grad_norm": 0.28965532779693604,
"learning_rate": 0.00019442579959904024,
"loss": 0.9713,
"step": 3400
},
{
"epoch": 0.19641209044762345,
"grad_norm": 0.295213520526886,
"learning_rate": 0.00019439260752547935,
"loss": 0.9486,
"step": 3405
},
{
"epoch": 0.1967005076142132,
"grad_norm": 0.2934512794017792,
"learning_rate": 0.0001943593197728026,
"loss": 1.0448,
"step": 3410
},
{
"epoch": 0.19698892478080296,
"grad_norm": 0.29289090633392334,
"learning_rate": 0.00019432593637475138,
"loss": 0.9959,
"step": 3415
},
{
"epoch": 0.19727734194739271,
"grad_norm": 0.2757977545261383,
"learning_rate": 0.00019429245736516415,
"loss": 0.9612,
"step": 3420
},
{
"epoch": 0.19756575911398247,
"grad_norm": 0.28514814376831055,
"learning_rate": 0.00019425888277797615,
"loss": 1.0246,
"step": 3425
},
{
"epoch": 0.19785417628057222,
"grad_norm": 0.32380256056785583,
"learning_rate": 0.00019422521264721962,
"loss": 0.9404,
"step": 3430
},
{
"epoch": 0.19814259344716198,
"grad_norm": 0.28507691621780396,
"learning_rate": 0.0001941914470070236,
"loss": 0.8902,
"step": 3435
},
{
"epoch": 0.19843101061375173,
"grad_norm": 0.3757873773574829,
"learning_rate": 0.00019415758589161385,
"loss": 1.0038,
"step": 3440
},
{
"epoch": 0.1987194277803415,
"grad_norm": 0.3061589300632477,
"learning_rate": 0.00019412362933531307,
"loss": 0.8961,
"step": 3445
},
{
"epoch": 0.19900784494693124,
"grad_norm": 0.29617950320243835,
"learning_rate": 0.0001940895773725406,
"loss": 0.9573,
"step": 3450
},
{
"epoch": 0.199296262113521,
"grad_norm": 0.27990731596946716,
"learning_rate": 0.00019405543003781251,
"loss": 1.044,
"step": 3455
},
{
"epoch": 0.19958467928011075,
"grad_norm": 0.29822319746017456,
"learning_rate": 0.00019402118736574155,
"loss": 0.9799,
"step": 3460
},
{
"epoch": 0.1998730964467005,
"grad_norm": 0.3118431866168976,
"learning_rate": 0.00019398684939103707,
"loss": 1.0417,
"step": 3465
},
{
"epoch": 0.20016151361329027,
"grad_norm": 0.3202954828739166,
"learning_rate": 0.00019395241614850504,
"loss": 0.9731,
"step": 3470
},
{
"epoch": 0.20044993077988002,
"grad_norm": 0.3098292052745819,
"learning_rate": 0.00019391788767304804,
"loss": 0.985,
"step": 3475
},
{
"epoch": 0.20073834794646978,
"grad_norm": 0.2931598722934723,
"learning_rate": 0.00019388326399966515,
"loss": 1.0129,
"step": 3480
},
{
"epoch": 0.20102676511305953,
"grad_norm": 0.2935352027416229,
"learning_rate": 0.0001938485451634519,
"loss": 0.9402,
"step": 3485
},
{
"epoch": 0.20131518227964929,
"grad_norm": 0.3236974775791168,
"learning_rate": 0.00019381373119960033,
"loss": 1.0507,
"step": 3490
},
{
"epoch": 0.20160359944623904,
"grad_norm": 0.3834960162639618,
"learning_rate": 0.00019377882214339893,
"loss": 0.9554,
"step": 3495
},
{
"epoch": 0.2018920166128288,
"grad_norm": 0.2892552316188812,
"learning_rate": 0.00019374381803023252,
"loss": 1.0119,
"step": 3500
},
{
"epoch": 0.20218043377941855,
"grad_norm": 0.29538676142692566,
"learning_rate": 0.0001937087188955823,
"loss": 0.9977,
"step": 3505
},
{
"epoch": 0.2024688509460083,
"grad_norm": 0.2964411973953247,
"learning_rate": 0.00019367352477502576,
"loss": 0.9636,
"step": 3510
},
{
"epoch": 0.20275726811259806,
"grad_norm": 0.3167349696159363,
"learning_rate": 0.00019363823570423675,
"loss": 0.9345,
"step": 3515
},
{
"epoch": 0.20304568527918782,
"grad_norm": 0.3199044466018677,
"learning_rate": 0.0001936028517189852,
"loss": 0.913,
"step": 3520
},
{
"epoch": 0.20333410244577757,
"grad_norm": 0.27600806951522827,
"learning_rate": 0.00019356737285513748,
"loss": 0.959,
"step": 3525
},
{
"epoch": 0.20362251961236733,
"grad_norm": 0.31621217727661133,
"learning_rate": 0.00019353179914865596,
"loss": 1.0437,
"step": 3530
},
{
"epoch": 0.20391093677895708,
"grad_norm": 0.30049943923950195,
"learning_rate": 0.00019349613063559916,
"loss": 0.9675,
"step": 3535
},
{
"epoch": 0.20419935394554684,
"grad_norm": 0.3039463460445404,
"learning_rate": 0.00019346036735212177,
"loss": 1.0542,
"step": 3540
},
{
"epoch": 0.2044877711121366,
"grad_norm": 0.3049977123737335,
"learning_rate": 0.00019342450933447448,
"loss": 0.8974,
"step": 3545
},
{
"epoch": 0.20477618827872635,
"grad_norm": 0.2853706181049347,
"learning_rate": 0.00019338855661900405,
"loss": 0.9711,
"step": 3550
},
{
"epoch": 0.2050646054453161,
"grad_norm": 0.2970394492149353,
"learning_rate": 0.00019335250924215318,
"loss": 0.9516,
"step": 3555
},
{
"epoch": 0.20535302261190586,
"grad_norm": 0.3310398459434509,
"learning_rate": 0.00019331636724046058,
"loss": 0.9293,
"step": 3560
},
{
"epoch": 0.2056414397784956,
"grad_norm": 0.2932792901992798,
"learning_rate": 0.0001932801306505608,
"loss": 1.0088,
"step": 3565
},
{
"epoch": 0.20592985694508537,
"grad_norm": 0.3343851566314697,
"learning_rate": 0.00019324379950918437,
"loss": 1.0363,
"step": 3570
},
{
"epoch": 0.20621827411167512,
"grad_norm": 0.30094677209854126,
"learning_rate": 0.00019320737385315756,
"loss": 1.0072,
"step": 3575
},
{
"epoch": 0.20650669127826488,
"grad_norm": 0.28837206959724426,
"learning_rate": 0.00019317085371940246,
"loss": 0.9139,
"step": 3580
},
{
"epoch": 0.20679510844485463,
"grad_norm": 0.29000407457351685,
"learning_rate": 0.00019313423914493703,
"loss": 0.9431,
"step": 3585
},
{
"epoch": 0.20708352561144439,
"grad_norm": 0.28823748230934143,
"learning_rate": 0.00019309753016687477,
"loss": 0.9281,
"step": 3590
},
{
"epoch": 0.20737194277803414,
"grad_norm": 0.30797070264816284,
"learning_rate": 0.00019306072682242505,
"loss": 0.9611,
"step": 3595
},
{
"epoch": 0.2076603599446239,
"grad_norm": 0.2971121370792389,
"learning_rate": 0.00019302382914889284,
"loss": 1.0199,
"step": 3600
},
{
"epoch": 0.20794877711121365,
"grad_norm": 0.2938947081565857,
"learning_rate": 0.00019298683718367864,
"loss": 0.9275,
"step": 3605
},
{
"epoch": 0.2082371942778034,
"grad_norm": 0.3001919686794281,
"learning_rate": 0.00019294975096427862,
"loss": 0.9963,
"step": 3610
},
{
"epoch": 0.20852561144439316,
"grad_norm": 0.3122607469558716,
"learning_rate": 0.00019291257052828447,
"loss": 1.0458,
"step": 3615
},
{
"epoch": 0.20881402861098292,
"grad_norm": 0.2895052433013916,
"learning_rate": 0.00019287529591338333,
"loss": 0.9592,
"step": 3620
},
{
"epoch": 0.20910244577757267,
"grad_norm": 0.2828371822834015,
"learning_rate": 0.0001928379271573579,
"loss": 0.9518,
"step": 3625
},
{
"epoch": 0.20939086294416243,
"grad_norm": 0.30132856965065,
"learning_rate": 0.0001928004642980862,
"loss": 0.9374,
"step": 3630
},
{
"epoch": 0.20967928011075218,
"grad_norm": 0.4656534194946289,
"learning_rate": 0.0001927629073735417,
"loss": 0.9824,
"step": 3635
},
{
"epoch": 0.20996769727734194,
"grad_norm": 0.2774214744567871,
"learning_rate": 0.00019272525642179323,
"loss": 0.9528,
"step": 3640
},
{
"epoch": 0.2102561144439317,
"grad_norm": 0.2919476330280304,
"learning_rate": 0.00019268751148100486,
"loss": 0.9404,
"step": 3645
},
{
"epoch": 0.21054453161052145,
"grad_norm": 0.3007878065109253,
"learning_rate": 0.00019264967258943595,
"loss": 0.96,
"step": 3650
},
{
"epoch": 0.2108329487771112,
"grad_norm": 0.30731719732284546,
"learning_rate": 0.0001926117397854412,
"loss": 0.9321,
"step": 3655
},
{
"epoch": 0.21112136594370096,
"grad_norm": 0.32939255237579346,
"learning_rate": 0.0001925737131074703,
"loss": 1.0182,
"step": 3660
},
{
"epoch": 0.2114097831102907,
"grad_norm": 0.29776227474212646,
"learning_rate": 0.0001925355925940683,
"loss": 1.0224,
"step": 3665
},
{
"epoch": 0.2116982002768805,
"grad_norm": 0.3057902753353119,
"learning_rate": 0.00019249737828387522,
"loss": 0.9812,
"step": 3670
},
{
"epoch": 0.21198661744347025,
"grad_norm": 0.3011026382446289,
"learning_rate": 0.0001924590702156262,
"loss": 0.9753,
"step": 3675
},
{
"epoch": 0.21227503461006,
"grad_norm": 0.2978782653808594,
"learning_rate": 0.00019242066842815146,
"loss": 1.0129,
"step": 3680
},
{
"epoch": 0.21256345177664976,
"grad_norm": 0.2966994047164917,
"learning_rate": 0.00019238217296037614,
"loss": 1.0068,
"step": 3685
},
{
"epoch": 0.21285186894323951,
"grad_norm": 0.2818816602230072,
"learning_rate": 0.00019234358385132038,
"loss": 1.0062,
"step": 3690
},
{
"epoch": 0.21314028610982927,
"grad_norm": 0.280269980430603,
"learning_rate": 0.00019230490114009928,
"loss": 0.9392,
"step": 3695
},
{
"epoch": 0.21342870327641902,
"grad_norm": 0.29371026158332825,
"learning_rate": 0.00019226612486592271,
"loss": 0.8971,
"step": 3700
},
{
"epoch": 0.21371712044300878,
"grad_norm": 0.3066560924053192,
"learning_rate": 0.00019222725506809547,
"loss": 0.9893,
"step": 3705
},
{
"epoch": 0.21400553760959853,
"grad_norm": 0.31458479166030884,
"learning_rate": 0.00019218829178601713,
"loss": 1.0389,
"step": 3710
},
{
"epoch": 0.2142939547761883,
"grad_norm": 0.3057044446468353,
"learning_rate": 0.00019214923505918202,
"loss": 1.0005,
"step": 3715
},
{
"epoch": 0.21458237194277804,
"grad_norm": 0.27441418170928955,
"learning_rate": 0.00019211008492717914,
"loss": 0.9777,
"step": 3720
},
{
"epoch": 0.2148707891093678,
"grad_norm": 0.2985784113407135,
"learning_rate": 0.00019207084142969225,
"loss": 1.0475,
"step": 3725
},
{
"epoch": 0.21515920627595755,
"grad_norm": 0.305512934923172,
"learning_rate": 0.0001920315046064997,
"loss": 0.9554,
"step": 3730
},
{
"epoch": 0.2154476234425473,
"grad_norm": 0.3009251356124878,
"learning_rate": 0.0001919920744974745,
"loss": 0.9912,
"step": 3735
},
{
"epoch": 0.21573604060913706,
"grad_norm": 0.29489755630493164,
"learning_rate": 0.00019195255114258408,
"loss": 0.9554,
"step": 3740
},
{
"epoch": 0.21602445777572682,
"grad_norm": 0.3059771955013275,
"learning_rate": 0.0001919129345818905,
"loss": 0.9819,
"step": 3745
},
{
"epoch": 0.21631287494231657,
"grad_norm": 0.3015615940093994,
"learning_rate": 0.00019187322485555031,
"loss": 0.9948,
"step": 3750
},
{
"epoch": 0.21660129210890633,
"grad_norm": 0.3108586072921753,
"learning_rate": 0.0001918334220038144,
"loss": 0.9818,
"step": 3755
},
{
"epoch": 0.21688970927549608,
"grad_norm": 0.30573326349258423,
"learning_rate": 0.00019179352606702813,
"loss": 0.9519,
"step": 3760
},
{
"epoch": 0.21717812644208584,
"grad_norm": 0.2957397997379303,
"learning_rate": 0.00019175353708563117,
"loss": 1.0094,
"step": 3765
},
{
"epoch": 0.2174665436086756,
"grad_norm": 0.2969014644622803,
"learning_rate": 0.00019171345510015758,
"loss": 1.0162,
"step": 3770
},
{
"epoch": 0.21775496077526535,
"grad_norm": 0.33074361085891724,
"learning_rate": 0.00019167328015123558,
"loss": 0.9382,
"step": 3775
},
{
"epoch": 0.2180433779418551,
"grad_norm": 0.2909998297691345,
"learning_rate": 0.0001916330122795877,
"loss": 0.9768,
"step": 3780
},
{
"epoch": 0.21833179510844486,
"grad_norm": 0.28647512197494507,
"learning_rate": 0.00019159265152603064,
"loss": 0.9658,
"step": 3785
},
{
"epoch": 0.21862021227503461,
"grad_norm": 0.3733946979045868,
"learning_rate": 0.00019155219793147522,
"loss": 1.037,
"step": 3790
},
{
"epoch": 0.21890862944162437,
"grad_norm": 0.2883405089378357,
"learning_rate": 0.00019151165153692644,
"loss": 0.9551,
"step": 3795
},
{
"epoch": 0.21919704660821412,
"grad_norm": 0.33625394105911255,
"learning_rate": 0.00019147101238348326,
"loss": 0.995,
"step": 3800
},
{
"epoch": 0.21948546377480388,
"grad_norm": 0.4042999744415283,
"learning_rate": 0.00019143028051233873,
"loss": 0.9512,
"step": 3805
},
{
"epoch": 0.21977388094139363,
"grad_norm": 0.277295857667923,
"learning_rate": 0.00019138945596477994,
"loss": 0.9281,
"step": 3810
},
{
"epoch": 0.2200622981079834,
"grad_norm": 0.3070628046989441,
"learning_rate": 0.0001913485387821877,
"loss": 0.938,
"step": 3815
},
{
"epoch": 0.22035071527457314,
"grad_norm": 0.2898661494255066,
"learning_rate": 0.00019130752900603702,
"loss": 1.0103,
"step": 3820
},
{
"epoch": 0.2206391324411629,
"grad_norm": 0.2981604039669037,
"learning_rate": 0.00019126642667789654,
"loss": 0.9787,
"step": 3825
},
{
"epoch": 0.22092754960775265,
"grad_norm": 0.2816370129585266,
"learning_rate": 0.00019122523183942879,
"loss": 1.039,
"step": 3830
},
{
"epoch": 0.2212159667743424,
"grad_norm": 0.306822806596756,
"learning_rate": 0.00019118394453239006,
"loss": 1.0161,
"step": 3835
},
{
"epoch": 0.22150438394093216,
"grad_norm": 0.29982468485832214,
"learning_rate": 0.00019114256479863038,
"loss": 0.959,
"step": 3840
},
{
"epoch": 0.22179280110752192,
"grad_norm": 0.2966124713420868,
"learning_rate": 0.00019110109268009347,
"loss": 0.9996,
"step": 3845
},
{
"epoch": 0.22208121827411167,
"grad_norm": 0.3192947208881378,
"learning_rate": 0.00019105952821881668,
"loss": 1.0132,
"step": 3850
},
{
"epoch": 0.22236963544070143,
"grad_norm": 0.2927592694759369,
"learning_rate": 0.00019101787145693098,
"loss": 0.9738,
"step": 3855
},
{
"epoch": 0.22265805260729118,
"grad_norm": 0.2782720923423767,
"learning_rate": 0.00019097612243666086,
"loss": 0.9538,
"step": 3860
},
{
"epoch": 0.22294646977388094,
"grad_norm": 0.32348090410232544,
"learning_rate": 0.0001909342812003244,
"loss": 0.9593,
"step": 3865
},
{
"epoch": 0.2232348869404707,
"grad_norm": 0.32968342304229736,
"learning_rate": 0.00019089234779033306,
"loss": 0.9899,
"step": 3870
},
{
"epoch": 0.22352330410706045,
"grad_norm": 0.29580381512641907,
"learning_rate": 0.00019085032224919177,
"loss": 0.9515,
"step": 3875
},
{
"epoch": 0.2238117212736502,
"grad_norm": 0.27999478578567505,
"learning_rate": 0.00019080820461949886,
"loss": 0.9596,
"step": 3880
},
{
"epoch": 0.22410013844023996,
"grad_norm": 0.31083959341049194,
"learning_rate": 0.00019076599494394602,
"loss": 1.0069,
"step": 3885
},
{
"epoch": 0.22438855560682971,
"grad_norm": 0.2649812400341034,
"learning_rate": 0.00019072369326531824,
"loss": 0.9238,
"step": 3890
},
{
"epoch": 0.22467697277341947,
"grad_norm": 0.2908613383769989,
"learning_rate": 0.00019068129962649365,
"loss": 0.9745,
"step": 3895
},
{
"epoch": 0.22496538994000922,
"grad_norm": 0.2983262538909912,
"learning_rate": 0.00019063881407044373,
"loss": 0.9155,
"step": 3900
},
{
"epoch": 0.22525380710659898,
"grad_norm": 0.3074907660484314,
"learning_rate": 0.00019059623664023311,
"loss": 1.0384,
"step": 3905
},
{
"epoch": 0.22554222427318874,
"grad_norm": 0.3024677336215973,
"learning_rate": 0.00019055356737901952,
"loss": 1.0626,
"step": 3910
},
{
"epoch": 0.2258306414397785,
"grad_norm": 0.324719101190567,
"learning_rate": 0.00019051080633005372,
"loss": 0.9757,
"step": 3915
},
{
"epoch": 0.22611905860636825,
"grad_norm": 0.31149742007255554,
"learning_rate": 0.00019046795353667965,
"loss": 1.0294,
"step": 3920
},
{
"epoch": 0.226407475772958,
"grad_norm": 0.3361373543739319,
"learning_rate": 0.00019042500904233408,
"loss": 0.949,
"step": 3925
},
{
"epoch": 0.22669589293954776,
"grad_norm": 0.3346847593784332,
"learning_rate": 0.00019038197289054684,
"loss": 0.9531,
"step": 3930
},
{
"epoch": 0.2269843101061375,
"grad_norm": 0.3011166453361511,
"learning_rate": 0.00019033884512494064,
"loss": 0.9515,
"step": 3935
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.350754052400589,
"learning_rate": 0.00019029562578923106,
"loss": 0.9878,
"step": 3940
},
{
"epoch": 0.22756114443931702,
"grad_norm": 0.3115714192390442,
"learning_rate": 0.00019025231492722643,
"loss": 0.9914,
"step": 3945
},
{
"epoch": 0.22784956160590678,
"grad_norm": 0.29641732573509216,
"learning_rate": 0.000190208912582828,
"loss": 0.9508,
"step": 3950
},
{
"epoch": 0.22813797877249653,
"grad_norm": 0.3013533353805542,
"learning_rate": 0.0001901654188000296,
"loss": 0.9551,
"step": 3955
},
{
"epoch": 0.22842639593908629,
"grad_norm": 0.3072235584259033,
"learning_rate": 0.0001901218336229178,
"loss": 1.0324,
"step": 3960
},
{
"epoch": 0.22871481310567604,
"grad_norm": 0.2967047691345215,
"learning_rate": 0.00019007815709567183,
"loss": 0.9767,
"step": 3965
},
{
"epoch": 0.2290032302722658,
"grad_norm": 0.3344308137893677,
"learning_rate": 0.0001900343892625635,
"loss": 1.053,
"step": 3970
},
{
"epoch": 0.22929164743885555,
"grad_norm": 0.279471218585968,
"learning_rate": 0.00018999053016795719,
"loss": 0.9597,
"step": 3975
},
{
"epoch": 0.2295800646054453,
"grad_norm": 0.3151692748069763,
"learning_rate": 0.00018994657985630972,
"loss": 0.981,
"step": 3980
},
{
"epoch": 0.22986848177203506,
"grad_norm": 0.29757049679756165,
"learning_rate": 0.00018990253837217042,
"loss": 0.9948,
"step": 3985
},
{
"epoch": 0.23015689893862482,
"grad_norm": 0.29068654775619507,
"learning_rate": 0.00018985840576018107,
"loss": 0.9492,
"step": 3990
},
{
"epoch": 0.23044531610521457,
"grad_norm": 0.29149913787841797,
"learning_rate": 0.00018981418206507575,
"loss": 0.9603,
"step": 3995
},
{
"epoch": 0.23073373327180433,
"grad_norm": 0.2850954830646515,
"learning_rate": 0.00018976986733168093,
"loss": 1.0198,
"step": 4000
},
{
"epoch": 0.23102215043839408,
"grad_norm": 0.3014662563800812,
"learning_rate": 0.00018972546160491528,
"loss": 1.0628,
"step": 4005
},
{
"epoch": 0.23131056760498384,
"grad_norm": 0.29958969354629517,
"learning_rate": 0.00018968096492978976,
"loss": 0.9891,
"step": 4010
},
{
"epoch": 0.2315989847715736,
"grad_norm": 0.29551297426223755,
"learning_rate": 0.0001896363773514075,
"loss": 0.9811,
"step": 4015
},
{
"epoch": 0.23188740193816337,
"grad_norm": 0.30971017479896545,
"learning_rate": 0.0001895916989149638,
"loss": 1.0459,
"step": 4020
},
{
"epoch": 0.23217581910475313,
"grad_norm": 0.3282906115055084,
"learning_rate": 0.000189546929665746,
"loss": 1.0698,
"step": 4025
},
{
"epoch": 0.23246423627134288,
"grad_norm": 0.3017507493495941,
"learning_rate": 0.00018950206964913355,
"loss": 0.9867,
"step": 4030
},
{
"epoch": 0.23275265343793264,
"grad_norm": 0.34195518493652344,
"learning_rate": 0.0001894571189105979,
"loss": 0.9247,
"step": 4035
},
{
"epoch": 0.2330410706045224,
"grad_norm": 0.33378762006759644,
"learning_rate": 0.00018941207749570237,
"loss": 1.0384,
"step": 4040
},
{
"epoch": 0.23332948777111215,
"grad_norm": 0.325948029756546,
"learning_rate": 0.00018936694545010232,
"loss": 0.9698,
"step": 4045
},
{
"epoch": 0.2336179049377019,
"grad_norm": 0.2848076820373535,
"learning_rate": 0.0001893217228195449,
"loss": 1.0036,
"step": 4050
},
{
"epoch": 0.23390632210429166,
"grad_norm": 0.30070775747299194,
"learning_rate": 0.0001892764096498691,
"loss": 1.0397,
"step": 4055
},
{
"epoch": 0.2341947392708814,
"grad_norm": 0.3177594244480133,
"learning_rate": 0.00018923100598700561,
"loss": 1.0136,
"step": 4060
},
{
"epoch": 0.23448315643747117,
"grad_norm": 0.31077563762664795,
"learning_rate": 0.00018918551187697703,
"loss": 0.9457,
"step": 4065
},
{
"epoch": 0.23477157360406092,
"grad_norm": 0.2947135865688324,
"learning_rate": 0.00018913992736589746,
"loss": 0.9988,
"step": 4070
},
{
"epoch": 0.23505999077065068,
"grad_norm": 0.26377373933792114,
"learning_rate": 0.00018909425249997267,
"loss": 0.9891,
"step": 4075
},
{
"epoch": 0.23534840793724043,
"grad_norm": 0.3427537977695465,
"learning_rate": 0.0001890484873255001,
"loss": 0.993,
"step": 4080
},
{
"epoch": 0.2356368251038302,
"grad_norm": 0.28606218099594116,
"learning_rate": 0.00018900263188886864,
"loss": 0.9609,
"step": 4085
},
{
"epoch": 0.23592524227041994,
"grad_norm": 0.31335821747779846,
"learning_rate": 0.00018895668623655873,
"loss": 0.9278,
"step": 4090
},
{
"epoch": 0.2362136594370097,
"grad_norm": 0.3148699104785919,
"learning_rate": 0.00018891065041514224,
"loss": 0.9486,
"step": 4095
},
{
"epoch": 0.23650207660359945,
"grad_norm": 0.30335333943367004,
"learning_rate": 0.0001888645244712824,
"loss": 0.9604,
"step": 4100
},
{
"epoch": 0.2367904937701892,
"grad_norm": 0.2990083396434784,
"learning_rate": 0.0001888183084517338,
"loss": 0.9277,
"step": 4105
},
{
"epoch": 0.23707891093677896,
"grad_norm": 0.3039418160915375,
"learning_rate": 0.00018877200240334236,
"loss": 1.0381,
"step": 4110
},
{
"epoch": 0.23736732810336872,
"grad_norm": 0.3109247386455536,
"learning_rate": 0.0001887256063730453,
"loss": 1.0214,
"step": 4115
},
{
"epoch": 0.23765574526995847,
"grad_norm": 0.29135051369667053,
"learning_rate": 0.00018867912040787096,
"loss": 1.0111,
"step": 4120
},
{
"epoch": 0.23794416243654823,
"grad_norm": 0.29950061440467834,
"learning_rate": 0.0001886325445549389,
"loss": 0.9879,
"step": 4125
},
{
"epoch": 0.23823257960313798,
"grad_norm": 0.3028976619243622,
"learning_rate": 0.00018858587886145975,
"loss": 0.9808,
"step": 4130
},
{
"epoch": 0.23852099676972774,
"grad_norm": 0.2960391342639923,
"learning_rate": 0.0001885391233747352,
"loss": 0.9033,
"step": 4135
},
{
"epoch": 0.2388094139363175,
"grad_norm": 0.28858163952827454,
"learning_rate": 0.00018849227814215805,
"loss": 0.8774,
"step": 4140
},
{
"epoch": 0.23909783110290725,
"grad_norm": 0.3187437653541565,
"learning_rate": 0.00018844534321121195,
"loss": 1.032,
"step": 4145
},
{
"epoch": 0.239386248269497,
"grad_norm": 0.30050045251846313,
"learning_rate": 0.00018839831862947152,
"loss": 0.9785,
"step": 4150
},
{
"epoch": 0.23967466543608676,
"grad_norm": 0.3172016739845276,
"learning_rate": 0.0001883512044446023,
"loss": 1.0049,
"step": 4155
},
{
"epoch": 0.23996308260267651,
"grad_norm": 0.2758901119232178,
"learning_rate": 0.00018830400070436057,
"loss": 0.8758,
"step": 4160
},
{
"epoch": 0.24025149976926627,
"grad_norm": 0.31265828013420105,
"learning_rate": 0.00018825670745659345,
"loss": 0.9875,
"step": 4165
},
{
"epoch": 0.24053991693585602,
"grad_norm": 0.2935623526573181,
"learning_rate": 0.00018820932474923873,
"loss": 0.9738,
"step": 4170
},
{
"epoch": 0.24082833410244578,
"grad_norm": 0.31961116194725037,
"learning_rate": 0.00018816185263032496,
"loss": 0.985,
"step": 4175
},
{
"epoch": 0.24111675126903553,
"grad_norm": 0.302990198135376,
"learning_rate": 0.00018811429114797123,
"loss": 0.9693,
"step": 4180
},
{
"epoch": 0.2414051684356253,
"grad_norm": 0.3246656358242035,
"learning_rate": 0.00018806664035038727,
"loss": 0.9715,
"step": 4185
},
{
"epoch": 0.24169358560221504,
"grad_norm": 0.30691856145858765,
"learning_rate": 0.00018801890028587333,
"loss": 0.9967,
"step": 4190
},
{
"epoch": 0.2419820027688048,
"grad_norm": 0.3090788424015045,
"learning_rate": 0.00018797107100282015,
"loss": 1.0014,
"step": 4195
},
{
"epoch": 0.24227041993539455,
"grad_norm": 0.28349974751472473,
"learning_rate": 0.0001879231525497089,
"loss": 0.9426,
"step": 4200
},
{
"epoch": 0.2425588371019843,
"grad_norm": 0.3226814270019531,
"learning_rate": 0.00018787514497511104,
"loss": 1.0058,
"step": 4205
},
{
"epoch": 0.24284725426857406,
"grad_norm": 0.3090320825576782,
"learning_rate": 0.0001878270483276886,
"loss": 0.9565,
"step": 4210
},
{
"epoch": 0.24313567143516382,
"grad_norm": 0.29639485478401184,
"learning_rate": 0.00018777886265619365,
"loss": 0.9994,
"step": 4215
},
{
"epoch": 0.24342408860175357,
"grad_norm": 0.30157527327537537,
"learning_rate": 0.00018773058800946858,
"loss": 0.9349,
"step": 4220
},
{
"epoch": 0.24371250576834333,
"grad_norm": 0.2847401797771454,
"learning_rate": 0.0001876822244364461,
"loss": 0.9882,
"step": 4225
},
{
"epoch": 0.24400092293493308,
"grad_norm": 0.2939082086086273,
"learning_rate": 0.00018763377198614887,
"loss": 0.9545,
"step": 4230
},
{
"epoch": 0.24428934010152284,
"grad_norm": 0.30300137400627136,
"learning_rate": 0.00018758523070768973,
"loss": 0.9069,
"step": 4235
},
{
"epoch": 0.2445777572681126,
"grad_norm": 0.2980591952800751,
"learning_rate": 0.00018753660065027152,
"loss": 0.9992,
"step": 4240
},
{
"epoch": 0.24486617443470235,
"grad_norm": 0.31828731298446655,
"learning_rate": 0.00018748788186318712,
"loss": 0.9711,
"step": 4245
},
{
"epoch": 0.2451545916012921,
"grad_norm": 0.31123876571655273,
"learning_rate": 0.00018743907439581933,
"loss": 0.9393,
"step": 4250
},
{
"epoch": 0.24544300876788186,
"grad_norm": 0.29812201857566833,
"learning_rate": 0.00018739017829764082,
"loss": 0.9653,
"step": 4255
},
{
"epoch": 0.24573142593447161,
"grad_norm": 0.33146384358406067,
"learning_rate": 0.0001873411936182141,
"loss": 0.9758,
"step": 4260
},
{
"epoch": 0.24601984310106137,
"grad_norm": 0.3051407039165497,
"learning_rate": 0.0001872921204071915,
"loss": 1.0172,
"step": 4265
},
{
"epoch": 0.24630826026765112,
"grad_norm": 0.30195561051368713,
"learning_rate": 0.000187242958714315,
"loss": 0.9868,
"step": 4270
},
{
"epoch": 0.24659667743424088,
"grad_norm": 0.2948630750179291,
"learning_rate": 0.00018719370858941644,
"loss": 0.9771,
"step": 4275
},
{
"epoch": 0.24688509460083063,
"grad_norm": 0.3198891282081604,
"learning_rate": 0.00018714437008241709,
"loss": 1.04,
"step": 4280
},
{
"epoch": 0.2471735117674204,
"grad_norm": 0.3208988606929779,
"learning_rate": 0.000187094943243328,
"loss": 0.9666,
"step": 4285
},
{
"epoch": 0.24746192893401014,
"grad_norm": 0.3209957182407379,
"learning_rate": 0.00018704542812224956,
"loss": 0.9374,
"step": 4290
},
{
"epoch": 0.2477503461005999,
"grad_norm": 0.3006252348423004,
"learning_rate": 0.00018699582476937185,
"loss": 0.9798,
"step": 4295
},
{
"epoch": 0.24803876326718965,
"grad_norm": 0.3490176796913147,
"learning_rate": 0.00018694613323497422,
"loss": 1.0087,
"step": 4300
},
{
"epoch": 0.2483271804337794,
"grad_norm": 0.3163358271121979,
"learning_rate": 0.0001868963535694255,
"loss": 1.043,
"step": 4305
},
{
"epoch": 0.24861559760036916,
"grad_norm": 0.298026442527771,
"learning_rate": 0.0001868464858231838,
"loss": 1.0404,
"step": 4310
},
{
"epoch": 0.24890401476695892,
"grad_norm": 0.3209499418735504,
"learning_rate": 0.00018679653004679655,
"loss": 0.9687,
"step": 4315
},
{
"epoch": 0.24919243193354867,
"grad_norm": 0.3158719539642334,
"learning_rate": 0.0001867464862909004,
"loss": 0.9548,
"step": 4320
},
{
"epoch": 0.24948084910013843,
"grad_norm": 0.28783926367759705,
"learning_rate": 0.00018669635460622107,
"loss": 0.9042,
"step": 4325
},
{
"epoch": 0.24976926626672818,
"grad_norm": 0.2980654835700989,
"learning_rate": 0.00018664613504357366,
"loss": 0.97,
"step": 4330
},
{
"epoch": 0.25005768343331797,
"grad_norm": 0.2950812876224518,
"learning_rate": 0.00018659582765386204,
"loss": 1.0261,
"step": 4335
},
{
"epoch": 0.2503461005999077,
"grad_norm": 0.2984694540500641,
"learning_rate": 0.0001865454324880794,
"loss": 0.9859,
"step": 4340
},
{
"epoch": 0.2506345177664975,
"grad_norm": 0.3119395971298218,
"learning_rate": 0.00018649494959730765,
"loss": 1.03,
"step": 4345
},
{
"epoch": 0.2509229349330872,
"grad_norm": 0.3380660116672516,
"learning_rate": 0.00018644437903271778,
"loss": 1.0373,
"step": 4350
},
{
"epoch": 0.251211352099677,
"grad_norm": 0.310693621635437,
"learning_rate": 0.0001863937208455696,
"loss": 0.977,
"step": 4355
},
{
"epoch": 0.2514997692662667,
"grad_norm": 0.3119440972805023,
"learning_rate": 0.00018634297508721167,
"loss": 0.9384,
"step": 4360
},
{
"epoch": 0.2517881864328565,
"grad_norm": 0.3072355389595032,
"learning_rate": 0.00018629214180908144,
"loss": 1.0126,
"step": 4365
},
{
"epoch": 0.2520766035994462,
"grad_norm": 0.3056802749633789,
"learning_rate": 0.00018624122106270506,
"loss": 0.9496,
"step": 4370
},
{
"epoch": 0.252365020766036,
"grad_norm": 0.34883102774620056,
"learning_rate": 0.00018619021289969717,
"loss": 0.9626,
"step": 4375
},
{
"epoch": 0.25265343793262574,
"grad_norm": 0.2876664698123932,
"learning_rate": 0.00018613911737176125,
"loss": 0.9452,
"step": 4380
},
{
"epoch": 0.2529418550992155,
"grad_norm": 0.3051524758338928,
"learning_rate": 0.00018608793453068914,
"loss": 0.996,
"step": 4385
},
{
"epoch": 0.25323027226580525,
"grad_norm": 0.2734985053539276,
"learning_rate": 0.0001860366644283613,
"loss": 0.9395,
"step": 4390
},
{
"epoch": 0.25351868943239503,
"grad_norm": 0.30163031816482544,
"learning_rate": 0.00018598530711674667,
"loss": 0.9608,
"step": 4395
},
{
"epoch": 0.25380710659898476,
"grad_norm": 0.2709837555885315,
"learning_rate": 0.00018593386264790243,
"loss": 0.9611,
"step": 4400
},
{
"epoch": 0.25409552376557454,
"grad_norm": 0.3166120946407318,
"learning_rate": 0.00018588233107397429,
"loss": 0.8999,
"step": 4405
},
{
"epoch": 0.25438394093216427,
"grad_norm": 0.2956826090812683,
"learning_rate": 0.00018583071244719607,
"loss": 0.9097,
"step": 4410
},
{
"epoch": 0.25467235809875405,
"grad_norm": 0.31426194310188293,
"learning_rate": 0.00018577900681989,
"loss": 0.941,
"step": 4415
},
{
"epoch": 0.2549607752653438,
"grad_norm": 0.2746027410030365,
"learning_rate": 0.0001857272142444664,
"loss": 0.9168,
"step": 4420
},
{
"epoch": 0.25524919243193356,
"grad_norm": 0.2936379015445709,
"learning_rate": 0.00018567533477342377,
"loss": 0.9536,
"step": 4425
},
{
"epoch": 0.2555376095985233,
"grad_norm": 0.31358134746551514,
"learning_rate": 0.0001856233684593486,
"loss": 0.9569,
"step": 4430
},
{
"epoch": 0.25582602676511307,
"grad_norm": 0.31144851446151733,
"learning_rate": 0.0001855713153549155,
"loss": 0.9447,
"step": 4435
},
{
"epoch": 0.2561144439317028,
"grad_norm": 0.31088197231292725,
"learning_rate": 0.00018551917551288706,
"loss": 0.9873,
"step": 4440
},
{
"epoch": 0.2564028610982926,
"grad_norm": 0.31137150526046753,
"learning_rate": 0.0001854669489861137,
"loss": 0.9769,
"step": 4445
},
{
"epoch": 0.2566912782648823,
"grad_norm": 0.3470550775527954,
"learning_rate": 0.0001854146358275338,
"loss": 0.9824,
"step": 4450
},
{
"epoch": 0.2569796954314721,
"grad_norm": 0.305550754070282,
"learning_rate": 0.00018536223609017348,
"loss": 1.0573,
"step": 4455
},
{
"epoch": 0.2572681125980618,
"grad_norm": 0.30111902952194214,
"learning_rate": 0.00018530974982714667,
"loss": 0.9919,
"step": 4460
},
{
"epoch": 0.2575565297646516,
"grad_norm": 0.29458123445510864,
"learning_rate": 0.00018525717709165498,
"loss": 1.0249,
"step": 4465
},
{
"epoch": 0.2578449469312413,
"grad_norm": 0.2974050045013428,
"learning_rate": 0.0001852045179369877,
"loss": 1.0155,
"step": 4470
},
{
"epoch": 0.2581333640978311,
"grad_norm": 0.27646365761756897,
"learning_rate": 0.00018515177241652163,
"loss": 0.9477,
"step": 4475
},
{
"epoch": 0.25842178126442084,
"grad_norm": 0.3065283000469208,
"learning_rate": 0.0001850989405837212,
"loss": 0.9789,
"step": 4480
},
{
"epoch": 0.2587101984310106,
"grad_norm": 0.31208351254463196,
"learning_rate": 0.00018504602249213838,
"loss": 1.0209,
"step": 4485
},
{
"epoch": 0.25899861559760035,
"grad_norm": 0.27680978178977966,
"learning_rate": 0.0001849930181954124,
"loss": 0.9937,
"step": 4490
},
{
"epoch": 0.25928703276419013,
"grad_norm": 0.35537493228912354,
"learning_rate": 0.00018493992774727005,
"loss": 1.019,
"step": 4495
},
{
"epoch": 0.25957544993077986,
"grad_norm": 0.2992296814918518,
"learning_rate": 0.00018488675120152532,
"loss": 0.9409,
"step": 4500
},
{
"epoch": 0.25986386709736964,
"grad_norm": 0.2907122075557709,
"learning_rate": 0.00018483348861207953,
"loss": 0.9925,
"step": 4505
},
{
"epoch": 0.26015228426395937,
"grad_norm": 0.3083319664001465,
"learning_rate": 0.00018478014003292116,
"loss": 0.9494,
"step": 4510
},
{
"epoch": 0.26044070143054915,
"grad_norm": 0.2940841615200043,
"learning_rate": 0.00018472670551812596,
"loss": 1.0234,
"step": 4515
},
{
"epoch": 0.2607291185971389,
"grad_norm": 0.3526857793331146,
"learning_rate": 0.0001846731851218567,
"loss": 1.0047,
"step": 4520
},
{
"epoch": 0.26101753576372866,
"grad_norm": 0.2867284119129181,
"learning_rate": 0.00018461957889836324,
"loss": 0.953,
"step": 4525
},
{
"epoch": 0.2613059529303184,
"grad_norm": 0.28662440180778503,
"learning_rate": 0.00018456588690198236,
"loss": 0.9734,
"step": 4530
},
{
"epoch": 0.26159437009690817,
"grad_norm": 0.2874925136566162,
"learning_rate": 0.0001845121091871379,
"loss": 1.012,
"step": 4535
},
{
"epoch": 0.2618827872634979,
"grad_norm": 0.30890873074531555,
"learning_rate": 0.0001844582458083405,
"loss": 0.9317,
"step": 4540
},
{
"epoch": 0.2621712044300877,
"grad_norm": 0.2991410791873932,
"learning_rate": 0.0001844042968201877,
"loss": 0.9488,
"step": 4545
},
{
"epoch": 0.26245962159667746,
"grad_norm": 0.29846030473709106,
"learning_rate": 0.0001843502622773637,
"loss": 0.9722,
"step": 4550
},
{
"epoch": 0.2627480387632672,
"grad_norm": 0.30086445808410645,
"learning_rate": 0.0001842961422346396,
"loss": 0.9901,
"step": 4555
},
{
"epoch": 0.26303645592985697,
"grad_norm": 0.3020675778388977,
"learning_rate": 0.00018424193674687297,
"loss": 1.0275,
"step": 4560
},
{
"epoch": 0.2633248730964467,
"grad_norm": 0.3111262023448944,
"learning_rate": 0.00018418764586900817,
"loss": 0.9977,
"step": 4565
},
{
"epoch": 0.2636132902630365,
"grad_norm": 0.3167891204357147,
"learning_rate": 0.00018413326965607593,
"loss": 1.0266,
"step": 4570
},
{
"epoch": 0.2639017074296262,
"grad_norm": 0.28536850214004517,
"learning_rate": 0.00018407880816319363,
"loss": 0.9475,
"step": 4575
},
{
"epoch": 0.264190124596216,
"grad_norm": 0.30811807513237,
"learning_rate": 0.00018402426144556504,
"loss": 0.9549,
"step": 4580
},
{
"epoch": 0.2644785417628057,
"grad_norm": 0.2881765365600586,
"learning_rate": 0.0001839696295584803,
"loss": 1.0276,
"step": 4585
},
{
"epoch": 0.2647669589293955,
"grad_norm": 0.3339601159095764,
"learning_rate": 0.0001839149125573159,
"loss": 0.9772,
"step": 4590
},
{
"epoch": 0.26505537609598523,
"grad_norm": 0.2897505760192871,
"learning_rate": 0.0001838601104975346,
"loss": 1.0897,
"step": 4595
},
{
"epoch": 0.265343793262575,
"grad_norm": 0.3119150400161743,
"learning_rate": 0.00018380522343468532,
"loss": 0.9842,
"step": 4600
},
{
"epoch": 0.265343793262575,
"step": 4600,
"total_flos": 3.2343958172802744e+18,
"train_loss": 0.0,
"train_runtime": 0.0427,
"train_samples_per_second": 9970.556,
"train_steps_per_second": 304.266
}
],
"logging_steps": 5,
"max_steps": 13,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2343958172802744e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}