diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,6489 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.265343793262575,
+  "eval_steps": 500,
+  "global_step": 4600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5.7683433317951084e-05,
+      "grad_norm": 0.3952319025993347,
+      "learning_rate": 1.1534025374855825e-07,
+      "loss": 1.182,
+      "step": 1
+    },
+    {
+      "epoch": 0.0002884171665897554,
+      "grad_norm": 0.3334461748600006,
+      "learning_rate": 5.767012687427913e-07,
+      "loss": 1.0887,
+      "step": 5
+    },
+    {
+      "epoch": 0.0005768343331795108,
+      "grad_norm": 0.41704559326171875,
+      "learning_rate": 1.1534025374855826e-06,
+      "loss": 1.2132,
+      "step": 10
+    },
+    {
+      "epoch": 0.0008652514997692663,
+      "grad_norm": 0.4982852637767792,
+      "learning_rate": 1.7301038062283738e-06,
+      "loss": 1.1888,
+      "step": 15
+    },
+    {
+      "epoch": 0.0011536686663590216,
+      "grad_norm": 0.3702298104763031,
+      "learning_rate": 2.3068050749711653e-06,
+      "loss": 1.2105,
+      "step": 20
+    },
+    {
+      "epoch": 0.001442085832948777,
+      "grad_norm": 0.3640645444393158,
+      "learning_rate": 2.8835063437139563e-06,
+      "loss": 1.1714,
+      "step": 25
+    },
+    {
+      "epoch": 0.0017305029995385325,
+      "grad_norm": 0.31508558988571167,
+      "learning_rate": 3.4602076124567477e-06,
+      "loss": 1.0438,
+      "step": 30
+    },
+    {
+      "epoch": 0.0020189201661282878,
+      "grad_norm": 0.3910152018070221,
+      "learning_rate": 4.036908881199539e-06,
+      "loss": 1.212,
+      "step": 35
+    },
+    {
+      "epoch": 0.0023073373327180432,
+      "grad_norm": 0.32711583375930786,
+      "learning_rate": 4.6136101499423305e-06,
+      "loss": 1.1552,
+      "step": 40
+    },
+    {
+      "epoch": 0.0025957544993077987,
+      "grad_norm": 0.37455540895462036,
+      "learning_rate": 5.190311418685121e-06,
+      "loss": 1.1355,
+      "step": 45
+    },
+    {
+      "epoch": 0.002884171665897554,
+      "grad_norm": 0.32155269384384155,
+      "learning_rate": 5.7670126874279126e-06,
+      "loss": 1.1375,
+      "step": 50
+    },
+    {
+      "epoch": 0.0031725888324873096,
+      "grad_norm": 0.29815641045570374,
+      "learning_rate": 6.3437139561707036e-06,
+      "loss": 1.1193,
+      "step": 55
+    },
+    {
+      "epoch": 0.003461005999077065,
+      "grad_norm": 0.39492201805114746,
+      "learning_rate": 6.920415224913495e-06,
+      "loss": 1.1053,
+      "step": 60
+    },
+    {
+      "epoch": 0.0037494231656668205,
+      "grad_norm": 0.3298701345920563,
+      "learning_rate": 7.497116493656286e-06,
+      "loss": 1.107,
+      "step": 65
+    },
+    {
+      "epoch": 0.0040378403322565756,
+      "grad_norm": 0.3114672005176544,
+      "learning_rate": 8.073817762399077e-06,
+      "loss": 1.0677,
+      "step": 70
+    },
+    {
+      "epoch": 0.0043262574988463314,
+      "grad_norm": 0.3159383535385132,
+      "learning_rate": 8.650519031141868e-06,
+      "loss": 1.0959,
+      "step": 75
+    },
+    {
+      "epoch": 0.0046146746654360865,
+      "grad_norm": 0.2858622074127197,
+      "learning_rate": 9.227220299884661e-06,
+      "loss": 1.0435,
+      "step": 80
+    },
+    {
+      "epoch": 0.004903091832025842,
+      "grad_norm": 0.3337515890598297,
+      "learning_rate": 9.803921568627451e-06,
+      "loss": 0.9889,
+      "step": 85
+    },
+    {
+      "epoch": 0.005191508998615597,
+      "grad_norm": 0.3027825951576233,
+      "learning_rate": 1.0380622837370241e-05,
+      "loss": 1.1145,
+      "step": 90
+    },
+    {
+      "epoch": 0.005479926165205353,
+      "grad_norm": 0.34131115674972534,
+      "learning_rate": 1.0957324106113035e-05,
+      "loss": 1.0596,
+      "step": 95
+    },
+    {
+      "epoch": 0.005768343331795108,
+      "grad_norm": 0.3263566792011261,
+      "learning_rate": 1.1534025374855825e-05,
+      "loss": 0.9887,
+      "step": 100
+    },
+    {
+      "epoch": 0.006056760498384864,
+      "grad_norm": 0.325528085231781,
+      "learning_rate": 1.2110726643598615e-05,
+      "loss": 1.0143,
+      "step": 105
+    },
+    {
+      "epoch": 0.006345177664974619,
+      "grad_norm": 0.3773256242275238,
+      "learning_rate": 1.2687427912341407e-05,
+      "loss": 1.0,
+      "step": 110
+    },
+    {
+      "epoch": 0.006633594831564375,
+      "grad_norm": 0.2968287765979767,
+      "learning_rate": 1.3264129181084197e-05,
+      "loss": 0.9572,
+      "step": 115
+    },
+    {
+      "epoch": 0.00692201199815413,
+      "grad_norm": 0.29874077439308167,
+      "learning_rate": 1.384083044982699e-05,
+      "loss": 1.0344,
+      "step": 120
+    },
+    {
+      "epoch": 0.007210429164743885,
+      "grad_norm": 0.3251142203807831,
+      "learning_rate": 1.4417531718569783e-05,
+      "loss": 1.0183,
+      "step": 125
+    },
+    {
+      "epoch": 0.007498846331333641,
+      "grad_norm": 0.29589974880218506,
+      "learning_rate": 1.4994232987312573e-05,
+      "loss": 1.047,
+      "step": 130
+    },
+    {
+      "epoch": 0.007787263497923396,
+      "grad_norm": 0.3242173194885254,
+      "learning_rate": 1.5570934256055363e-05,
+      "loss": 1.0461,
+      "step": 135
+    },
+    {
+      "epoch": 0.008075680664513151,
+      "grad_norm": 0.31147414445877075,
+      "learning_rate": 1.6147635524798155e-05,
+      "loss": 1.047,
+      "step": 140
+    },
+    {
+      "epoch": 0.008364097831102908,
+      "grad_norm": 0.31779709458351135,
+      "learning_rate": 1.6724336793540947e-05,
+      "loss": 1.0784,
+      "step": 145
+    },
+    {
+      "epoch": 0.008652514997692663,
+      "grad_norm": 0.3391679525375366,
+      "learning_rate": 1.7301038062283735e-05,
+      "loss": 1.0576,
+      "step": 150
+    },
+    {
+      "epoch": 0.008940932164282418,
+      "grad_norm": 0.3228215277194977,
+      "learning_rate": 1.787773933102653e-05,
+      "loss": 1.0145,
+      "step": 155
+    },
+    {
+      "epoch": 0.009229349330872173,
+      "grad_norm": 0.30271971225738525,
+      "learning_rate": 1.8454440599769322e-05,
+      "loss": 0.9874,
+      "step": 160
+    },
+    {
+      "epoch": 0.00951776649746193,
+      "grad_norm": 0.30643004179000854,
+      "learning_rate": 1.903114186851211e-05,
+      "loss": 0.9733,
+      "step": 165
+    },
+    {
+      "epoch": 0.009806183664051685,
+      "grad_norm": 0.36777183413505554,
+      "learning_rate": 1.9607843137254903e-05,
+      "loss": 1.0242,
+      "step": 170
+    },
+    {
+      "epoch": 0.01009460083064144,
+      "grad_norm": 0.3419516086578369,
+      "learning_rate": 2.0184544405997694e-05,
+      "loss": 1.1211,
+      "step": 175
+    },
+    {
+      "epoch": 0.010383017997231195,
+      "grad_norm": 0.3591030538082123,
+      "learning_rate": 2.0761245674740483e-05,
+      "loss": 1.0323,
+      "step": 180
+    },
+    {
+      "epoch": 0.01067143516382095,
+      "grad_norm": 0.38365352153778076,
+      "learning_rate": 2.1337946943483278e-05,
+      "loss": 0.9613,
+      "step": 185
+    },
+    {
+      "epoch": 0.010959852330410707,
+      "grad_norm": 0.3436645269393921,
+      "learning_rate": 2.191464821222607e-05,
+      "loss": 1.0753,
+      "step": 190
+    },
+    {
+      "epoch": 0.011248269497000462,
+      "grad_norm": 0.341776967048645,
+      "learning_rate": 2.249134948096886e-05,
+      "loss": 1.064,
+      "step": 195
+    },
+    {
+      "epoch": 0.011536686663590217,
+      "grad_norm": 0.38297685980796814,
+      "learning_rate": 2.306805074971165e-05,
+      "loss": 1.0105,
+      "step": 200
+    },
+    {
+      "epoch": 0.011825103830179972,
+      "grad_norm": 0.3430030643939972,
+      "learning_rate": 2.3644752018454442e-05,
+      "loss": 1.0103,
+      "step": 205
+    },
+    {
+      "epoch": 0.012113520996769728,
+      "grad_norm": 0.3319534361362457,
+      "learning_rate": 2.422145328719723e-05,
+      "loss": 1.0671,
+      "step": 210
+    },
+    {
+      "epoch": 0.012401938163359483,
+      "grad_norm": 0.3615305423736572,
+      "learning_rate": 2.4798154555940022e-05,
+      "loss": 0.9236,
+      "step": 215
+    },
+    {
+      "epoch": 0.012690355329949238,
+      "grad_norm": 0.4457886517047882,
+      "learning_rate": 2.5374855824682814e-05,
+      "loss": 1.0461,
+      "step": 220
+    },
+    {
+      "epoch": 0.012978772496538993,
+      "grad_norm": 0.7715578675270081,
+      "learning_rate": 2.5951557093425606e-05,
+      "loss": 1.0131,
+      "step": 225
+    },
+    {
+      "epoch": 0.01326718966312875,
+      "grad_norm": 0.4368738830089569,
+      "learning_rate": 2.6528258362168395e-05,
+      "loss": 1.0255,
+      "step": 230
+    },
+    {
+      "epoch": 0.013555606829718505,
+      "grad_norm": 0.38978299498558044,
+      "learning_rate": 2.7104959630911193e-05,
+      "loss": 0.9773,
+      "step": 235
+    },
+    {
+      "epoch": 0.01384402399630826,
+      "grad_norm": 0.35930851101875305,
+      "learning_rate": 2.768166089965398e-05,
+      "loss": 1.0043,
+      "step": 240
+    },
+    {
+      "epoch": 0.014132441162898015,
+      "grad_norm": 0.37871646881103516,
+      "learning_rate": 2.8258362168396773e-05,
+      "loss": 1.0082,
+      "step": 245
+    },
+    {
+      "epoch": 0.01442085832948777,
+      "grad_norm": 0.3493201732635498,
+      "learning_rate": 2.8835063437139565e-05,
+      "loss": 0.9856,
+      "step": 250
+    },
+    {
+      "epoch": 0.014709275496077527,
+      "grad_norm": 0.364734947681427,
+      "learning_rate": 2.9411764705882354e-05,
+      "loss": 1.0379,
+      "step": 255
+    },
+    {
+      "epoch": 0.014997692662667282,
+      "grad_norm": 0.3644263446331024,
+      "learning_rate": 2.9988465974625146e-05,
+      "loss": 1.006,
+      "step": 260
+    },
+    {
+      "epoch": 0.015286109829257037,
+      "grad_norm": 0.3671714961528778,
+      "learning_rate": 3.0565167243367934e-05,
+      "loss": 0.9499,
+      "step": 265
+    },
+    {
+      "epoch": 0.015574526995846792,
+      "grad_norm": 0.384804904460907,
+      "learning_rate": 3.1141868512110726e-05,
+      "loss": 1.0438,
+      "step": 270
+    },
+    {
+      "epoch": 0.015862944162436547,
+      "grad_norm": 0.36940938234329224,
+      "learning_rate": 3.171856978085352e-05,
+      "loss": 0.9476,
+      "step": 275
+    },
+    {
+      "epoch": 0.016151361329026302,
+      "grad_norm": 0.38267725706100464,
+      "learning_rate": 3.229527104959631e-05,
+      "loss": 0.9689,
+      "step": 280
+    },
+    {
+      "epoch": 0.01643977849561606,
+      "grad_norm": 0.3497903347015381,
+      "learning_rate": 3.28719723183391e-05,
+      "loss": 0.9143,
+      "step": 285
+    },
+    {
+      "epoch": 0.016728195662205816,
+      "grad_norm": 0.3465529978275299,
+      "learning_rate": 3.344867358708189e-05,
+      "loss": 0.9616,
+      "step": 290
+    },
+    {
+      "epoch": 0.01701661282879557,
+      "grad_norm": 0.3548210859298706,
+      "learning_rate": 3.4025374855824685e-05,
+      "loss": 0.9695,
+      "step": 295
+    },
+    {
+      "epoch": 0.017305029995385326,
+      "grad_norm": 0.3769378662109375,
+      "learning_rate": 3.460207612456747e-05,
+      "loss": 0.963,
+      "step": 300
+    },
+    {
+      "epoch": 0.01759344716197508,
+      "grad_norm": 0.3663967549800873,
+      "learning_rate": 3.517877739331027e-05,
+      "loss": 1.0924,
+      "step": 305
+    },
+    {
+      "epoch": 0.017881864328564836,
+      "grad_norm": 0.38498544692993164,
+      "learning_rate": 3.575547866205306e-05,
+      "loss": 1.0481,
+      "step": 310
+    },
+    {
+      "epoch": 0.01817028149515459,
+      "grad_norm": 0.3465900123119354,
+      "learning_rate": 3.633217993079585e-05,
+      "loss": 1.0396,
+      "step": 315
+    },
+    {
+      "epoch": 0.018458698661744346,
+      "grad_norm": 0.3498382270336151,
+      "learning_rate": 3.6908881199538644e-05,
+      "loss": 1.0005,
+      "step": 320
+    },
+    {
+      "epoch": 0.0187471158283341,
+      "grad_norm": 0.3397336006164551,
+      "learning_rate": 3.748558246828143e-05,
+      "loss": 0.9682,
+      "step": 325
+    },
+    {
+      "epoch": 0.01903553299492386,
+      "grad_norm": 0.33760690689086914,
+      "learning_rate": 3.806228373702422e-05,
+      "loss": 0.9975,
+      "step": 330
+    },
+    {
+      "epoch": 0.019323950161513614,
+      "grad_norm": 0.32710301876068115,
+      "learning_rate": 3.863898500576701e-05,
+      "loss": 0.985,
+      "step": 335
+    },
+    {
+      "epoch": 0.01961236732810337,
+      "grad_norm": 0.40678462386131287,
+      "learning_rate": 3.9215686274509805e-05,
+      "loss": 0.9664,
+      "step": 340
+    },
+    {
+      "epoch": 0.019900784494693124,
+      "grad_norm": 0.38339948654174805,
+      "learning_rate": 3.97923875432526e-05,
+      "loss": 0.9962,
+      "step": 345
+    },
+    {
+      "epoch": 0.02018920166128288,
+      "grad_norm": 0.3516389727592468,
+      "learning_rate": 4.036908881199539e-05,
+      "loss": 0.9385,
+      "step": 350
+    },
+    {
+      "epoch": 0.020477618827872635,
+      "grad_norm": 0.3469911515712738,
+      "learning_rate": 4.094579008073818e-05,
+      "loss": 0.9795,
+      "step": 355
+    },
+    {
+      "epoch": 0.02076603599446239,
+      "grad_norm": 0.351566344499588,
+      "learning_rate": 4.1522491349480966e-05,
+      "loss": 1.0131,
+      "step": 360
+    },
+    {
+      "epoch": 0.021054453161052145,
+      "grad_norm": 0.3254294991493225,
+      "learning_rate": 4.209919261822376e-05,
+      "loss": 0.9784,
+      "step": 365
+    },
+    {
+      "epoch": 0.0213428703276419,
+      "grad_norm": 0.352115660905838,
+      "learning_rate": 4.2675893886966556e-05,
+      "loss": 1.0013,
+      "step": 370
+    },
+    {
+      "epoch": 0.021631287494231658,
+      "grad_norm": 0.35616523027420044,
+      "learning_rate": 4.325259515570935e-05,
+      "loss": 1.0209,
+      "step": 375
+    },
+    {
+      "epoch": 0.021919704660821413,
+      "grad_norm": 0.3402170240879059,
+      "learning_rate": 4.382929642445214e-05,
+      "loss": 0.976,
+      "step": 380
+    },
+    {
+      "epoch": 0.022208121827411168,
+      "grad_norm": 0.30762144923210144,
+      "learning_rate": 4.440599769319493e-05,
+      "loss": 0.8757,
+      "step": 385
+    },
+    {
+      "epoch": 0.022496538994000923,
+      "grad_norm": 0.33472269773483276,
+      "learning_rate": 4.498269896193772e-05,
+      "loss": 1.0687,
+      "step": 390
+    },
+    {
+      "epoch": 0.022784956160590678,
+      "grad_norm": 0.3568858802318573,
+      "learning_rate": 4.555940023068051e-05,
+      "loss": 1.0279,
+      "step": 395
+    },
+    {
+      "epoch": 0.023073373327180433,
+      "grad_norm": 0.3303862512111664,
+      "learning_rate": 4.61361014994233e-05,
+      "loss": 1.0061,
+      "step": 400
+    },
+    {
+      "epoch": 0.023361790493770188,
+      "grad_norm": 0.3586498498916626,
+      "learning_rate": 4.671280276816609e-05,
+      "loss": 1.0007,
+      "step": 405
+    },
+    {
+      "epoch": 0.023650207660359943,
+      "grad_norm": 0.34804537892341614,
+      "learning_rate": 4.7289504036908884e-05,
+      "loss": 0.9913,
+      "step": 410
+    },
+    {
+      "epoch": 0.0239386248269497,
+      "grad_norm": 0.33361154794692993,
+      "learning_rate": 4.7866205305651676e-05,
+      "loss": 0.9615,
+      "step": 415
+    },
+    {
+      "epoch": 0.024227041993539457,
+      "grad_norm": 0.30743229389190674,
+      "learning_rate": 4.844290657439446e-05,
+      "loss": 1.0062,
+      "step": 420
+    },
+    {
+      "epoch": 0.024515459160129212,
+      "grad_norm": 0.3414464294910431,
+      "learning_rate": 4.901960784313725e-05,
+      "loss": 1.0266,
+      "step": 425
+    },
+    {
+      "epoch": 0.024803876326718967,
+      "grad_norm": 0.311254620552063,
+      "learning_rate": 4.9596309111880045e-05,
+      "loss": 0.9525,
+      "step": 430
+    },
+    {
+      "epoch": 0.025092293493308722,
+      "grad_norm": 0.3211973011493683,
+      "learning_rate": 5.017301038062284e-05,
+      "loss": 1.0204,
+      "step": 435
+    },
+    {
+      "epoch": 0.025380710659898477,
+      "grad_norm": 0.32264503836631775,
+      "learning_rate": 5.074971164936563e-05,
+      "loss": 0.9187,
+      "step": 440
+    },
+    {
+      "epoch": 0.025669127826488232,
+      "grad_norm": 0.3149093985557556,
+      "learning_rate": 5.132641291810843e-05,
+      "loss": 1.0324,
+      "step": 445
+    },
+    {
+      "epoch": 0.025957544993077987,
+      "grad_norm": 0.31910112500190735,
+      "learning_rate": 5.190311418685121e-05,
+      "loss": 0.9924,
+      "step": 450
+    },
+    {
+      "epoch": 0.026245962159667742,
+      "grad_norm": 0.329057514667511,
+      "learning_rate": 5.2479815455594004e-05,
+      "loss": 1.0235,
+      "step": 455
+    },
+    {
+      "epoch": 0.0265343793262575,
+      "grad_norm": 0.32927969098091125,
+      "learning_rate": 5.305651672433679e-05,
+      "loss": 0.9986,
+      "step": 460
+    },
+    {
+      "epoch": 0.026822796492847256,
+      "grad_norm": 0.30113425850868225,
+      "learning_rate": 5.363321799307959e-05,
+      "loss": 0.9996,
+      "step": 465
+    },
+    {
+      "epoch": 0.02711121365943701,
+      "grad_norm": 0.31802427768707275,
+      "learning_rate": 5.4209919261822386e-05,
+      "loss": 0.903,
+      "step": 470
+    },
+    {
+      "epoch": 0.027399630826026766,
+      "grad_norm": 0.31492453813552856,
+      "learning_rate": 5.478662053056517e-05,
+      "loss": 0.9627,
+      "step": 475
+    },
+    {
+      "epoch": 0.02768804799261652,
+      "grad_norm": 0.32527875900268555,
+      "learning_rate": 5.536332179930796e-05,
+      "loss": 0.9842,
+      "step": 480
+    },
+    {
+      "epoch": 0.027976465159206276,
+      "grad_norm": 0.3000083267688751,
+      "learning_rate": 5.594002306805075e-05,
+      "loss": 0.9275,
+      "step": 485
+    },
+    {
+      "epoch": 0.02826488232579603,
+      "grad_norm": 0.30580878257751465,
+      "learning_rate": 5.651672433679355e-05,
+      "loss": 1.0111,
+      "step": 490
+    },
+    {
+      "epoch": 0.028553299492385786,
+      "grad_norm": 0.3029692769050598,
+      "learning_rate": 5.709342560553633e-05,
+      "loss": 0.9997,
+      "step": 495
+    },
+    {
+      "epoch": 0.02884171665897554,
+      "grad_norm": 0.29320913553237915,
+      "learning_rate": 5.767012687427913e-05,
+      "loss": 0.9728,
+      "step": 500
+    },
+    {
+      "epoch": 0.0291301338255653,
+      "grad_norm": 0.27277612686157227,
+      "learning_rate": 5.8246828143021916e-05,
+      "loss": 0.9481,
+      "step": 505
+    },
+    {
+      "epoch": 0.029418550992155054,
+      "grad_norm": 0.3065517544746399,
+      "learning_rate": 5.882352941176471e-05,
+      "loss": 1.0068,
+      "step": 510
+    },
+    {
+      "epoch": 0.02970696815874481,
+      "grad_norm": 0.30595871806144714,
+      "learning_rate": 5.940023068050749e-05,
+      "loss": 1.0394,
+      "step": 515
+    },
+    {
+      "epoch": 0.029995385325334564,
+      "grad_norm": 0.2905437648296356,
+      "learning_rate": 5.997693194925029e-05,
+      "loss": 0.8914,
+      "step": 520
+    },
+    {
+      "epoch": 0.03028380249192432,
+      "grad_norm": 0.30169710516929626,
+      "learning_rate": 6.0553633217993076e-05,
+      "loss": 1.0714,
+      "step": 525
+    },
+    {
+      "epoch": 0.030572219658514074,
+      "grad_norm": 0.30245259404182434,
+      "learning_rate": 6.113033448673587e-05,
+      "loss": 0.9748,
+      "step": 530
+    },
+    {
+      "epoch": 0.03086063682510383,
+      "grad_norm": 0.31071239709854126,
+      "learning_rate": 6.170703575547867e-05,
+      "loss": 1.0307,
+      "step": 535
+    },
+    {
+      "epoch": 0.031149053991693584,
+      "grad_norm": 0.301554799079895,
+      "learning_rate": 6.228373702422145e-05,
+      "loss": 0.9904,
+      "step": 540
+    },
+    {
+      "epoch": 0.03143747115828334,
+      "grad_norm": 0.29832157492637634,
+      "learning_rate": 6.286043829296425e-05,
+      "loss": 0.965,
+      "step": 545
+    },
+    {
+      "epoch": 0.031725888324873094,
+      "grad_norm": 0.2960033118724823,
+      "learning_rate": 6.343713956170704e-05,
+      "loss": 0.9661,
+      "step": 550
+    },
+    {
+      "epoch": 0.03201430549146285,
+      "grad_norm": 0.2793910503387451,
+      "learning_rate": 6.401384083044983e-05,
+      "loss": 0.9691,
+      "step": 555
+    },
+    {
+      "epoch": 0.032302722658052604,
+      "grad_norm": 0.2931232750415802,
+      "learning_rate": 6.459054209919262e-05,
+      "loss": 1.0152,
+      "step": 560
+    },
+    {
+      "epoch": 0.03259113982464236,
+      "grad_norm": 0.29276397824287415,
+      "learning_rate": 6.516724336793542e-05,
+      "loss": 0.9644,
+      "step": 565
+    },
+    {
+      "epoch": 0.03287955699123212,
+      "grad_norm": 0.2859160304069519,
+      "learning_rate": 6.57439446366782e-05,
+      "loss": 0.8926,
+      "step": 570
+    },
+    {
+      "epoch": 0.033167974157821876,
+      "grad_norm": 0.2981337308883667,
+      "learning_rate": 6.6320645905421e-05,
+      "loss": 0.9805,
+      "step": 575
+    },
+    {
+      "epoch": 0.03345639132441163,
+      "grad_norm": 0.28318145871162415,
+      "learning_rate": 6.689734717416379e-05,
+      "loss": 0.9828,
+      "step": 580
+    },
+    {
+      "epoch": 0.033744808491001387,
+      "grad_norm": 0.2922738194465637,
+      "learning_rate": 6.747404844290659e-05,
+      "loss": 0.9495,
+      "step": 585
+    },
+    {
+      "epoch": 0.03403322565759114,
+      "grad_norm": 0.3307567536830902,
+      "learning_rate": 6.805074971164937e-05,
+      "loss": 0.975,
+      "step": 590
+    },
+    {
+      "epoch": 0.0343216428241809,
+      "grad_norm": 0.2792339622974396,
+      "learning_rate": 6.862745098039216e-05,
+      "loss": 1.0021,
+      "step": 595
+    },
+    {
+      "epoch": 0.03461005999077065,
+      "grad_norm": 0.26365357637405396,
+      "learning_rate": 6.920415224913494e-05,
+      "loss": 1.0316,
+      "step": 600
+    },
+    {
+      "epoch": 0.03489847715736041,
+      "grad_norm": 0.285918265581131,
+      "learning_rate": 6.978085351787774e-05,
+      "loss": 1.0025,
+      "step": 605
+    },
+    {
+      "epoch": 0.03518689432395016,
+      "grad_norm": 0.290382444858551,
+      "learning_rate": 7.035755478662054e-05,
+      "loss": 1.0198,
+      "step": 610
+    },
+    {
+      "epoch": 0.03547531149053992,
+      "grad_norm": 0.2909998595714569,
+      "learning_rate": 7.093425605536332e-05,
+      "loss": 1.0522,
+      "step": 615
+    },
+    {
+      "epoch": 0.03576372865712967,
+      "grad_norm": 0.2691628038883209,
+      "learning_rate": 7.151095732410612e-05,
+      "loss": 1.0285,
+      "step": 620
+    },
+    {
+      "epoch": 0.03605214582371943,
+      "grad_norm": 0.2793739140033722,
+      "learning_rate": 7.20876585928489e-05,
+      "loss": 0.9431,
+      "step": 625
+    },
+    {
+      "epoch": 0.03634056299030918,
+      "grad_norm": 0.28252139687538147,
+      "learning_rate": 7.26643598615917e-05,
+      "loss": 0.954,
+      "step": 630
+    },
+    {
+      "epoch": 0.03662898015689894,
+      "grad_norm": 0.2551520764827728,
+      "learning_rate": 7.324106113033449e-05,
+      "loss": 0.9477,
+      "step": 635
+    },
+    {
+      "epoch": 0.03691739732348869,
+      "grad_norm": 0.2769528925418854,
+      "learning_rate": 7.381776239907729e-05,
+      "loss": 1.0228,
+      "step": 640
+    },
+    {
+      "epoch": 0.03720581449007845,
+      "grad_norm": 0.26769739389419556,
+      "learning_rate": 7.439446366782007e-05,
+      "loss": 0.9844,
+      "step": 645
+    },
+    {
+      "epoch": 0.0374942316566682,
+      "grad_norm": 0.2822119891643524,
+      "learning_rate": 7.497116493656286e-05,
+      "loss": 1.0532,
+      "step": 650
+    },
+    {
+      "epoch": 0.03778264882325796,
+      "grad_norm": 0.2787601053714752,
+      "learning_rate": 7.554786620530564e-05,
+      "loss": 1.0154,
+      "step": 655
+    },
+    {
+      "epoch": 0.03807106598984772,
+      "grad_norm": 0.27694109082221985,
+      "learning_rate": 7.612456747404844e-05,
+      "loss": 0.9775,
+      "step": 660
+    },
+    {
+      "epoch": 0.038359483156437474,
+      "grad_norm": 0.4112897217273712,
+      "learning_rate": 7.670126874279123e-05,
+      "loss": 1.0071,
+      "step": 665
+    },
+    {
+      "epoch": 0.03864790032302723,
+      "grad_norm": 0.26005199551582336,
+      "learning_rate": 7.727797001153403e-05,
+      "loss": 0.9632,
+      "step": 670
+    },
+    {
+      "epoch": 0.038936317489616984,
+      "grad_norm": 0.25056615471839905,
+      "learning_rate": 7.785467128027682e-05,
+      "loss": 0.9773,
+      "step": 675
+    },
+    {
+      "epoch": 0.03922473465620674,
+      "grad_norm": 0.27164942026138306,
+      "learning_rate": 7.843137254901961e-05,
+      "loss": 0.9927,
+      "step": 680
+    },
+    {
+      "epoch": 0.039513151822796494,
+      "grad_norm": 0.26238757371902466,
+      "learning_rate": 7.900807381776241e-05,
+      "loss": 0.9612,
+      "step": 685
+    },
+    {
+      "epoch": 0.03980156898938625,
+      "grad_norm": 0.28629186749458313,
+      "learning_rate": 7.95847750865052e-05,
+      "loss": 0.9579,
+      "step": 690
+    },
+    {
+      "epoch": 0.040089986155976004,
+      "grad_norm": 0.2650497555732727,
+      "learning_rate": 8.016147635524799e-05,
+      "loss": 0.9667,
+      "step": 695
+    },
+    {
+      "epoch": 0.04037840332256576,
+      "grad_norm": 0.26934972405433655,
+      "learning_rate": 8.073817762399078e-05,
+      "loss": 0.9257,
+      "step": 700
+    },
+    {
+      "epoch": 0.040666820489155514,
+      "grad_norm": 0.27391955256462097,
+      "learning_rate": 8.131487889273358e-05,
+      "loss": 1.0725,
+      "step": 705
+    },
+    {
+      "epoch": 0.04095523765574527,
+      "grad_norm": 0.2905539274215698,
+      "learning_rate": 8.189158016147636e-05,
+      "loss": 0.9979,
+      "step": 710
+    },
+    {
+      "epoch": 0.041243654822335024,
+      "grad_norm": 0.26050031185150146,
+      "learning_rate": 8.246828143021915e-05,
+      "loss": 0.9901,
+      "step": 715
+    },
+    {
+      "epoch": 0.04153207198892478,
+      "grad_norm": 0.4822568893432617,
+      "learning_rate": 8.304498269896193e-05,
+      "loss": 0.9753,
+      "step": 720
+    },
+    {
+      "epoch": 0.041820489155514534,
+      "grad_norm": 0.27065780758857727,
+      "learning_rate": 8.362168396770473e-05,
+      "loss": 0.961,
+      "step": 725
+    },
+    {
+      "epoch": 0.04210890632210429,
+      "grad_norm": 0.27039390802383423,
+      "learning_rate": 8.419838523644751e-05,
+      "loss": 1.0218,
+      "step": 730
+    },
+    {
+      "epoch": 0.042397323488694044,
+      "grad_norm": 0.267991304397583,
+      "learning_rate": 8.477508650519031e-05,
+      "loss": 0.8937,
+      "step": 735
+    },
+    {
+      "epoch": 0.0426857406552838,
+      "grad_norm": 0.2698671519756317,
+      "learning_rate": 8.535178777393311e-05,
+      "loss": 1.0203,
+      "step": 740
+    },
+    {
+      "epoch": 0.04297415782187356,
+      "grad_norm": 0.25605538487434387,
+      "learning_rate": 8.59284890426759e-05,
+      "loss": 1.0398,
+      "step": 745
+    },
+    {
+      "epoch": 0.043262574988463316,
+      "grad_norm": 0.26644793152809143,
+      "learning_rate": 8.65051903114187e-05,
+      "loss": 1.0212,
+      "step": 750
+    },
+    {
+      "epoch": 0.04355099215505307,
+      "grad_norm": 0.2879778742790222,
+      "learning_rate": 8.708189158016148e-05,
+      "loss": 0.9854,
+      "step": 755
+    },
+    {
+      "epoch": 0.043839409321642826,
+      "grad_norm": 0.26750192046165466,
+      "learning_rate": 8.765859284890428e-05,
+      "loss": 1.0168,
+      "step": 760
+    },
+    {
+      "epoch": 0.04412782648823258,
+      "grad_norm": 0.2743099331855774,
+      "learning_rate": 8.823529411764706e-05,
+      "loss": 0.9447,
+      "step": 765
+    },
+    {
+      "epoch": 0.044416243654822336,
+      "grad_norm": 0.27284887433052063,
+      "learning_rate": 8.881199538638986e-05,
+      "loss": 1.016,
+      "step": 770
+    },
+    {
+      "epoch": 0.04470466082141209,
+      "grad_norm": 0.26251500844955444,
+      "learning_rate": 8.938869665513265e-05,
+      "loss": 0.9275,
+      "step": 775
+    },
+    {
+      "epoch": 0.044993077988001846,
+      "grad_norm": 0.26898619532585144,
+      "learning_rate": 8.996539792387543e-05,
+      "loss": 0.9258,
+      "step": 780
+    },
+    {
+      "epoch": 0.0452814951545916,
+      "grad_norm": 0.2636859118938446,
+      "learning_rate": 9.054209919261822e-05,
+      "loss": 1.1368,
+      "step": 785
+    },
+    {
+      "epoch": 0.045569912321181356,
+      "grad_norm": 0.25750333070755005,
+      "learning_rate": 9.111880046136102e-05,
+      "loss": 0.9829,
+      "step": 790
+    },
+    {
+      "epoch": 0.04585832948777111,
+      "grad_norm": 0.26251962780952454,
+      "learning_rate": 9.16955017301038e-05,
+      "loss": 1.0722,
+      "step": 795
+    },
+    {
+      "epoch": 0.046146746654360866,
+      "grad_norm": 0.24186044931411743,
+      "learning_rate": 9.22722029988466e-05,
+      "loss": 0.9681,
+      "step": 800
+    },
+    {
+      "epoch": 0.04643516382095062,
+      "grad_norm": 0.2631891965866089,
+      "learning_rate": 9.28489042675894e-05,
+      "loss": 1.0082,
+      "step": 805
+    },
+    {
+      "epoch": 0.046723580987540377,
+      "grad_norm": 0.25769105553627014,
+      "learning_rate": 9.342560553633218e-05,
+      "loss": 0.9419,
+      "step": 810
+    },
+    {
+      "epoch": 0.04701199815413013,
+      "grad_norm": 0.26983222365379333,
+      "learning_rate": 9.400230680507498e-05,
+      "loss": 0.9698,
+      "step": 815
+    },
+    {
+      "epoch": 0.04730041532071989,
+      "grad_norm": 0.268951952457428,
+      "learning_rate": 9.457900807381777e-05,
+      "loss": 1.0199,
+      "step": 820
+    },
+    {
+      "epoch": 0.04758883248730964,
+      "grad_norm": 0.2618368864059448,
+      "learning_rate": 9.515570934256057e-05,
+      "loss": 1.0474,
+      "step": 825
+    },
+    {
+      "epoch": 0.0478772496538994,
+      "grad_norm": 0.2535788118839264,
+      "learning_rate": 9.573241061130335e-05,
+      "loss": 1.051,
+      "step": 830
+    },
+    {
+      "epoch": 0.04816566682048916,
+      "grad_norm": 0.24797338247299194,
+      "learning_rate": 9.630911188004614e-05,
+      "loss": 0.9787,
+      "step": 835
+    },
+    {
+      "epoch": 0.048454083987078914,
+      "grad_norm": 0.2542094886302948,
+      "learning_rate": 9.688581314878892e-05,
+      "loss": 1.0301,
+      "step": 840
+    },
+    {
+      "epoch": 0.04874250115366867,
+      "grad_norm": 0.34137168526649475,
+      "learning_rate": 9.746251441753172e-05,
+      "loss": 0.8916,
+      "step": 845
+    },
+    {
+      "epoch": 0.049030918320258424,
+      "grad_norm": 0.25905948877334595,
+      "learning_rate": 9.80392156862745e-05,
+      "loss": 1.0086,
+      "step": 850
+    },
+    {
+      "epoch": 0.04931933548684818,
+      "grad_norm": 0.24208292365074158,
+      "learning_rate": 9.86159169550173e-05,
+      "loss": 0.962,
+      "step": 855
+    },
+    {
+      "epoch": 0.049607752653437934,
+      "grad_norm": 0.2500937879085541,
+      "learning_rate": 9.919261822376009e-05,
+      "loss": 0.983,
+      "step": 860
+    },
+    {
+      "epoch": 0.04989616982002769,
+      "grad_norm": 0.2481968104839325,
+      "learning_rate": 9.976931949250289e-05,
+      "loss": 0.9798,
+      "step": 865
+    },
+    {
+      "epoch": 0.050184586986617444,
+      "grad_norm": 0.25975415110588074,
+      "learning_rate": 0.00010034602076124569,
+      "loss": 0.9621,
+      "step": 870
+    },
+    {
+      "epoch": 0.0504730041532072,
+      "grad_norm": 0.25389575958251953,
+      "learning_rate": 0.00010092272202998847,
+      "loss": 0.9959,
+      "step": 875
+    },
+    {
+      "epoch": 0.050761421319796954,
+      "grad_norm": 0.26200932264328003,
+      "learning_rate": 0.00010149942329873126,
+      "loss": 0.9432,
+      "step": 880
+    },
+    {
+      "epoch": 0.05104983848638671,
+      "grad_norm": 0.25433865189552307,
+      "learning_rate": 0.00010207612456747407,
+      "loss": 1.0272,
+      "step": 885
+    },
+    {
+      "epoch": 0.051338255652976464,
+      "grad_norm": 0.29402443766593933,
+      "learning_rate": 0.00010265282583621685,
+      "loss": 1.018,
+      "step": 890
+    },
+    {
+      "epoch": 0.05162667281956622,
+      "grad_norm": 0.2625313699245453,
+      "learning_rate": 0.00010322952710495964,
+      "loss": 1.0326,
+      "step": 895
+    },
+    {
+      "epoch": 0.051915089986155974,
+      "grad_norm": 0.2682657241821289,
+      "learning_rate": 0.00010380622837370242,
+      "loss": 1.0215,
+      "step": 900
+    },
+    {
+      "epoch": 0.05220350715274573,
+      "grad_norm": 0.27114447951316833,
+      "learning_rate": 0.00010438292964244522,
+      "loss": 0.9736,
+      "step": 905
+    },
+    {
+      "epoch": 0.052491924319335484,
+      "grad_norm": 0.2469518631696701,
+      "learning_rate": 0.00010495963091118801,
+      "loss": 0.93,
+      "step": 910
+    },
+    {
+      "epoch": 0.05278034148592524,
+      "grad_norm": 0.262253999710083,
+      "learning_rate": 0.00010553633217993079,
+      "loss": 0.9477,
+      "step": 915
+    },
+    {
+      "epoch": 0.053068758652515,
+      "grad_norm": 0.25354915857315063,
+      "learning_rate": 0.00010611303344867358,
+      "loss": 0.9926,
+      "step": 920
+    },
+    {
+      "epoch": 0.053357175819104756,
+      "grad_norm": 0.24856913089752197,
+      "learning_rate": 0.00010668973471741639,
+      "loss": 0.9726,
+      "step": 925
+    },
+    {
+      "epoch": 0.05364559298569451,
+      "grad_norm": 0.24939557909965515,
+      "learning_rate": 0.00010726643598615918,
+      "loss": 0.9575,
+      "step": 930
+    },
+    {
+      "epoch": 0.053934010152284266,
+      "grad_norm": 0.2722608745098114,
+      "learning_rate": 0.00010784313725490196,
+      "loss": 1.0017,
+      "step": 935
+    },
+    {
+      "epoch": 0.05422242731887402,
+      "grad_norm": 0.25203198194503784,
+      "learning_rate": 0.00010841983852364477,
+      "loss": 0.9141,
+      "step": 940
+    },
+    {
+      "epoch": 0.054510844485463776,
+      "grad_norm": 0.2586802840232849,
+      "learning_rate": 0.00010899653979238756,
+      "loss": 1.0066,
+      "step": 945
+    },
+    {
+      "epoch": 0.05479926165205353,
+      "grad_norm": 0.24033570289611816,
+      "learning_rate": 0.00010957324106113034,
+      "loss": 1.0113,
+      "step": 950
+    },
+    {
+      "epoch": 0.055087678818643286,
+      "grad_norm": 0.2373732328414917,
+      "learning_rate": 0.00011014994232987313,
+      "loss": 1.0172,
+      "step": 955
+    },
+    {
+      "epoch": 0.05537609598523304,
+      "grad_norm": 0.25045233964920044,
+      "learning_rate": 0.00011072664359861593,
+      "loss": 0.9548,
+      "step": 960
+    },
+    {
+      "epoch": 0.055664513151822796,
+      "grad_norm": 0.25307127833366394,
+      "learning_rate": 0.00011130334486735871,
+      "loss": 0.8803,
+      "step": 965
+    },
+    {
+      "epoch": 0.05595293031841255,
+      "grad_norm": 0.2580971121788025,
+      "learning_rate": 0.0001118800461361015,
+      "loss": 1.0257,
+      "step": 970
+    },
+    {
+      "epoch": 0.056241347485002306,
+      "grad_norm": 0.3492274284362793,
+      "learning_rate": 0.00011245674740484428,
+      "loss": 0.9915,
+      "step": 975
+    },
+    {
+      "epoch": 0.05652976465159206,
+      "grad_norm": 0.3969261944293976,
+      "learning_rate": 0.0001130334486735871,
+      "loss": 0.9871,
+      "step": 980
+    },
+    {
+      "epoch": 0.056818181818181816,
+      "grad_norm": 0.2512189447879791,
+      "learning_rate": 0.00011361014994232988,
+      "loss": 0.9999,
+      "step": 985
+    },
+    {
+      "epoch": 0.05710659898477157,
+      "grad_norm": 0.24583379924297333,
+      "learning_rate": 0.00011418685121107266,
+      "loss": 1.019,
+      "step": 990
+    },
+    {
+      "epoch": 0.057395016151361326,
+      "grad_norm": 0.23418952524662018,
+      "learning_rate": 0.00011476355247981545,
+      "loss": 0.9976,
+      "step": 995
+    },
+    {
+      "epoch": 0.05768343331795108,
+      "grad_norm": 0.24816179275512695,
+      "learning_rate": 0.00011534025374855826,
+      "loss": 0.9787,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05797185048454084,
+      "grad_norm": 0.238878071308136,
+      "learning_rate": 0.00011591695501730105,
+      "loss": 0.9831,
+      "step": 1005
+    },
+    {
+      "epoch": 0.0582602676511306,
+      "grad_norm": 0.240176260471344,
+      "learning_rate": 0.00011649365628604383,
+      "loss": 0.9604,
+      "step": 1010
+    },
+    {
+      "epoch": 0.05854868481772035,
+      "grad_norm": 0.24366143345832825,
+      "learning_rate": 0.00011707035755478663,
+      "loss": 1.0633,
+      "step": 1015
+    },
+    {
+      "epoch": 0.05883710198431011,
+      "grad_norm": 0.24254244565963745,
+      "learning_rate": 0.00011764705882352942,
+      "loss": 1.0299,
+      "step": 1020
+    },
+    {
+      "epoch": 0.05912551915089986,
+      "grad_norm": 0.2483944445848465,
+      "learning_rate": 0.0001182237600922722,
+      "loss": 1.0325,
+      "step": 1025
+    },
+    {
+      "epoch": 0.05941393631748962,
+      "grad_norm": 0.23639345169067383,
+      "learning_rate": 0.00011880046136101499,
+      "loss": 0.9192,
+      "step": 1030
+    },
+    {
+      "epoch": 0.059702353484079373,
+      "grad_norm": 0.26320794224739075,
+      "learning_rate": 0.0001193771626297578,
+      "loss": 0.973,
+      "step": 1035
+    },
+    {
+      "epoch": 0.05999077065066913,
+      "grad_norm": 0.26271867752075195,
+      "learning_rate": 0.00011995386389850058,
+      "loss": 1.0339,
+      "step": 1040
+    },
+    {
+      "epoch": 0.060279187817258884,
+      "grad_norm": 0.2515929043292999,
+      "learning_rate": 0.00012053056516724337,
+      "loss": 0.9777,
+      "step": 1045
+    },
+    {
+      "epoch": 0.06056760498384864,
+      "grad_norm": 0.24450047314167023,
+      "learning_rate": 0.00012110726643598615,
+      "loss": 0.9781,
+      "step": 1050
+    },
+    {
+      "epoch": 0.060856022150438394,
+      "grad_norm": 0.247002974152565,
+      "learning_rate": 0.00012168396770472896,
+      "loss": 0.9742,
+      "step": 1055
+    },
+    {
+      "epoch": 0.06114443931702815,
+      "grad_norm": 0.22039633989334106,
+      "learning_rate": 0.00012226066897347174,
+      "loss": 0.9602,
+      "step": 1060
+    },
+    {
+      "epoch": 0.061432856483617904,
+      "grad_norm": 0.25299662351608276,
+      "learning_rate": 0.00012283737024221453,
+      "loss": 0.9429,
+      "step": 1065
+    },
+    {
+      "epoch": 0.06172127365020766,
+      "grad_norm": 0.24021919071674347,
+      "learning_rate": 0.00012341407151095733,
+      "loss": 1.0543,
+      "step": 1070
+    },
+    {
+      "epoch": 0.062009690816797414,
+      "grad_norm": 0.2851802408695221,
+      "learning_rate": 0.00012399077277970013,
+      "loss": 1.0169,
+      "step": 1075
+    },
+    {
+      "epoch": 0.06229810798338717,
+      "grad_norm": 0.2532206177711487,
+      "learning_rate": 0.0001245674740484429,
+      "loss": 0.9388,
+      "step": 1080
+    },
+    {
+      "epoch": 0.06258652514997692,
+      "grad_norm": 0.2355235517024994,
+      "learning_rate": 0.0001251441753171857,
+      "loss": 0.9283,
+      "step": 1085
+    },
+    {
+      "epoch": 0.06287494231656668,
+      "grad_norm": 0.2673757076263428,
+      "learning_rate": 0.0001257208765859285,
+      "loss": 1.0022,
+      "step": 1090
+    },
+    {
+      "epoch": 0.06316335948315643,
+      "grad_norm": 0.22847038507461548,
+      "learning_rate": 0.0001262975778546713,
+      "loss": 0.9481,
+      "step": 1095
+    },
+    {
+      "epoch": 0.06345177664974619,
+      "grad_norm": 0.25772714614868164,
+      "learning_rate": 0.00012687427912341407,
+      "loss": 0.9909,
+      "step": 1100
+    },
+    {
+      "epoch": 0.06374019381633594,
+      "grad_norm": 0.238713800907135,
+      "learning_rate": 0.00012745098039215687,
+      "loss": 0.9379,
+      "step": 1105
+    },
+    {
+      "epoch": 0.0640286109829257,
+      "grad_norm": 0.24460141360759735,
+      "learning_rate": 0.00012802768166089967,
+      "loss": 0.9398,
+      "step": 1110
+    },
+    {
+      "epoch": 0.06431702814951545,
+      "grad_norm": 0.23570501804351807,
+      "learning_rate": 0.00012860438292964244,
+      "loss": 0.9292,
+      "step": 1115
+    },
+    {
+      "epoch": 0.06460544531610521,
+      "grad_norm": 0.26408931612968445,
+      "learning_rate": 0.00012918108419838524,
+      "loss": 1.026,
+      "step": 1120
+    },
+    {
+      "epoch": 0.06489386248269496,
+      "grad_norm": 0.2372530698776245,
+      "learning_rate": 0.00012975778546712804,
+      "loss": 0.9906,
+      "step": 1125
+    },
+    {
+      "epoch": 0.06518227964928472,
+      "grad_norm": 0.2314678579568863,
+      "learning_rate": 0.00013033448673587084,
+      "loss": 0.9447,
+      "step": 1130
+    },
+    {
+      "epoch": 0.06547069681587447,
+      "grad_norm": 0.25254136323928833,
+      "learning_rate": 0.0001309111880046136,
+      "loss": 1.0364,
+      "step": 1135
+    },
+    {
+      "epoch": 0.06575911398246424,
+      "grad_norm": 0.23922473192214966,
+      "learning_rate": 0.0001314878892733564,
+      "loss": 1.0091,
+      "step": 1140
+    },
+    {
+      "epoch": 0.066047531149054,
+      "grad_norm": 0.24500273168087006,
+      "learning_rate": 0.0001320645905420992,
+      "loss": 0.9951,
+      "step": 1145
+    },
+    {
+      "epoch": 0.06633594831564375,
+      "grad_norm": 0.23815661668777466,
+      "learning_rate": 0.000132641291810842,
+      "loss": 1.0065,
+      "step": 1150
+    },
+    {
+      "epoch": 0.06662436548223351,
+      "grad_norm": 0.26173415780067444,
+      "learning_rate": 0.00013321799307958477,
+      "loss": 1.0159,
+      "step": 1155
+    },
+    {
+      "epoch": 0.06691278264882326,
+      "grad_norm": 0.22709496319293976,
+      "learning_rate": 0.00013379469434832757,
+      "loss": 0.9121,
+      "step": 1160
+    },
+    {
+      "epoch": 0.06720119981541302,
+      "grad_norm": 0.2595439553260803,
+      "learning_rate": 0.00013437139561707037,
+      "loss": 1.0136,
+      "step": 1165
+    },
+    {
+      "epoch": 0.06748961698200277,
+      "grad_norm": 0.23945558071136475,
+      "learning_rate": 0.00013494809688581317,
+      "loss": 0.9508,
+      "step": 1170
+    },
+    {
+      "epoch": 0.06777803414859253,
+      "grad_norm": 0.2526959478855133,
+      "learning_rate": 0.00013552479815455594,
+      "loss": 0.9304,
+      "step": 1175
+    },
+    {
+      "epoch": 0.06806645131518228,
+      "grad_norm": 0.2385508418083191,
+      "learning_rate": 0.00013610149942329874,
+      "loss": 1.012,
+      "step": 1180
+    },
+    {
+      "epoch": 0.06835486848177204,
+      "grad_norm": 0.25558724999427795,
+      "learning_rate": 0.00013667820069204154,
+      "loss": 1.0289,
+      "step": 1185
+    },
+    {
+      "epoch": 0.0686432856483618,
+      "grad_norm": 0.26076334714889526,
+      "learning_rate": 0.0001372549019607843,
+      "loss": 0.9564,
+      "step": 1190
+    },
+    {
+      "epoch": 0.06893170281495155,
+      "grad_norm": 0.24157829582691193,
+      "learning_rate": 0.0001378316032295271,
+      "loss": 1.0265,
+      "step": 1195
+    },
+    {
+      "epoch": 0.0692201199815413,
+      "grad_norm": 0.2505204379558563,
+      "learning_rate": 0.00013840830449826988,
+      "loss": 0.965,
+      "step": 1200
+    },
+    {
+      "epoch": 0.06950853714813106,
+      "grad_norm": 0.2583898603916168,
+      "learning_rate": 0.0001389850057670127,
+      "loss": 1.0161,
+      "step": 1205
+    },
+    {
+      "epoch": 0.06979695431472081,
+      "grad_norm": 0.24660265445709229,
+      "learning_rate": 0.00013956170703575548,
+      "loss": 1.0086,
+      "step": 1210
+    },
+    {
+      "epoch": 0.07008537148131057,
+      "grad_norm": 0.2303483486175537,
+      "learning_rate": 0.00014013840830449828,
+      "loss": 1.0004,
+      "step": 1215
+    },
+    {
+      "epoch": 0.07037378864790032,
+      "grad_norm": 0.25441575050354004,
+      "learning_rate": 0.00014071510957324108,
+      "loss": 1.0218,
+      "step": 1220
+    },
+    {
+      "epoch": 0.07066220581449008,
+      "grad_norm": 0.2441866099834442,
+      "learning_rate": 0.00014129181084198387,
+      "loss": 0.9947,
+      "step": 1225
+    },
+    {
+      "epoch": 0.07095062298107983,
+      "grad_norm": 0.2431473582983017,
+      "learning_rate": 0.00014186851211072665,
+      "loss": 0.977,
+      "step": 1230
+    },
+    {
+      "epoch": 0.07123904014766959,
+      "grad_norm": 0.22348998486995697,
+      "learning_rate": 0.00014244521337946944,
+      "loss": 0.9626,
+      "step": 1235
+    },
+    {
+      "epoch": 0.07152745731425934,
+      "grad_norm": 0.25038719177246094,
+      "learning_rate": 0.00014302191464821224,
+      "loss": 1.0234,
+      "step": 1240
+    },
+    {
+      "epoch": 0.0718158744808491,
+      "grad_norm": 0.24543331563472748,
+      "learning_rate": 0.00014359861591695501,
+      "loss": 0.9782,
+      "step": 1245
+    },
+    {
+      "epoch": 0.07210429164743885,
+      "grad_norm": 0.2646369934082031,
+      "learning_rate": 0.0001441753171856978,
+      "loss": 1.0049,
+      "step": 1250
+    },
+    {
+      "epoch": 0.07239270881402861,
+      "grad_norm": 0.24707183241844177,
+      "learning_rate": 0.00014475201845444058,
+      "loss": 1.0426,
+      "step": 1255
+    },
+    {
+      "epoch": 0.07268112598061836,
+      "grad_norm": 0.24609191715717316,
+      "learning_rate": 0.0001453287197231834,
+      "loss": 0.9978,
+      "step": 1260
+    },
+    {
+      "epoch": 0.07296954314720812,
+      "grad_norm": 0.2498229593038559,
+      "learning_rate": 0.00014590542099192618,
+      "loss": 1.0299,
+      "step": 1265
+    },
+    {
+      "epoch": 0.07325796031379787,
+      "grad_norm": 0.24294817447662354,
+      "learning_rate": 0.00014648212226066898,
+      "loss": 0.9387,
+      "step": 1270
+    },
+    {
+      "epoch": 0.07354637748038763,
+      "grad_norm": 0.22789110243320465,
+      "learning_rate": 0.00014705882352941178,
+      "loss": 0.9859,
+      "step": 1275
+    },
+    {
+      "epoch": 0.07383479464697738,
+      "grad_norm": 0.2392035871744156,
+      "learning_rate": 0.00014763552479815458,
+      "loss": 0.9821,
+      "step": 1280
+    },
+    {
+      "epoch": 0.07412321181356714,
+      "grad_norm": 0.24138358235359192,
+      "learning_rate": 0.00014821222606689735,
+      "loss": 0.9644,
+      "step": 1285
+    },
+    {
+      "epoch": 0.0744116289801569,
+      "grad_norm": 0.2574746012687683,
+      "learning_rate": 0.00014878892733564015,
+      "loss": 0.9894,
+      "step": 1290
+    },
+    {
+      "epoch": 0.07470004614674665,
+      "grad_norm": 0.2577558755874634,
+      "learning_rate": 0.00014936562860438295,
+      "loss": 1.0049,
+      "step": 1295
+    },
+    {
+      "epoch": 0.0749884633133364,
+      "grad_norm": 0.2638446092605591,
+      "learning_rate": 0.00014994232987312572,
+      "loss": 0.9866,
+      "step": 1300
+    },
+    {
+      "epoch": 0.07527688047992616,
+      "grad_norm": 0.2279583364725113,
+      "learning_rate": 0.00015051903114186852,
+      "loss": 0.9697,
+      "step": 1305
+    },
+    {
+      "epoch": 0.07556529764651591,
+      "grad_norm": 0.25132206082344055,
+      "learning_rate": 0.0001510957324106113,
+      "loss": 0.9654,
+      "step": 1310
+    },
+    {
+      "epoch": 0.07585371481310568,
+      "grad_norm": 0.24250829219818115,
+      "learning_rate": 0.00015167243367935411,
+      "loss": 0.9594,
+      "step": 1315
+    },
+    {
+      "epoch": 0.07614213197969544,
+      "grad_norm": 0.24679099023342133,
+      "learning_rate": 0.00015224913494809689,
+      "loss": 0.9514,
+      "step": 1320
+    },
+    {
+      "epoch": 0.07643054914628519,
+      "grad_norm": 0.26517555117607117,
+      "learning_rate": 0.00015282583621683968,
+      "loss": 0.9575,
+      "step": 1325
+    },
+    {
+      "epoch": 0.07671896631287495,
+      "grad_norm": 0.23794426023960114,
+      "learning_rate": 0.00015340253748558246,
+      "loss": 0.9982,
+      "step": 1330
+    },
+    {
+      "epoch": 0.0770073834794647,
+      "grad_norm": 0.2488831728696823,
+      "learning_rate": 0.00015397923875432528,
+      "loss": 0.9454,
+      "step": 1335
+    },
+    {
+      "epoch": 0.07729580064605446,
+      "grad_norm": 0.26782914996147156,
+      "learning_rate": 0.00015455594002306805,
+      "loss": 1.0235,
+      "step": 1340
+    },
+    {
+      "epoch": 0.07758421781264421,
+      "grad_norm": 0.25021234154701233,
+      "learning_rate": 0.00015513264129181085,
+      "loss": 0.9243,
+      "step": 1345
+    },
+    {
+      "epoch": 0.07787263497923397,
+      "grad_norm": 0.2522822618484497,
+      "learning_rate": 0.00015570934256055365,
+      "loss": 1.0428,
+      "step": 1350
+    },
+    {
+      "epoch": 0.07816105214582372,
+      "grad_norm": 0.27001574635505676,
+      "learning_rate": 0.00015628604382929645,
+      "loss": 0.9755,
+      "step": 1355
+    },
+    {
+      "epoch": 0.07844946931241348,
+      "grad_norm": 0.24071645736694336,
+      "learning_rate": 0.00015686274509803922,
+      "loss": 1.013,
+      "step": 1360
+    },
+    {
+      "epoch": 0.07873788647900323,
+      "grad_norm": 0.24303098022937775,
+      "learning_rate": 0.00015743944636678202,
+      "loss": 0.9862,
+      "step": 1365
+    },
+    {
+      "epoch": 0.07902630364559299,
+      "grad_norm": 0.2542005479335785,
+      "learning_rate": 0.00015801614763552482,
+      "loss": 0.9709,
+      "step": 1370
+    },
+    {
+      "epoch": 0.07931472081218274,
+      "grad_norm": 0.2585870325565338,
+      "learning_rate": 0.0001585928489042676,
+      "loss": 1.0085,
+      "step": 1375
+    },
+    {
+      "epoch": 0.0796031379787725,
+      "grad_norm": 0.2629243731498718,
+      "learning_rate": 0.0001591695501730104,
+      "loss": 0.985,
+      "step": 1380
+    },
+    {
+      "epoch": 0.07989155514536225,
+      "grad_norm": 0.24008338153362274,
+      "learning_rate": 0.00015974625144175316,
+      "loss": 0.9839,
+      "step": 1385
+    },
+    {
+      "epoch": 0.08017997231195201,
+      "grad_norm": 0.2442033439874649,
+      "learning_rate": 0.00016032295271049598,
+      "loss": 0.8798,
+      "step": 1390
+    },
+    {
+      "epoch": 0.08046838947854176,
+      "grad_norm": 0.250362366437912,
+      "learning_rate": 0.00016089965397923876,
+      "loss": 0.9301,
+      "step": 1395
+    },
+    {
+      "epoch": 0.08075680664513152,
+      "grad_norm": 0.2477293759584427,
+      "learning_rate": 0.00016147635524798155,
+      "loss": 0.9561,
+      "step": 1400
+    },
+    {
+      "epoch": 0.08104522381172127,
+      "grad_norm": 0.23329582810401917,
+      "learning_rate": 0.00016205305651672435,
+      "loss": 0.9505,
+      "step": 1405
+    },
+    {
+      "epoch": 0.08133364097831103,
+      "grad_norm": 0.24549901485443115,
+      "learning_rate": 0.00016262975778546715,
+      "loss": 1.0284,
+      "step": 1410
+    },
+    {
+      "epoch": 0.08162205814490078,
+      "grad_norm": 0.24419653415679932,
+      "learning_rate": 0.00016320645905420992,
+      "loss": 0.9114,
+      "step": 1415
+    },
+    {
+      "epoch": 0.08191047531149054,
+      "grad_norm": 0.24551044404506683,
+      "learning_rate": 0.00016378316032295272,
+      "loss": 0.9574,
+      "step": 1420
+    },
+    {
+      "epoch": 0.0821988924780803,
+      "grad_norm": 0.29641515016555786,
+      "learning_rate": 0.00016435986159169552,
+      "loss": 0.9821,
+      "step": 1425
+    },
+    {
+      "epoch": 0.08248730964467005,
+      "grad_norm": 0.24953129887580872,
+      "learning_rate": 0.0001649365628604383,
+      "loss": 0.9966,
+      "step": 1430
+    },
+    {
+      "epoch": 0.0827757268112598,
+      "grad_norm": 0.25181591510772705,
+      "learning_rate": 0.0001655132641291811,
+      "loss": 1.023,
+      "step": 1435
+    },
+    {
+      "epoch": 0.08306414397784956,
+      "grad_norm": 0.2478877305984497,
+      "learning_rate": 0.00016608996539792386,
+      "loss": 0.9762,
+      "step": 1440
+    },
+    {
+      "epoch": 0.08335256114443931,
+      "grad_norm": 0.24414442479610443,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.9339,
+      "step": 1445
+    },
+    {
+      "epoch": 0.08364097831102907,
+      "grad_norm": 0.24295495450496674,
+      "learning_rate": 0.00016724336793540946,
+      "loss": 1.0144,
+      "step": 1450
+    },
+    {
+      "epoch": 0.08392939547761882,
+      "grad_norm": 0.25291165709495544,
+      "learning_rate": 0.00016782006920415226,
+      "loss": 0.916,
+      "step": 1455
+    },
+    {
+      "epoch": 0.08421781264420858,
+      "grad_norm": 0.23744194209575653,
+      "learning_rate": 0.00016839677047289503,
+      "loss": 0.952,
+      "step": 1460
+    },
+    {
+      "epoch": 0.08450622981079833,
+      "grad_norm": 0.24316394329071045,
+      "learning_rate": 0.00016897347174163786,
+      "loss": 0.9725,
+      "step": 1465
+    },
+    {
+      "epoch": 0.08479464697738809,
+      "grad_norm": 0.23748493194580078,
+      "learning_rate": 0.00016955017301038063,
+      "loss": 0.9831,
+      "step": 1470
+    },
+    {
+      "epoch": 0.08508306414397784,
+      "grad_norm": 0.25356602668762207,
+      "learning_rate": 0.00017012687427912343,
+      "loss": 0.9632,
+      "step": 1475
+    },
+    {
+      "epoch": 0.0853714813105676,
+      "grad_norm": 0.24660415947437286,
+      "learning_rate": 0.00017070357554786622,
+      "loss": 0.9319,
+      "step": 1480
+    },
+    {
+      "epoch": 0.08565989847715735,
+      "grad_norm": 0.25426214933395386,
+      "learning_rate": 0.000171280276816609,
+      "loss": 1.0245,
+      "step": 1485
+    },
+    {
+      "epoch": 0.08594831564374712,
+      "grad_norm": 0.23765899240970612,
+      "learning_rate": 0.0001718569780853518,
+      "loss": 0.9202,
+      "step": 1490
+    },
+    {
+      "epoch": 0.08623673281033688,
+      "grad_norm": 0.24204228818416595,
+      "learning_rate": 0.00017243367935409457,
+      "loss": 0.9974,
+      "step": 1495
+    },
+    {
+      "epoch": 0.08652514997692663,
+      "grad_norm": 0.23034018278121948,
+      "learning_rate": 0.0001730103806228374,
+      "loss": 0.9251,
+      "step": 1500
+    },
+    {
+      "epoch": 0.08681356714351639,
+      "grad_norm": 0.24768561124801636,
+      "learning_rate": 0.00017358708189158016,
+      "loss": 0.957,
+      "step": 1505
+    },
+    {
+      "epoch": 0.08710198431010614,
+      "grad_norm": 0.24252378940582275,
+      "learning_rate": 0.00017416378316032296,
+      "loss": 0.9347,
+      "step": 1510
+    },
+    {
+      "epoch": 0.0873904014766959,
+      "grad_norm": 0.24422116577625275,
+      "learning_rate": 0.00017474048442906573,
+      "loss": 0.956,
+      "step": 1515
+    },
+    {
+      "epoch": 0.08767881864328565,
+      "grad_norm": 0.25470009446144104,
+      "learning_rate": 0.00017531718569780856,
+      "loss": 0.9355,
+      "step": 1520
+    },
+    {
+      "epoch": 0.08796723580987541,
+      "grad_norm": 0.240427628159523,
+      "learning_rate": 0.00017589388696655133,
+      "loss": 1.0345,
+      "step": 1525
+    },
+    {
+      "epoch": 0.08825565297646516,
+      "grad_norm": 0.2679055631160736,
+      "learning_rate": 0.00017647058823529413,
+      "loss": 1.0215,
+      "step": 1530
+    },
+    {
+      "epoch": 0.08854407014305492,
+      "grad_norm": 0.2706778943538666,
+      "learning_rate": 0.00017704728950403693,
+      "loss": 0.9951,
+      "step": 1535
+    },
+    {
+      "epoch": 0.08883248730964467,
+      "grad_norm": 0.24882011115550995,
+      "learning_rate": 0.00017762399077277973,
+      "loss": 1.0267,
+      "step": 1540
+    },
+    {
+      "epoch": 0.08912090447623443,
+      "grad_norm": 0.24369126558303833,
+      "learning_rate": 0.0001782006920415225,
+      "loss": 1.046,
+      "step": 1545
+    },
+    {
+      "epoch": 0.08940932164282418,
+      "grad_norm": 0.27035751938819885,
+      "learning_rate": 0.0001787773933102653,
+      "loss": 1.0522,
+      "step": 1550
+    },
+    {
+      "epoch": 0.08969773880941394,
+      "grad_norm": 0.25707873702049255,
+      "learning_rate": 0.0001793540945790081,
+      "loss": 0.9507,
+      "step": 1555
+    },
+    {
+      "epoch": 0.08998615597600369,
+      "grad_norm": 0.26456013321876526,
+      "learning_rate": 0.00017993079584775087,
+      "loss": 0.9941,
+      "step": 1560
+    },
+    {
+      "epoch": 0.09027457314259345,
+      "grad_norm": 0.26937803626060486,
+      "learning_rate": 0.00018050749711649367,
+      "loss": 1.0267,
+      "step": 1565
+    },
+    {
+      "epoch": 0.0905629903091832,
+      "grad_norm": 0.2615615725517273,
+      "learning_rate": 0.00018108419838523644,
+      "loss": 0.984,
+      "step": 1570
+    },
+    {
+      "epoch": 0.09085140747577296,
+      "grad_norm": 0.23720060288906097,
+      "learning_rate": 0.00018166089965397926,
+      "loss": 0.9401,
+      "step": 1575
+    },
+    {
+      "epoch": 0.09113982464236271,
+      "grad_norm": 0.24640457332134247,
+      "learning_rate": 0.00018223760092272203,
+      "loss": 1.086,
+      "step": 1580
+    },
+    {
+      "epoch": 0.09142824180895247,
+      "grad_norm": 0.2521013915538788,
+      "learning_rate": 0.00018281430219146483,
+      "loss": 0.9619,
+      "step": 1585
+    },
+    {
+      "epoch": 0.09171665897554222,
+      "grad_norm": 0.23948408663272858,
+      "learning_rate": 0.0001833910034602076,
+      "loss": 0.9835,
+      "step": 1590
+    },
+    {
+      "epoch": 0.09200507614213198,
+      "grad_norm": 0.25325456261634827,
+      "learning_rate": 0.00018396770472895043,
+      "loss": 1.0552,
+      "step": 1595
+    },
+    {
+      "epoch": 0.09229349330872173,
+      "grad_norm": 0.24731087684631348,
+      "learning_rate": 0.0001845444059976932,
+      "loss": 0.9253,
+      "step": 1600
+    },
+    {
+      "epoch": 0.09258191047531149,
+      "grad_norm": 0.26164206862449646,
+      "learning_rate": 0.000185121107266436,
+      "loss": 0.9396,
+      "step": 1605
+    },
+    {
+      "epoch": 0.09287032764190124,
+      "grad_norm": 0.25318196415901184,
+      "learning_rate": 0.0001856978085351788,
+      "loss": 0.9431,
+      "step": 1610
+    },
+    {
+      "epoch": 0.093158744808491,
+      "grad_norm": 0.2592536211013794,
+      "learning_rate": 0.00018627450980392157,
+      "loss": 0.9955,
+      "step": 1615
+    },
+    {
+      "epoch": 0.09344716197508075,
+      "grad_norm": 0.2497592270374298,
+      "learning_rate": 0.00018685121107266437,
+      "loss": 0.9844,
+      "step": 1620
+    },
+    {
+      "epoch": 0.09373557914167051,
+      "grad_norm": 0.2648375630378723,
+      "learning_rate": 0.00018742791234140714,
+      "loss": 0.9655,
+      "step": 1625
+    },
+    {
+      "epoch": 0.09402399630826026,
+      "grad_norm": 0.25172188878059387,
+      "learning_rate": 0.00018800461361014997,
+      "loss": 1.0322,
+      "step": 1630
+    },
+    {
+      "epoch": 0.09431241347485002,
+      "grad_norm": 0.24844340980052948,
+      "learning_rate": 0.00018858131487889274,
+      "loss": 0.9636,
+      "step": 1635
+    },
+    {
+      "epoch": 0.09460083064143977,
+      "grad_norm": 0.25023674964904785,
+      "learning_rate": 0.00018915801614763554,
+      "loss": 0.9601,
+      "step": 1640
+    },
+    {
+      "epoch": 0.09488924780802953,
+      "grad_norm": 0.2417484074831009,
+      "learning_rate": 0.0001897347174163783,
+      "loss": 0.9748,
+      "step": 1645
+    },
+    {
+      "epoch": 0.09517766497461928,
+      "grad_norm": 0.2597021162509918,
+      "learning_rate": 0.00019031141868512113,
+      "loss": 0.9672,
+      "step": 1650
+    },
+    {
+      "epoch": 0.09546608214120904,
+      "grad_norm": 0.25209182500839233,
+      "learning_rate": 0.0001908881199538639,
+      "loss": 0.9766,
+      "step": 1655
+    },
+    {
+      "epoch": 0.0957544993077988,
+      "grad_norm": 0.2704354226589203,
+      "learning_rate": 0.0001914648212226067,
+      "loss": 0.9658,
+      "step": 1660
+    },
+    {
+      "epoch": 0.09604291647438856,
+      "grad_norm": 0.2553963363170624,
+      "learning_rate": 0.00019204152249134948,
+      "loss": 0.972,
+      "step": 1665
+    },
+    {
+      "epoch": 0.09633133364097832,
+      "grad_norm": 0.25183454155921936,
+      "learning_rate": 0.00019261822376009227,
+      "loss": 0.9312,
+      "step": 1670
+    },
+    {
+      "epoch": 0.09661975080756807,
+      "grad_norm": 0.27272742986679077,
+      "learning_rate": 0.00019319492502883507,
+      "loss": 1.0585,
+      "step": 1675
+    },
+    {
+      "epoch": 0.09690816797415783,
+      "grad_norm": 0.25347381830215454,
+      "learning_rate": 0.00019377162629757784,
+      "loss": 1.0013,
+      "step": 1680
+    },
+    {
+      "epoch": 0.09719658514074758,
+      "grad_norm": 0.26412150263786316,
+      "learning_rate": 0.00019434832756632067,
+      "loss": 0.9175,
+      "step": 1685
+    },
+    {
+      "epoch": 0.09748500230733734,
+      "grad_norm": 0.2841266393661499,
+      "learning_rate": 0.00019492502883506344,
+      "loss": 0.8907,
+      "step": 1690
+    },
+    {
+      "epoch": 0.09777341947392709,
+      "grad_norm": 0.2843879163265228,
+      "learning_rate": 0.00019550173010380624,
+      "loss": 0.9952,
+      "step": 1695
+    },
+    {
+      "epoch": 0.09806183664051685,
+      "grad_norm": 0.24573901295661926,
+      "learning_rate": 0.000196078431372549,
+      "loss": 1.0093,
+      "step": 1700
+    },
+    {
+      "epoch": 0.0983502538071066,
+      "grad_norm": 0.25996410846710205,
+      "learning_rate": 0.00019665513264129184,
+      "loss": 1.0403,
+      "step": 1705
+    },
+    {
+      "epoch": 0.09863867097369636,
+      "grad_norm": 0.26386144757270813,
+      "learning_rate": 0.0001972318339100346,
+      "loss": 1.0211,
+      "step": 1710
+    },
+    {
+      "epoch": 0.09892708814028611,
+      "grad_norm": 0.26584669947624207,
+      "learning_rate": 0.0001978085351787774,
+      "loss": 0.9985,
+      "step": 1715
+    },
+    {
+      "epoch": 0.09921550530687587,
+      "grad_norm": 0.25835517048835754,
+      "learning_rate": 0.00019838523644752018,
+      "loss": 0.9615,
+      "step": 1720
+    },
+    {
+      "epoch": 0.09950392247346562,
+      "grad_norm": 0.2537446618080139,
+      "learning_rate": 0.000198961937716263,
+      "loss": 0.9851,
+      "step": 1725
+    },
+    {
+      "epoch": 0.09979233964005538,
+      "grad_norm": 0.2637675702571869,
+      "learning_rate": 0.00019953863898500578,
+      "loss": 0.9991,
+      "step": 1730
+    },
+    {
+      "epoch": 0.10008075680664513,
+      "grad_norm": 0.2486466020345688,
+      "learning_rate": 0.00019999999797274117,
+      "loss": 0.928,
+      "step": 1735
+    },
+    {
+      "epoch": 0.10036917397323489,
+      "grad_norm": 0.31705260276794434,
+      "learning_rate": 0.0001999999270186907,
+      "loss": 0.9909,
+      "step": 1740
+    },
+    {
+      "epoch": 0.10065759113982464,
+      "grad_norm": 0.2822314500808716,
+      "learning_rate": 0.0001999997547017808,
+      "loss": 0.9688,
+      "step": 1745
+    },
+    {
+      "epoch": 0.1009460083064144,
+      "grad_norm": 0.2564781606197357,
+      "learning_rate": 0.0001999994810221862,
+      "loss": 0.9515,
+      "step": 1750
+    },
+    {
+      "epoch": 0.10123442547300415,
+      "grad_norm": 0.2958817183971405,
+      "learning_rate": 0.00019999910598018426,
+      "loss": 0.9859,
+      "step": 1755
+    },
+    {
+      "epoch": 0.10152284263959391,
+      "grad_norm": 0.25060567259788513,
+      "learning_rate": 0.00019999862957615513,
+      "loss": 1.0043,
+      "step": 1760
+    },
+    {
+      "epoch": 0.10181125980618366,
+      "grad_norm": 0.2674092650413513,
+      "learning_rate": 0.00019999805181058176,
+      "loss": 0.9626,
+      "step": 1765
+    },
+    {
+      "epoch": 0.10209967697277342,
+      "grad_norm": 0.2575248181819916,
+      "learning_rate": 0.00019999737268404973,
+      "loss": 1.0265,
+      "step": 1770
+    },
+    {
+      "epoch": 0.10238809413936317,
+      "grad_norm": 0.2554805278778076,
+      "learning_rate": 0.00019999659219724749,
+      "loss": 0.9661,
+      "step": 1775
+    },
+    {
+      "epoch": 0.10267651130595293,
+      "grad_norm": 0.26680126786231995,
+      "learning_rate": 0.00019999571035096608,
+      "loss": 1.0231,
+      "step": 1780
+    },
+    {
+      "epoch": 0.10296492847254268,
+      "grad_norm": 0.25776219367980957,
+      "learning_rate": 0.00019999472714609943,
+      "loss": 0.9058,
+      "step": 1785
+    },
+    {
+      "epoch": 0.10325334563913244,
+      "grad_norm": 0.2542843818664551,
+      "learning_rate": 0.00019999364258364413,
+      "loss": 0.9773,
+      "step": 1790
+    },
+    {
+      "epoch": 0.10354176280572219,
+      "grad_norm": 0.2621992826461792,
+      "learning_rate": 0.0001999924566646995,
+      "loss": 0.9559,
+      "step": 1795
+    },
+    {
+      "epoch": 0.10383017997231195,
+      "grad_norm": 0.2683923840522766,
+      "learning_rate": 0.00019999116939046764,
+      "loss": 1.0355,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1041185971389017,
+      "grad_norm": 0.24701032042503357,
+      "learning_rate": 0.0001999897807622534,
+      "loss": 1.0906,
+      "step": 1805
+    },
+    {
+      "epoch": 0.10440701430549146,
+      "grad_norm": 0.25396963953971863,
+      "learning_rate": 0.0001999882907814643,
+      "loss": 1.0226,
+      "step": 1810
+    },
+    {
+      "epoch": 0.10469543147208121,
+      "grad_norm": 0.28205832839012146,
+      "learning_rate": 0.00019998669944961062,
+      "loss": 0.9224,
+      "step": 1815
+    },
+    {
+      "epoch": 0.10498384863867097,
+      "grad_norm": 0.26078683137893677,
+      "learning_rate": 0.0001999850067683054,
+      "loss": 0.9427,
+      "step": 1820
+    },
+    {
+      "epoch": 0.10527226580526072,
+      "grad_norm": 0.25481727719306946,
+      "learning_rate": 0.00019998321273926437,
+      "loss": 1.0042,
+      "step": 1825
+    },
+    {
+      "epoch": 0.10556068297185048,
+      "grad_norm": 0.25570574402809143,
+      "learning_rate": 0.00019998131736430604,
+      "loss": 0.9722,
+      "step": 1830
+    },
+    {
+      "epoch": 0.10584910013844025,
+      "grad_norm": 0.2734397351741791,
+      "learning_rate": 0.00019997932064535158,
+      "loss": 1.001,
+      "step": 1835
+    },
+    {
+      "epoch": 0.10613751730503,
+      "grad_norm": 0.27242162823677063,
+      "learning_rate": 0.00019997722258442499,
+      "loss": 0.9647,
+      "step": 1840
+    },
+    {
+      "epoch": 0.10642593447161976,
+      "grad_norm": 0.2732183635234833,
+      "learning_rate": 0.00019997502318365286,
+      "loss": 0.9697,
+      "step": 1845
+    },
+    {
+      "epoch": 0.10671435163820951,
+      "grad_norm": 0.26898330450057983,
+      "learning_rate": 0.00019997272244526456,
+      "loss": 0.9284,
+      "step": 1850
+    },
+    {
+      "epoch": 0.10700276880479927,
+      "grad_norm": 0.2656812071800232,
+      "learning_rate": 0.00019997032037159224,
+      "loss": 1.0368,
+      "step": 1855
+    },
+    {
+      "epoch": 0.10729118597138902,
+      "grad_norm": 0.2728678584098816,
+      "learning_rate": 0.00019996781696507069,
+      "loss": 1.0147,
+      "step": 1860
+    },
+    {
+      "epoch": 0.10757960313797878,
+      "grad_norm": 0.2543455958366394,
+      "learning_rate": 0.00019996521222823743,
+      "loss": 0.954,
+      "step": 1865
+    },
+    {
+      "epoch": 0.10786802030456853,
+      "grad_norm": 0.27658751606941223,
+      "learning_rate": 0.00019996250616373268,
+      "loss": 0.9796,
+      "step": 1870
+    },
+    {
+      "epoch": 0.10815643747115829,
+      "grad_norm": 0.27136722207069397,
+      "learning_rate": 0.00019995969877429945,
+      "loss": 0.9125,
+      "step": 1875
+    },
+    {
+      "epoch": 0.10844485463774804,
+      "grad_norm": 0.2712014317512512,
+      "learning_rate": 0.0001999567900627833,
+      "loss": 1.0053,
+      "step": 1880
+    },
+    {
+      "epoch": 0.1087332718043378,
+      "grad_norm": 0.2740635573863983,
+      "learning_rate": 0.0001999537800321327,
+      "loss": 0.9951,
+      "step": 1885
+    },
+    {
+      "epoch": 0.10902168897092755,
+      "grad_norm": 0.26667481660842896,
+      "learning_rate": 0.0001999506686853986,
+      "loss": 1.0062,
+      "step": 1890
+    },
+    {
+      "epoch": 0.10931010613751731,
+      "grad_norm": 0.2604423463344574,
+      "learning_rate": 0.0001999474560257348,
+      "loss": 0.9852,
+      "step": 1895
+    },
+    {
+      "epoch": 0.10959852330410706,
+      "grad_norm": 0.27640554308891296,
+      "learning_rate": 0.00019994414205639775,
+      "loss": 0.959,
+      "step": 1900
+    },
+    {
+      "epoch": 0.10988694047069682,
+      "grad_norm": 0.25489839911460876,
+      "learning_rate": 0.00019994072678074655,
+      "loss": 0.9957,
+      "step": 1905
+    },
+    {
+      "epoch": 0.11017535763728657,
+      "grad_norm": 0.2796529233455658,
+      "learning_rate": 0.00019993721020224308,
+      "loss": 0.9418,
+      "step": 1910
+    },
+    {
+      "epoch": 0.11046377480387633,
+      "grad_norm": 0.2622373402118683,
+      "learning_rate": 0.00019993359232445176,
+      "loss": 0.9573,
+      "step": 1915
+    },
+    {
+      "epoch": 0.11075219197046608,
+      "grad_norm": 0.2514156997203827,
+      "learning_rate": 0.0001999298731510399,
+      "loss": 0.9373,
+      "step": 1920
+    },
+    {
+      "epoch": 0.11104060913705584,
+      "grad_norm": 0.2672327160835266,
+      "learning_rate": 0.00019992605268577727,
+      "loss": 0.9097,
+      "step": 1925
+    },
+    {
+      "epoch": 0.11132902630364559,
+      "grad_norm": 0.26772674918174744,
+      "learning_rate": 0.00019992213093253643,
+      "loss": 1.0108,
+      "step": 1930
+    },
+    {
+      "epoch": 0.11161744347023535,
+      "grad_norm": 0.2462950050830841,
+      "learning_rate": 0.00019991810789529257,
+      "loss": 1.0006,
+      "step": 1935
+    },
+    {
+      "epoch": 0.1119058606368251,
+      "grad_norm": 0.26759883761405945,
+      "learning_rate": 0.0001999139835781236,
+      "loss": 0.9758,
+      "step": 1940
+    },
+    {
+      "epoch": 0.11219427780341486,
+      "grad_norm": 0.2841535806655884,
+      "learning_rate": 0.00019990975798521,
+      "loss": 1.0408,
+      "step": 1945
+    },
+    {
+      "epoch": 0.11248269497000461,
+      "grad_norm": 0.2822214365005493,
+      "learning_rate": 0.00019990543112083503,
+      "loss": 0.9317,
+      "step": 1950
+    },
+    {
+      "epoch": 0.11277111213659437,
+      "grad_norm": 0.2670351564884186,
+      "learning_rate": 0.00019990100298938442,
+      "loss": 0.9536,
+      "step": 1955
+    },
+    {
+      "epoch": 0.11305952930318412,
+      "grad_norm": 0.27470991015434265,
+      "learning_rate": 0.00019989647359534672,
+      "loss": 1.0404,
+      "step": 1960
+    },
+    {
+      "epoch": 0.11334794646977388,
+      "grad_norm": 0.2892574071884155,
+      "learning_rate": 0.00019989184294331308,
+      "loss": 0.9912,
+      "step": 1965
+    },
+    {
+      "epoch": 0.11363636363636363,
+      "grad_norm": 0.28786224126815796,
+      "learning_rate": 0.0001998871110379772,
+      "loss": 1.048,
+      "step": 1970
+    },
+    {
+      "epoch": 0.11392478080295339,
+      "grad_norm": 0.2730783522129059,
+      "learning_rate": 0.0001998822778841355,
+      "loss": 1.0148,
+      "step": 1975
+    },
+    {
+      "epoch": 0.11421319796954314,
+      "grad_norm": 0.25908493995666504,
+      "learning_rate": 0.00019987734348668706,
+      "loss": 0.9237,
+      "step": 1980
+    },
+    {
+      "epoch": 0.1145016151361329,
+      "grad_norm": 0.2924931049346924,
+      "learning_rate": 0.00019987230785063344,
+      "loss": 1.0084,
+      "step": 1985
+    },
+    {
+      "epoch": 0.11479003230272265,
+      "grad_norm": 0.2685001790523529,
+      "learning_rate": 0.00019986717098107896,
+      "loss": 0.977,
+      "step": 1990
+    },
+    {
+      "epoch": 0.11507844946931241,
+      "grad_norm": 0.26407670974731445,
+      "learning_rate": 0.0001998619328832305,
+      "loss": 1.0132,
+      "step": 1995
+    },
+    {
+      "epoch": 0.11536686663590216,
+      "grad_norm": 0.2581160366535187,
+      "learning_rate": 0.00019985659356239758,
+      "loss": 1.0553,
+      "step": 2000
+    },
+    {
+      "epoch": 0.11565528380249192,
+      "grad_norm": 0.2579261064529419,
+      "learning_rate": 0.0001998511530239922,
+      "loss": 0.992,
+      "step": 2005
+    },
+    {
+      "epoch": 0.11594370096908169,
+      "grad_norm": 0.27874529361724854,
+      "learning_rate": 0.00019984561127352914,
+      "loss": 1.0208,
+      "step": 2010
+    },
+    {
+      "epoch": 0.11623211813567144,
+      "grad_norm": 0.2448752522468567,
+      "learning_rate": 0.00019983996831662566,
+      "loss": 1.0272,
+      "step": 2015
+    },
+    {
+      "epoch": 0.1165205353022612,
+      "grad_norm": 0.2515913248062134,
+      "learning_rate": 0.00019983422415900158,
+      "loss": 1.0251,
+      "step": 2020
+    },
+    {
+      "epoch": 0.11680895246885095,
+      "grad_norm": 0.2612157464027405,
+      "learning_rate": 0.0001998283788064794,
+      "loss": 0.9298,
+      "step": 2025
+    },
+    {
+      "epoch": 0.1170973696354407,
+      "grad_norm": 0.2781950533390045,
+      "learning_rate": 0.00019982243226498411,
+      "loss": 1.0191,
+      "step": 2030
+    },
+    {
+      "epoch": 0.11738578680203046,
+      "grad_norm": 0.27393776178359985,
+      "learning_rate": 0.00019981638454054333,
+      "loss": 0.8712,
+      "step": 2035
+    },
+    {
+      "epoch": 0.11767420396862022,
+      "grad_norm": 0.271932452917099,
+      "learning_rate": 0.00019981023563928716,
+      "loss": 0.9644,
+      "step": 2040
+    },
+    {
+      "epoch": 0.11796262113520997,
+      "grad_norm": 0.2659457325935364,
+      "learning_rate": 0.00019980398556744837,
+      "loss": 0.9295,
+      "step": 2045
+    },
+    {
+      "epoch": 0.11825103830179973,
+      "grad_norm": 0.2813827395439148,
+      "learning_rate": 0.00019979763433136216,
+      "loss": 0.975,
+      "step": 2050
+    },
+    {
+      "epoch": 0.11853945546838948,
+      "grad_norm": 0.24046528339385986,
+      "learning_rate": 0.00019979118193746637,
+      "loss": 0.9836,
+      "step": 2055
+    },
+    {
+      "epoch": 0.11882787263497924,
+      "grad_norm": 0.27069780230522156,
+      "learning_rate": 0.00019978462839230133,
+      "loss": 1.0503,
+      "step": 2060
+    },
+    {
+      "epoch": 0.11911628980156899,
+      "grad_norm": 0.2609676718711853,
+      "learning_rate": 0.00019977797370250986,
+      "loss": 0.959,
+      "step": 2065
+    },
+    {
+      "epoch": 0.11940470696815875,
+      "grad_norm": 0.2760465145111084,
+      "learning_rate": 0.0001997712178748374,
+      "loss": 1.0014,
+      "step": 2070
+    },
+    {
+      "epoch": 0.1196931241347485,
+      "grad_norm": 0.2539708614349365,
+      "learning_rate": 0.00019976436091613184,
+      "loss": 1.0215,
+      "step": 2075
+    },
+    {
+      "epoch": 0.11998154130133826,
+      "grad_norm": 0.27062153816223145,
+      "learning_rate": 0.0001997574028333436,
+      "loss": 0.964,
+      "step": 2080
+    },
+    {
+      "epoch": 0.12026995846792801,
+      "grad_norm": 0.26900675892829895,
+      "learning_rate": 0.00019975034363352556,
+      "loss": 0.935,
+      "step": 2085
+    },
+    {
+      "epoch": 0.12055837563451777,
+      "grad_norm": 0.27462172508239746,
+      "learning_rate": 0.0001997431833238332,
+      "loss": 0.974,
+      "step": 2090
+    },
+    {
+      "epoch": 0.12084679280110752,
+      "grad_norm": 0.3665010333061218,
+      "learning_rate": 0.00019973592191152437,
+      "loss": 1.0159,
+      "step": 2095
+    },
+    {
+      "epoch": 0.12113520996769728,
+      "grad_norm": 0.28900420665740967,
+      "learning_rate": 0.00019972855940395947,
+      "loss": 1.0202,
+      "step": 2100
+    },
+    {
+      "epoch": 0.12142362713428703,
+      "grad_norm": 0.2706412374973297,
+      "learning_rate": 0.00019972109580860132,
+      "loss": 0.9766,
+      "step": 2105
+    },
+    {
+      "epoch": 0.12171204430087679,
+      "grad_norm": 0.28748854994773865,
+      "learning_rate": 0.00019971353113301527,
+      "loss": 1.095,
+      "step": 2110
+    },
+    {
+      "epoch": 0.12200046146746654,
+      "grad_norm": 0.2745112180709839,
+      "learning_rate": 0.0001997058653848691,
+      "loss": 0.9995,
+      "step": 2115
+    },
+    {
+      "epoch": 0.1222888786340563,
+      "grad_norm": 0.27372869849205017,
+      "learning_rate": 0.00019969809857193306,
+      "loss": 0.9582,
+      "step": 2120
+    },
+    {
+      "epoch": 0.12257729580064605,
+      "grad_norm": 0.2714395821094513,
+      "learning_rate": 0.00019969023070207973,
+      "loss": 0.9423,
+      "step": 2125
+    },
+    {
+      "epoch": 0.12286571296723581,
+      "grad_norm": 0.26695722341537476,
+      "learning_rate": 0.0001996822617832843,
+      "loss": 0.9192,
+      "step": 2130
+    },
+    {
+      "epoch": 0.12315413013382556,
+      "grad_norm": 0.2779480814933777,
+      "learning_rate": 0.00019967419182362429,
+      "loss": 0.9577,
+      "step": 2135
+    },
+    {
+      "epoch": 0.12344254730041532,
+      "grad_norm": 0.279851496219635,
+      "learning_rate": 0.0001996660208312796,
+      "loss": 0.9946,
+      "step": 2140
+    },
+    {
+      "epoch": 0.12373096446700507,
+      "grad_norm": 0.2676329016685486,
+      "learning_rate": 0.00019965774881453263,
+      "loss": 1.0293,
+      "step": 2145
+    },
+    {
+      "epoch": 0.12401938163359483,
+      "grad_norm": 0.2577393054962158,
+      "learning_rate": 0.00019964937578176816,
+      "loss": 0.9845,
+      "step": 2150
+    },
+    {
+      "epoch": 0.12430779880018458,
+      "grad_norm": 0.2870205342769623,
+      "learning_rate": 0.00019964090174147327,
+      "loss": 0.9747,
+      "step": 2155
+    },
+    {
+      "epoch": 0.12459621596677434,
+      "grad_norm": 0.2597945034503937,
+      "learning_rate": 0.00019963232670223752,
+      "loss": 0.9896,
+      "step": 2160
+    },
+    {
+      "epoch": 0.12488463313336409,
+      "grad_norm": 0.3189765512943268,
+      "learning_rate": 0.00019962365067275286,
+      "loss": 0.9538,
+      "step": 2165
+    },
+    {
+      "epoch": 0.12517305029995385,
+      "grad_norm": 0.27205929160118103,
+      "learning_rate": 0.00019961487366181355,
+      "loss": 0.9626,
+      "step": 2170
+    },
+    {
+      "epoch": 0.1254614674665436,
+      "grad_norm": 0.26647019386291504,
+      "learning_rate": 0.0001996059956783162,
+      "loss": 1.0142,
+      "step": 2175
+    },
+    {
+      "epoch": 0.12574988463313336,
+      "grad_norm": 0.2724989652633667,
+      "learning_rate": 0.00019959701673125983,
+      "loss": 1.0228,
+      "step": 2180
+    },
+    {
+      "epoch": 0.1260383017997231,
+      "grad_norm": 0.27627307176589966,
+      "learning_rate": 0.00019958793682974574,
+      "loss": 0.9744,
+      "step": 2185
+    },
+    {
+      "epoch": 0.12632671896631287,
+      "grad_norm": 0.2836136221885681,
+      "learning_rate": 0.00019957875598297759,
+      "loss": 1.0011,
+      "step": 2190
+    },
+    {
+      "epoch": 0.12661513613290262,
+      "grad_norm": 0.26454490423202515,
+      "learning_rate": 0.00019956947420026136,
+      "loss": 1.0463,
+      "step": 2195
+    },
+    {
+      "epoch": 0.12690355329949238,
+      "grad_norm": 0.29074445366859436,
+      "learning_rate": 0.00019956009149100533,
+      "loss": 0.9643,
+      "step": 2200
+    },
+    {
+      "epoch": 0.12719197046608213,
+      "grad_norm": 0.2764613926410675,
+      "learning_rate": 0.00019955060786472012,
+      "loss": 0.9245,
+      "step": 2205
+    },
+    {
+      "epoch": 0.1274803876326719,
+      "grad_norm": 0.2702649235725403,
+      "learning_rate": 0.00019954102333101856,
+      "loss": 0.9734,
+      "step": 2210
+    },
+    {
+      "epoch": 0.12776880479926164,
+      "grad_norm": 0.28136304020881653,
+      "learning_rate": 0.00019953133789961584,
+      "loss": 0.9782,
+      "step": 2215
+    },
+    {
+      "epoch": 0.1280572219658514,
+      "grad_norm": 0.29559558629989624,
+      "learning_rate": 0.0001995215515803294,
+      "loss": 0.9708,
+      "step": 2220
+    },
+    {
+      "epoch": 0.12834563913244115,
+      "grad_norm": 0.2811656892299652,
+      "learning_rate": 0.00019951166438307894,
+      "loss": 0.9839,
+      "step": 2225
+    },
+    {
+      "epoch": 0.1286340562990309,
+      "grad_norm": 0.27432867884635925,
+      "learning_rate": 0.00019950167631788642,
+      "loss": 0.9697,
+      "step": 2230
+    },
+    {
+      "epoch": 0.12892247346562066,
+      "grad_norm": 0.28106796741485596,
+      "learning_rate": 0.000199491587394876,
+      "loss": 0.9526,
+      "step": 2235
+    },
+    {
+      "epoch": 0.12921089063221042,
+      "grad_norm": 0.2755594253540039,
+      "learning_rate": 0.00019948139762427416,
+      "loss": 0.9943,
+      "step": 2240
+    },
+    {
+      "epoch": 0.12949930779880017,
+      "grad_norm": 0.27341076731681824,
+      "learning_rate": 0.00019947110701640952,
+      "loss": 0.9661,
+      "step": 2245
+    },
+    {
+      "epoch": 0.12978772496538993,
+      "grad_norm": 0.2582038938999176,
+      "learning_rate": 0.000199460715581713,
+      "loss": 0.9083,
+      "step": 2250
+    },
+    {
+      "epoch": 0.13007614213197968,
+      "grad_norm": 0.2739073932170868,
+      "learning_rate": 0.00019945022333071752,
+      "loss": 1.0518,
+      "step": 2255
+    },
+    {
+      "epoch": 0.13036455929856944,
+      "grad_norm": 0.2646303176879883,
+      "learning_rate": 0.0001994396302740585,
+      "loss": 0.9709,
+      "step": 2260
+    },
+    {
+      "epoch": 0.1306529764651592,
+      "grad_norm": 0.2723826766014099,
+      "learning_rate": 0.00019942893642247326,
+      "loss": 0.9845,
+      "step": 2265
+    },
+    {
+      "epoch": 0.13094139363174895,
+      "grad_norm": 0.27351605892181396,
+      "learning_rate": 0.00019941814178680144,
+      "loss": 1.0138,
+      "step": 2270
+    },
+    {
+      "epoch": 0.13122981079833873,
+      "grad_norm": 0.2802083492279053,
+      "learning_rate": 0.00019940724637798477,
+      "loss": 0.9364,
+      "step": 2275
+    },
+    {
+      "epoch": 0.13151822796492849,
+      "grad_norm": 0.27607461810112,
+      "learning_rate": 0.00019939625020706724,
+      "loss": 0.9931,
+      "step": 2280
+    },
+    {
+      "epoch": 0.13180664513151824,
+      "grad_norm": 0.270385205745697,
+      "learning_rate": 0.0001993851532851948,
+      "loss": 0.9763,
+      "step": 2285
+    },
+    {
+      "epoch": 0.132095062298108,
+      "grad_norm": 0.2873282730579376,
+      "learning_rate": 0.00019937395562361564,
+      "loss": 1.0417,
+      "step": 2290
+    },
+    {
+      "epoch": 0.13238347946469775,
+      "grad_norm": 0.2726912796497345,
+      "learning_rate": 0.0001993626572336801,
+      "loss": 0.9555,
+      "step": 2295
+    },
+    {
+      "epoch": 0.1326718966312875,
+      "grad_norm": 0.2793363332748413,
+      "learning_rate": 0.00019935125812684047,
+      "loss": 0.9883,
+      "step": 2300
+    },
+    {
+      "epoch": 0.13296031379787726,
+      "grad_norm": 0.2792257070541382,
+      "learning_rate": 0.0001993397583146513,
+      "loss": 1.0003,
+      "step": 2305
+    },
+    {
+      "epoch": 0.13324873096446702,
+      "grad_norm": 0.27051353454589844,
+      "learning_rate": 0.00019932815780876904,
+      "loss": 0.9726,
+      "step": 2310
+    },
+    {
+      "epoch": 0.13353714813105677,
+      "grad_norm": 0.28619712591171265,
+      "learning_rate": 0.00019931645662095237,
+      "loss": 0.9621,
+      "step": 2315
+    },
+    {
+      "epoch": 0.13382556529764653,
+      "grad_norm": 0.27812543511390686,
+      "learning_rate": 0.00019930465476306197,
+      "loss": 0.9909,
+      "step": 2320
+    },
+    {
+      "epoch": 0.13411398246423628,
+      "grad_norm": 0.27520883083343506,
+      "learning_rate": 0.0001992927522470605,
+      "loss": 1.0185,
+      "step": 2325
+    },
+    {
+      "epoch": 0.13440239963082604,
+      "grad_norm": 0.27513301372528076,
+      "learning_rate": 0.00019928074908501272,
+      "loss": 0.9595,
+      "step": 2330
+    },
+    {
+      "epoch": 0.1346908167974158,
+      "grad_norm": 0.29639777541160583,
+      "learning_rate": 0.0001992686452890854,
+      "loss": 0.9819,
+      "step": 2335
+    },
+    {
+      "epoch": 0.13497923396400555,
+      "grad_norm": 0.2893521189689636,
+      "learning_rate": 0.00019925644087154734,
+      "loss": 0.9894,
+      "step": 2340
+    },
+    {
+      "epoch": 0.1352676511305953,
+      "grad_norm": 0.267421156167984,
+      "learning_rate": 0.0001992441358447692,
+      "loss": 0.9882,
+      "step": 2345
+    },
+    {
+      "epoch": 0.13555606829718506,
+      "grad_norm": 0.2774795591831207,
+      "learning_rate": 0.00019923173022122378,
+      "loss": 0.9404,
+      "step": 2350
+    },
+    {
+      "epoch": 0.1358444854637748,
+      "grad_norm": 0.30167555809020996,
+      "learning_rate": 0.00019921922401348576,
+      "loss": 0.9631,
+      "step": 2355
+    },
+    {
+      "epoch": 0.13613290263036457,
+      "grad_norm": 0.2823658287525177,
+      "learning_rate": 0.00019920661723423183,
+      "loss": 0.9271,
+      "step": 2360
+    },
+    {
+      "epoch": 0.13642131979695432,
+      "grad_norm": 0.2752264142036438,
+      "learning_rate": 0.00019919390989624054,
+      "loss": 0.981,
+      "step": 2365
+    },
+    {
+      "epoch": 0.13670973696354408,
+      "grad_norm": 0.284186989068985,
+      "learning_rate": 0.00019918110201239247,
+      "loss": 1.0279,
+      "step": 2370
+    },
+    {
+      "epoch": 0.13699815413013383,
+      "grad_norm": 0.2601034343242645,
+      "learning_rate": 0.00019916819359567001,
+      "loss": 1.0219,
+      "step": 2375
+    },
+    {
+      "epoch": 0.1372865712967236,
+      "grad_norm": 0.3391975164413452,
+      "learning_rate": 0.00019915518465915758,
+      "loss": 0.9432,
+      "step": 2380
+    },
+    {
+      "epoch": 0.13757498846331334,
+      "grad_norm": 0.3057229816913605,
+      "learning_rate": 0.0001991420752160414,
+      "loss": 1.0415,
+      "step": 2385
+    },
+    {
+      "epoch": 0.1378634056299031,
+      "grad_norm": 0.2857256829738617,
+      "learning_rate": 0.00019912886527960954,
+      "loss": 0.9896,
+      "step": 2390
+    },
+    {
+      "epoch": 0.13815182279649285,
+      "grad_norm": 0.4211989641189575,
+      "learning_rate": 0.00019911555486325203,
+      "loss": 1.0471,
+      "step": 2395
+    },
+    {
+      "epoch": 0.1384402399630826,
+      "grad_norm": 0.26847025752067566,
+      "learning_rate": 0.0001991021439804607,
+      "loss": 1.0071,
+      "step": 2400
+    },
+    {
+      "epoch": 0.13872865712967236,
+      "grad_norm": 0.27097341418266296,
+      "learning_rate": 0.00019908863264482917,
+      "loss": 0.9493,
+      "step": 2405
+    },
+    {
+      "epoch": 0.13901707429626212,
+      "grad_norm": 0.2873136103153229,
+      "learning_rate": 0.00019907502087005297,
+      "loss": 1.0064,
+      "step": 2410
+    },
+    {
+      "epoch": 0.13930549146285187,
+      "grad_norm": 0.2804831564426422,
+      "learning_rate": 0.00019906130866992935,
+      "loss": 0.9483,
+      "step": 2415
+    },
+    {
+      "epoch": 0.13959390862944163,
+      "grad_norm": 0.27144983410835266,
+      "learning_rate": 0.00019904749605835742,
+      "loss": 0.9541,
+      "step": 2420
+    },
+    {
+      "epoch": 0.13988232579603138,
+      "grad_norm": 0.2791461944580078,
+      "learning_rate": 0.00019903358304933805,
+      "loss": 1.0228,
+      "step": 2425
+    },
+    {
+      "epoch": 0.14017074296262114,
+      "grad_norm": 0.2839184105396271,
+      "learning_rate": 0.00019901956965697387,
+      "loss": 0.9853,
+      "step": 2430
+    },
+    {
+      "epoch": 0.1404591601292109,
+      "grad_norm": 0.2938236594200134,
+      "learning_rate": 0.0001990054558954693,
+      "loss": 1.0175,
+      "step": 2435
+    },
+    {
+      "epoch": 0.14074757729580065,
+      "grad_norm": 0.26195093989372253,
+      "learning_rate": 0.00019899124177913041,
+      "loss": 0.9927,
+      "step": 2440
+    },
+    {
+      "epoch": 0.1410359944623904,
+      "grad_norm": 0.282997727394104,
+      "learning_rate": 0.0001989769273223651,
+      "loss": 0.9148,
+      "step": 2445
+    },
+    {
+      "epoch": 0.14132441162898016,
+      "grad_norm": 0.2869815230369568,
+      "learning_rate": 0.00019896251253968288,
+      "loss": 0.9978,
+      "step": 2450
+    },
+    {
+      "epoch": 0.1416128287955699,
+      "grad_norm": 0.30306002497673035,
+      "learning_rate": 0.000198947997445695,
+      "loss": 0.9793,
+      "step": 2455
+    },
+    {
+      "epoch": 0.14190124596215967,
+      "grad_norm": 0.2726587951183319,
+      "learning_rate": 0.0001989333820551144,
+      "loss": 0.8918,
+      "step": 2460
+    },
+    {
+      "epoch": 0.14218966312874942,
+      "grad_norm": 0.3028129041194916,
+      "learning_rate": 0.00019891866638275564,
+      "loss": 1.0184,
+      "step": 2465
+    },
+    {
+      "epoch": 0.14247808029533918,
+      "grad_norm": 0.27245384454727173,
+      "learning_rate": 0.00019890385044353501,
+      "loss": 0.9187,
+      "step": 2470
+    },
+    {
+      "epoch": 0.14276649746192893,
+      "grad_norm": 0.26684272289276123,
+      "learning_rate": 0.00019888893425247032,
+      "loss": 0.94,
+      "step": 2475
+    },
+    {
+      "epoch": 0.1430549146285187,
+      "grad_norm": 0.26761725544929504,
+      "learning_rate": 0.00019887391782468113,
+      "loss": 0.9606,
+      "step": 2480
+    },
+    {
+      "epoch": 0.14334333179510844,
+      "grad_norm": 0.2789659798145294,
+      "learning_rate": 0.00019885880117538846,
+      "loss": 0.9361,
+      "step": 2485
+    },
+    {
+      "epoch": 0.1436317489616982,
+      "grad_norm": 0.2568376362323761,
+      "learning_rate": 0.000198843584319915,
+      "loss": 1.0155,
+      "step": 2490
+    },
+    {
+      "epoch": 0.14392016612828795,
+      "grad_norm": 0.29699787497520447,
+      "learning_rate": 0.00019882826727368508,
+      "loss": 1.0136,
+      "step": 2495
+    },
+    {
+      "epoch": 0.1442085832948777,
+      "grad_norm": 0.3011142313480377,
+      "learning_rate": 0.0001988128500522244,
+      "loss": 0.9967,
+      "step": 2500
+    },
+    {
+      "epoch": 0.14449700046146746,
+      "grad_norm": 0.27386248111724854,
+      "learning_rate": 0.00019879733267116035,
+      "loss": 1.0263,
+      "step": 2505
+    },
+    {
+      "epoch": 0.14478541762805722,
+      "grad_norm": 0.31453463435173035,
+      "learning_rate": 0.00019878171514622187,
+      "loss": 0.9307,
+      "step": 2510
+    },
+    {
+      "epoch": 0.14507383479464697,
+      "grad_norm": 0.2672314941883087,
+      "learning_rate": 0.0001987659974932392,
+      "loss": 0.9441,
+      "step": 2515
+    },
+    {
+      "epoch": 0.14536225196123673,
+      "grad_norm": 0.2847091257572174,
+      "learning_rate": 0.00019875017972814435,
+      "loss": 0.9868,
+      "step": 2520
+    },
+    {
+      "epoch": 0.14565066912782648,
+      "grad_norm": 0.28868651390075684,
+      "learning_rate": 0.0001987342618669706,
+      "loss": 0.9296,
+      "step": 2525
+    },
+    {
+      "epoch": 0.14593908629441624,
+      "grad_norm": 0.29168251156806946,
+      "learning_rate": 0.00019871824392585276,
+      "loss": 0.9317,
+      "step": 2530
+    },
+    {
+      "epoch": 0.146227503461006,
+      "grad_norm": 0.2743743062019348,
+      "learning_rate": 0.00019870212592102711,
+      "loss": 1.0277,
+      "step": 2535
+    },
+    {
+      "epoch": 0.14651592062759575,
+      "grad_norm": 0.2812393605709076,
+      "learning_rate": 0.00019868590786883134,
+      "loss": 1.0553,
+      "step": 2540
+    },
+    {
+      "epoch": 0.1468043377941855,
+      "grad_norm": 0.2678181231021881,
+      "learning_rate": 0.00019866958978570452,
+      "loss": 0.8821,
+      "step": 2545
+    },
+    {
+      "epoch": 0.14709275496077526,
+      "grad_norm": 0.3037974238395691,
+      "learning_rate": 0.00019865317168818713,
+      "loss": 0.9625,
+      "step": 2550
+    },
+    {
+      "epoch": 0.147381172127365,
+      "grad_norm": 0.2820071578025818,
+      "learning_rate": 0.00019863665359292108,
+      "loss": 1.0259,
+      "step": 2555
+    },
+    {
+      "epoch": 0.14766958929395477,
+      "grad_norm": 0.2591807544231415,
+      "learning_rate": 0.0001986200355166495,
+      "loss": 0.9521,
+      "step": 2560
+    },
+    {
+      "epoch": 0.14795800646054452,
+      "grad_norm": 0.26036834716796875,
+      "learning_rate": 0.0001986033174762171,
+      "loss": 0.94,
+      "step": 2565
+    },
+    {
+      "epoch": 0.14824642362713428,
+      "grad_norm": 0.27297431230545044,
+      "learning_rate": 0.0001985864994885697,
+      "loss": 0.9859,
+      "step": 2570
+    },
+    {
+      "epoch": 0.14853484079372403,
+      "grad_norm": 0.27806761860847473,
+      "learning_rate": 0.00019856958157075445,
+      "loss": 1.0,
+      "step": 2575
+    },
+    {
+      "epoch": 0.1488232579603138,
+      "grad_norm": 0.2749041020870209,
+      "learning_rate": 0.00019855256373991993,
+      "loss": 0.9111,
+      "step": 2580
+    },
+    {
+      "epoch": 0.14911167512690354,
+      "grad_norm": 0.28046393394470215,
+      "learning_rate": 0.0001985354460133159,
+      "loss": 0.9089,
+      "step": 2585
+    },
+    {
+      "epoch": 0.1494000922934933,
+      "grad_norm": 0.2683013379573822,
+      "learning_rate": 0.00019851822840829338,
+      "loss": 0.9122,
+      "step": 2590
+    },
+    {
+      "epoch": 0.14968850946008305,
+      "grad_norm": 0.28444692492485046,
+      "learning_rate": 0.0001985009109423046,
+      "loss": 0.9987,
+      "step": 2595
+    },
+    {
+      "epoch": 0.1499769266266728,
+      "grad_norm": 0.28526070713996887,
+      "learning_rate": 0.0001984834936329031,
+      "loss": 1.0177,
+      "step": 2600
+    },
+    {
+      "epoch": 0.15026534379326256,
+      "grad_norm": 0.2751544415950775,
+      "learning_rate": 0.00019846597649774358,
+      "loss": 1.0602,
+      "step": 2605
+    },
+    {
+      "epoch": 0.15055376095985232,
+      "grad_norm": 0.29558390378952026,
+      "learning_rate": 0.00019844835955458193,
+      "loss": 1.0015,
+      "step": 2610
+    },
+    {
+      "epoch": 0.15084217812644207,
+      "grad_norm": 0.27498286962509155,
+      "learning_rate": 0.00019843064282127511,
+      "loss": 0.9561,
+      "step": 2615
+    },
+    {
+      "epoch": 0.15113059529303183,
+      "grad_norm": 0.292961061000824,
+      "learning_rate": 0.00019841282631578145,
+      "loss": 0.9914,
+      "step": 2620
+    },
+    {
+      "epoch": 0.1514190124596216,
+      "grad_norm": 0.3029356896877289,
+      "learning_rate": 0.0001983949100561602,
+      "loss": 0.9801,
+      "step": 2625
+    },
+    {
+      "epoch": 0.15170742962621137,
+      "grad_norm": 0.2864689230918884,
+      "learning_rate": 0.00019837689406057183,
+      "loss": 0.9578,
+      "step": 2630
+    },
+    {
+      "epoch": 0.15199584679280112,
+      "grad_norm": 0.2750813961029053,
+      "learning_rate": 0.00019835877834727787,
+      "loss": 0.9483,
+      "step": 2635
+    },
+    {
+      "epoch": 0.15228426395939088,
+      "grad_norm": 0.27926185727119446,
+      "learning_rate": 0.00019834056293464093,
+      "loss": 1.0165,
+      "step": 2640
+    },
+    {
+      "epoch": 0.15257268112598063,
+      "grad_norm": 0.27533864974975586,
+      "learning_rate": 0.00019832224784112473,
+      "loss": 1.0241,
+      "step": 2645
+    },
+    {
+      "epoch": 0.15286109829257039,
+      "grad_norm": 0.276993989944458,
+      "learning_rate": 0.00019830383308529393,
+      "loss": 1.0444,
+      "step": 2650
+    },
+    {
+      "epoch": 0.15314951545916014,
+      "grad_norm": 0.2960858643054962,
+      "learning_rate": 0.0001982853186858143,
+      "loss": 0.9928,
+      "step": 2655
+    },
+    {
+      "epoch": 0.1534379326257499,
+      "grad_norm": 0.29162392020225525,
+      "learning_rate": 0.00019826670466145262,
+      "loss": 0.8887,
+      "step": 2660
+    },
+    {
+      "epoch": 0.15372634979233965,
+      "grad_norm": 0.2606879472732544,
+      "learning_rate": 0.0001982479910310765,
+      "loss": 0.9832,
+      "step": 2665
+    },
+    {
+      "epoch": 0.1540147669589294,
+      "grad_norm": 0.29048001766204834,
+      "learning_rate": 0.00019822917781365474,
+      "loss": 1.01,
+      "step": 2670
+    },
+    {
+      "epoch": 0.15430318412551916,
+      "grad_norm": 0.2942920923233032,
+      "learning_rate": 0.00019821026502825687,
+      "loss": 1.0289,
+      "step": 2675
+    },
+    {
+      "epoch": 0.15459160129210892,
+      "grad_norm": 0.2862975597381592,
+      "learning_rate": 0.00019819125269405352,
+      "loss": 0.9961,
+      "step": 2680
+    },
+    {
+      "epoch": 0.15488001845869867,
+      "grad_norm": 0.2896837890148163,
+      "learning_rate": 0.00019817214083031614,
+      "loss": 1.0002,
+      "step": 2685
+    },
+    {
+      "epoch": 0.15516843562528843,
+      "grad_norm": 0.26825401186943054,
+      "learning_rate": 0.00019815292945641705,
+      "loss": 0.9874,
+      "step": 2690
+    },
+    {
+      "epoch": 0.15545685279187818,
+      "grad_norm": 0.2813914120197296,
+      "learning_rate": 0.00019813361859182945,
+      "loss": 0.9919,
+      "step": 2695
+    },
+    {
+      "epoch": 0.15574526995846794,
+      "grad_norm": 0.284069687128067,
+      "learning_rate": 0.0001981142082561274,
+      "loss": 0.8997,
+      "step": 2700
+    },
+    {
+      "epoch": 0.1560336871250577,
+      "grad_norm": 0.2858209013938904,
+      "learning_rate": 0.00019809469846898586,
+      "loss": 0.9546,
+      "step": 2705
+    },
+    {
+      "epoch": 0.15632210429164745,
+      "grad_norm": 0.2836093604564667,
+      "learning_rate": 0.0001980750892501804,
+      "loss": 0.9254,
+      "step": 2710
+    },
+    {
+      "epoch": 0.1566105214582372,
+      "grad_norm": 0.32628414034843445,
+      "learning_rate": 0.00019805538061958765,
+      "loss": 0.94,
+      "step": 2715
+    },
+    {
+      "epoch": 0.15689893862482696,
+      "grad_norm": 0.2873879373073578,
+      "learning_rate": 0.0001980355725971847,
+      "loss": 0.9598,
+      "step": 2720
+    },
+    {
+      "epoch": 0.1571873557914167,
+      "grad_norm": 0.27270689606666565,
+      "learning_rate": 0.00019801566520304963,
+      "loss": 0.9622,
+      "step": 2725
+    },
+    {
+      "epoch": 0.15747577295800647,
+      "grad_norm": 0.25972458720207214,
+      "learning_rate": 0.0001979956584573612,
+      "loss": 0.9895,
+      "step": 2730
+    },
+    {
+      "epoch": 0.15776419012459622,
+      "grad_norm": 0.2917114198207855,
+      "learning_rate": 0.00019797555238039872,
+      "loss": 0.9528,
+      "step": 2735
+    },
+    {
+      "epoch": 0.15805260729118598,
+      "grad_norm": 0.26294592022895813,
+      "learning_rate": 0.00019795534699254238,
+      "loss": 0.9309,
+      "step": 2740
+    },
+    {
+      "epoch": 0.15834102445777573,
+      "grad_norm": 0.28122779726982117,
+      "learning_rate": 0.0001979350423142729,
+      "loss": 0.9853,
+      "step": 2745
+    },
+    {
+      "epoch": 0.15862944162436549,
+      "grad_norm": 0.29183605313301086,
+      "learning_rate": 0.00019791463836617176,
+      "loss": 0.9382,
+      "step": 2750
+    },
+    {
+      "epoch": 0.15891785879095524,
+      "grad_norm": 0.28074556589126587,
+      "learning_rate": 0.00019789413516892098,
+      "loss": 1.01,
+      "step": 2755
+    },
+    {
+      "epoch": 0.159206275957545,
+      "grad_norm": 0.2814944088459015,
+      "learning_rate": 0.00019787353274330313,
+      "loss": 1.0161,
+      "step": 2760
+    },
+    {
+      "epoch": 0.15949469312413475,
+      "grad_norm": 0.2898254990577698,
+      "learning_rate": 0.00019785283111020156,
+      "loss": 1.0388,
+      "step": 2765
+    },
+    {
+      "epoch": 0.1597831102907245,
+      "grad_norm": 0.2777402400970459,
+      "learning_rate": 0.00019783203029059997,
+      "loss": 0.9589,
+      "step": 2770
+    },
+    {
+      "epoch": 0.16007152745731426,
+      "grad_norm": 0.2646116316318512,
+      "learning_rate": 0.00019781113030558267,
+      "loss": 0.9569,
+      "step": 2775
+    },
+    {
+      "epoch": 0.16035994462390402,
+      "grad_norm": 0.3243483304977417,
+      "learning_rate": 0.00019779013117633454,
+      "loss": 0.9622,
+      "step": 2780
+    },
+    {
+      "epoch": 0.16064836179049377,
+      "grad_norm": 0.2765612304210663,
+      "learning_rate": 0.0001977690329241409,
+      "loss": 1.0068,
+      "step": 2785
+    },
+    {
+      "epoch": 0.16093677895708353,
+      "grad_norm": 0.30408522486686707,
+      "learning_rate": 0.00019774783557038755,
+      "loss": 0.969,
+      "step": 2790
+    },
+    {
+      "epoch": 0.16122519612367328,
+      "grad_norm": 0.26990190148353577,
+      "learning_rate": 0.00019772653913656076,
+      "loss": 1.025,
+      "step": 2795
+    },
+    {
+      "epoch": 0.16151361329026304,
+      "grad_norm": 0.31291985511779785,
+      "learning_rate": 0.00019770514364424725,
+      "loss": 1.0174,
+      "step": 2800
+    },
+    {
+      "epoch": 0.1618020304568528,
+      "grad_norm": 0.31198903918266296,
+      "learning_rate": 0.00019768364911513405,
+      "loss": 0.9603,
+      "step": 2805
+    },
+    {
+      "epoch": 0.16209044762344255,
+      "grad_norm": 0.28119274973869324,
+      "learning_rate": 0.00019766205557100868,
+      "loss": 0.9689,
+      "step": 2810
+    },
+    {
+      "epoch": 0.1623788647900323,
+      "grad_norm": 0.27684643864631653,
+      "learning_rate": 0.000197640363033759,
+      "loss": 0.9272,
+      "step": 2815
+    },
+    {
+      "epoch": 0.16266728195662206,
+      "grad_norm": 0.2740548253059387,
+      "learning_rate": 0.0001976185715253732,
+      "loss": 1.0165,
+      "step": 2820
+    },
+    {
+      "epoch": 0.1629556991232118,
+      "grad_norm": 0.3126582205295563,
+      "learning_rate": 0.00019759668106793975,
+      "loss": 0.9915,
+      "step": 2825
+    },
+    {
+      "epoch": 0.16324411628980157,
+      "grad_norm": 0.27744656801223755,
+      "learning_rate": 0.0001975746916836475,
+      "loss": 0.9971,
+      "step": 2830
+    },
+    {
+      "epoch": 0.16353253345639132,
+      "grad_norm": 0.280280202627182,
+      "learning_rate": 0.00019755260339478556,
+      "loss": 0.9637,
+      "step": 2835
+    },
+    {
+      "epoch": 0.16382095062298108,
+      "grad_norm": 0.2840816378593445,
+      "learning_rate": 0.0001975304162237432,
+      "loss": 0.9603,
+      "step": 2840
+    },
+    {
+      "epoch": 0.16410936778957083,
+      "grad_norm": 0.2826577126979828,
+      "learning_rate": 0.00019750813019301004,
+      "loss": 1.0331,
+      "step": 2845
+    },
+    {
+      "epoch": 0.1643977849561606,
+      "grad_norm": 0.2963692545890808,
+      "learning_rate": 0.00019748574532517586,
+      "loss": 0.999,
+      "step": 2850
+    },
+    {
+      "epoch": 0.16468620212275034,
+      "grad_norm": 0.2895634174346924,
+      "learning_rate": 0.00019746326164293056,
+      "loss": 0.9637,
+      "step": 2855
+    },
+    {
+      "epoch": 0.1649746192893401,
+      "grad_norm": 0.287422776222229,
+      "learning_rate": 0.0001974406791690643,
+      "loss": 0.9696,
+      "step": 2860
+    },
+    {
+      "epoch": 0.16526303645592985,
+      "grad_norm": 0.31378328800201416,
+      "learning_rate": 0.00019741799792646734,
+      "loss": 1.0066,
+      "step": 2865
+    },
+    {
+      "epoch": 0.1655514536225196,
+      "grad_norm": 0.28587618470191956,
+      "learning_rate": 0.00019739521793813006,
+      "loss": 0.9224,
+      "step": 2870
+    },
+    {
+      "epoch": 0.16583987078910936,
+      "grad_norm": 0.28385454416275024,
+      "learning_rate": 0.0001973723392271429,
+      "loss": 0.9961,
+      "step": 2875
+    },
+    {
+      "epoch": 0.16612828795569912,
+      "grad_norm": 0.27586954832077026,
+      "learning_rate": 0.00019734936181669638,
+      "loss": 1.065,
+      "step": 2880
+    },
+    {
+      "epoch": 0.16641670512228887,
+      "grad_norm": 0.30055347084999084,
+      "learning_rate": 0.00019732628573008114,
+      "loss": 1.0089,
+      "step": 2885
+    },
+    {
+      "epoch": 0.16670512228887863,
+      "grad_norm": 0.30119630694389343,
+      "learning_rate": 0.00019730311099068771,
+      "loss": 1.017,
+      "step": 2890
+    },
+    {
+      "epoch": 0.16699353945546838,
+      "grad_norm": 0.29206573963165283,
+      "learning_rate": 0.00019727983762200677,
+      "loss": 0.9635,
+      "step": 2895
+    },
+    {
+      "epoch": 0.16728195662205814,
+      "grad_norm": 0.2570163905620575,
+      "learning_rate": 0.00019725646564762878,
+      "loss": 0.9791,
+      "step": 2900
+    },
+    {
+      "epoch": 0.1675703737886479,
+      "grad_norm": 0.3360570967197418,
+      "learning_rate": 0.00019723299509124433,
+      "loss": 0.9498,
+      "step": 2905
+    },
+    {
+      "epoch": 0.16785879095523765,
+      "grad_norm": 0.29323843121528625,
+      "learning_rate": 0.00019720942597664385,
+      "loss": 0.986,
+      "step": 2910
+    },
+    {
+      "epoch": 0.1681472081218274,
+      "grad_norm": 0.30418166518211365,
+      "learning_rate": 0.00019718575832771768,
+      "loss": 0.9756,
+      "step": 2915
+    },
+    {
+      "epoch": 0.16843562528841716,
+      "grad_norm": 0.31183257699012756,
+      "learning_rate": 0.00019716199216845604,
+      "loss": 0.9997,
+      "step": 2920
+    },
+    {
+      "epoch": 0.1687240424550069,
+      "grad_norm": 0.26834046840667725,
+      "learning_rate": 0.000197138127522949,
+      "loss": 0.9315,
+      "step": 2925
+    },
+    {
+      "epoch": 0.16901245962159667,
+      "grad_norm": 0.27434879541397095,
+      "learning_rate": 0.00019711416441538652,
+      "loss": 1.0105,
+      "step": 2930
+    },
+    {
+      "epoch": 0.16930087678818642,
+      "grad_norm": 0.28828758001327515,
+      "learning_rate": 0.00019709010287005825,
+      "loss": 1.0128,
+      "step": 2935
+    },
+    {
+      "epoch": 0.16958929395477618,
+      "grad_norm": 0.2850480079650879,
+      "learning_rate": 0.00019706594291135366,
+      "loss": 0.9618,
+      "step": 2940
+    },
+    {
+      "epoch": 0.16987771112136593,
+      "grad_norm": 0.2937301993370056,
+      "learning_rate": 0.00019704168456376205,
+      "loss": 1.0175,
+      "step": 2945
+    },
+    {
+      "epoch": 0.1701661282879557,
+      "grad_norm": 0.28153088688850403,
+      "learning_rate": 0.0001970173278518724,
+      "loss": 0.9541,
+      "step": 2950
+    },
+    {
+      "epoch": 0.17045454545454544,
+      "grad_norm": 0.2839425802230835,
+      "learning_rate": 0.00019699287280037332,
+      "loss": 1.0139,
+      "step": 2955
+    },
+    {
+      "epoch": 0.1707429626211352,
+      "grad_norm": 0.28864094614982605,
+      "learning_rate": 0.00019696831943405324,
+      "loss": 1.0833,
+      "step": 2960
+    },
+    {
+      "epoch": 0.17103137978772495,
+      "grad_norm": 0.2697494626045227,
+      "learning_rate": 0.0001969436677778001,
+      "loss": 0.9827,
+      "step": 2965
+    },
+    {
+      "epoch": 0.1713197969543147,
+      "grad_norm": 0.2844550907611847,
+      "learning_rate": 0.0001969189178566016,
+      "loss": 1.005,
+      "step": 2970
+    },
+    {
+      "epoch": 0.1716082141209045,
+      "grad_norm": 0.30949264764785767,
+      "learning_rate": 0.000196894069695545,
+      "loss": 0.9696,
+      "step": 2975
+    },
+    {
+      "epoch": 0.17189663128749424,
+      "grad_norm": 0.2768407464027405,
+      "learning_rate": 0.00019686912331981702,
+      "loss": 0.9931,
+      "step": 2980
+    },
+    {
+      "epoch": 0.172185048454084,
+      "grad_norm": 0.28683245182037354,
+      "learning_rate": 0.00019684407875470415,
+      "loss": 1.0018,
+      "step": 2985
+    },
+    {
+      "epoch": 0.17247346562067375,
+      "grad_norm": 0.3155616223812103,
+      "learning_rate": 0.00019681893602559224,
+      "loss": 0.9813,
+      "step": 2990
+    },
+    {
+      "epoch": 0.1727618827872635,
+      "grad_norm": 0.3154447376728058,
+      "learning_rate": 0.0001967936951579667,
+      "loss": 0.9915,
+      "step": 2995
+    },
+    {
+      "epoch": 0.17305029995385326,
+      "grad_norm": 0.277576744556427,
+      "learning_rate": 0.00019676835617741249,
+      "loss": 0.9668,
+      "step": 3000
+    },
+    {
+      "epoch": 0.17333871712044302,
+      "grad_norm": 0.28618210554122925,
+      "learning_rate": 0.0001967429191096138,
+      "loss": 0.9745,
+      "step": 3005
+    },
+    {
+      "epoch": 0.17362713428703277,
+      "grad_norm": 0.27911707758903503,
+      "learning_rate": 0.0001967173839803545,
+      "loss": 0.9732,
+      "step": 3010
+    },
+    {
+      "epoch": 0.17391555145362253,
+      "grad_norm": 0.28373172879219055,
+      "learning_rate": 0.00019669175081551773,
+      "loss": 0.9797,
+      "step": 3015
+    },
+    {
+      "epoch": 0.17420396862021229,
+      "grad_norm": 0.29749229550361633,
+      "learning_rate": 0.00019666601964108598,
+      "loss": 0.94,
+      "step": 3020
+    },
+    {
+      "epoch": 0.17449238578680204,
+      "grad_norm": 0.31651487946510315,
+      "learning_rate": 0.00019664019048314116,
+      "loss": 0.9829,
+      "step": 3025
+    },
+    {
+      "epoch": 0.1747808029533918,
+      "grad_norm": 0.2834007740020752,
+      "learning_rate": 0.00019661426336786445,
+      "loss": 0.9336,
+      "step": 3030
+    },
+    {
+      "epoch": 0.17506922011998155,
+      "grad_norm": 0.2876712381839752,
+      "learning_rate": 0.00019658823832153632,
+      "loss": 0.9174,
+      "step": 3035
+    },
+    {
+      "epoch": 0.1753576372865713,
+      "grad_norm": 0.3259499669075012,
+      "learning_rate": 0.00019656211537053654,
+      "loss": 1.0362,
+      "step": 3040
+    },
+    {
+      "epoch": 0.17564605445316106,
+      "grad_norm": 0.26136502623558044,
+      "learning_rate": 0.00019653589454134406,
+      "loss": 0.9399,
+      "step": 3045
+    },
+    {
+      "epoch": 0.17593447161975082,
+      "grad_norm": 0.28630778193473816,
+      "learning_rate": 0.00019650957586053716,
+      "loss": 0.9861,
+      "step": 3050
+    },
+    {
+      "epoch": 0.17622288878634057,
+      "grad_norm": 0.2615172266960144,
+      "learning_rate": 0.00019648315935479315,
+      "loss": 1.0378,
+      "step": 3055
+    },
+    {
+      "epoch": 0.17651130595293033,
+      "grad_norm": 0.28133901953697205,
+      "learning_rate": 0.00019645664505088864,
+      "loss": 0.9746,
+      "step": 3060
+    },
+    {
+      "epoch": 0.17679972311952008,
+      "grad_norm": 0.3203901946544647,
+      "learning_rate": 0.00019643003297569923,
+      "loss": 0.9894,
+      "step": 3065
+    },
+    {
+      "epoch": 0.17708814028610984,
+      "grad_norm": 0.2845044434070587,
+      "learning_rate": 0.00019640332315619977,
+      "loss": 1.0024,
+      "step": 3070
+    },
+    {
+      "epoch": 0.1773765574526996,
+      "grad_norm": 0.28776776790618896,
+      "learning_rate": 0.0001963765156194641,
+      "loss": 1.0035,
+      "step": 3075
+    },
+    {
+      "epoch": 0.17766497461928935,
+      "grad_norm": 0.2923831343650818,
+      "learning_rate": 0.00019634961039266506,
+      "loss": 1.0253,
+      "step": 3080
+    },
+    {
+      "epoch": 0.1779533917858791,
+      "grad_norm": 0.29954782128334045,
+      "learning_rate": 0.00019632260750307467,
+      "loss": 0.9984,
+      "step": 3085
+    },
+    {
+      "epoch": 0.17824180895246886,
+      "grad_norm": 0.30335840582847595,
+      "learning_rate": 0.0001962955069780638,
+      "loss": 0.9339,
+      "step": 3090
+    },
+    {
+      "epoch": 0.1785302261190586,
+      "grad_norm": 0.28872916102409363,
+      "learning_rate": 0.00019626830884510236,
+      "loss": 1.0417,
+      "step": 3095
+    },
+    {
+      "epoch": 0.17881864328564837,
+      "grad_norm": 0.3210926949977875,
+      "learning_rate": 0.00019624101313175918,
+      "loss": 1.0293,
+      "step": 3100
+    },
+    {
+      "epoch": 0.17910706045223812,
+      "grad_norm": 0.29229721426963806,
+      "learning_rate": 0.00019621361986570194,
+      "loss": 0.9386,
+      "step": 3105
+    },
+    {
+      "epoch": 0.17939547761882788,
+      "grad_norm": 0.3137836754322052,
+      "learning_rate": 0.00019618612907469732,
+      "loss": 0.9874,
+      "step": 3110
+    },
+    {
+      "epoch": 0.17968389478541763,
+      "grad_norm": 0.27663466334342957,
+      "learning_rate": 0.00019615854078661077,
+      "loss": 0.9902,
+      "step": 3115
+    },
+    {
+      "epoch": 0.17997231195200739,
+      "grad_norm": 0.30164676904678345,
+      "learning_rate": 0.00019613085502940658,
+      "loss": 1.1187,
+      "step": 3120
+    },
+    {
+      "epoch": 0.18026072911859714,
+      "grad_norm": 0.2817506790161133,
+      "learning_rate": 0.00019610307183114787,
+      "loss": 0.9643,
+      "step": 3125
+    },
+    {
+      "epoch": 0.1805491462851869,
+      "grad_norm": 0.28451189398765564,
+      "learning_rate": 0.00019607519121999647,
+      "loss": 0.9553,
+      "step": 3130
+    },
+    {
+      "epoch": 0.18083756345177665,
+      "grad_norm": 0.3148361146450043,
+      "learning_rate": 0.00019604721322421303,
+      "loss": 0.9596,
+      "step": 3135
+    },
+    {
+      "epoch": 0.1811259806183664,
+      "grad_norm": 0.3131537437438965,
+      "learning_rate": 0.00019601913787215683,
+      "loss": 0.9841,
+      "step": 3140
+    },
+    {
+      "epoch": 0.18141439778495616,
+      "grad_norm": 0.301500141620636,
+      "learning_rate": 0.00019599096519228585,
+      "loss": 0.9387,
+      "step": 3145
+    },
+    {
+      "epoch": 0.18170281495154592,
+      "grad_norm": 0.2999275028705597,
+      "learning_rate": 0.0001959626952131568,
+      "loss": 0.8649,
+      "step": 3150
+    },
+    {
+      "epoch": 0.18199123211813567,
+      "grad_norm": 0.3055667281150818,
+      "learning_rate": 0.00019593432796342496,
+      "loss": 1.0364,
+      "step": 3155
+    },
+    {
+      "epoch": 0.18227964928472543,
+      "grad_norm": 0.30451443791389465,
+      "learning_rate": 0.00019590586347184417,
+      "loss": 1.0552,
+      "step": 3160
+    },
+    {
+      "epoch": 0.18256806645131518,
+      "grad_norm": 0.3046397566795349,
+      "learning_rate": 0.00019587730176726686,
+      "loss": 0.9897,
+      "step": 3165
+    },
+    {
+      "epoch": 0.18285648361790494,
+      "grad_norm": 0.3132875859737396,
+      "learning_rate": 0.00019584864287864408,
+      "loss": 0.953,
+      "step": 3170
+    },
+    {
+      "epoch": 0.1831449007844947,
+      "grad_norm": 0.2684531807899475,
+      "learning_rate": 0.00019581988683502525,
+      "loss": 1.0479,
+      "step": 3175
+    },
+    {
+      "epoch": 0.18343331795108445,
+      "grad_norm": 0.3220478594303131,
+      "learning_rate": 0.0001957910336655584,
+      "loss": 0.9818,
+      "step": 3180
+    },
+    {
+      "epoch": 0.1837217351176742,
+      "grad_norm": 0.29744499921798706,
+      "learning_rate": 0.00019576208339948988,
+      "loss": 0.985,
+      "step": 3185
+    },
+    {
+      "epoch": 0.18401015228426396,
+      "grad_norm": 0.26757848262786865,
+      "learning_rate": 0.00019573303606616459,
+      "loss": 0.9966,
+      "step": 3190
+    },
+    {
+      "epoch": 0.1842985694508537,
+      "grad_norm": 0.2966987192630768,
+      "learning_rate": 0.00019570389169502569,
+      "loss": 0.9853,
+      "step": 3195
+    },
+    {
+      "epoch": 0.18458698661744347,
+      "grad_norm": 0.2907325327396393,
+      "learning_rate": 0.00019567465031561487,
+      "loss": 1.0468,
+      "step": 3200
+    },
+    {
+      "epoch": 0.18487540378403322,
+      "grad_norm": 0.2841055989265442,
+      "learning_rate": 0.00019564531195757193,
+      "loss": 0.9837,
+      "step": 3205
+    },
+    {
+      "epoch": 0.18516382095062298,
+      "grad_norm": 0.2998584806919098,
+      "learning_rate": 0.0001956158766506352,
+      "loss": 1.0282,
+      "step": 3210
+    },
+    {
+      "epoch": 0.18545223811721273,
+      "grad_norm": 0.3043042719364166,
+      "learning_rate": 0.00019558634442464113,
+      "loss": 0.911,
+      "step": 3215
+    },
+    {
+      "epoch": 0.18574065528380249,
+      "grad_norm": 0.30067190527915955,
+      "learning_rate": 0.00019555671530952445,
+      "loss": 0.9701,
+      "step": 3220
+    },
+    {
+      "epoch": 0.18602907245039224,
+      "grad_norm": 0.297343373298645,
+      "learning_rate": 0.00019552698933531808,
+      "loss": 0.9935,
+      "step": 3225
+    },
+    {
+      "epoch": 0.186317489616982,
+      "grad_norm": 0.2842741310596466,
+      "learning_rate": 0.00019549716653215318,
+      "loss": 0.999,
+      "step": 3230
+    },
+    {
+      "epoch": 0.18660590678357175,
+      "grad_norm": 0.27844905853271484,
+      "learning_rate": 0.00019546724693025896,
+      "loss": 0.9668,
+      "step": 3235
+    },
+    {
+      "epoch": 0.1868943239501615,
+      "grad_norm": 0.29974377155303955,
+      "learning_rate": 0.00019543723055996282,
+      "loss": 0.9864,
+      "step": 3240
+    },
+    {
+      "epoch": 0.18718274111675126,
+      "grad_norm": 0.2982295751571655,
+      "learning_rate": 0.0001954071174516903,
+      "loss": 0.9902,
+      "step": 3245
+    },
+    {
+      "epoch": 0.18747115828334102,
+      "grad_norm": 0.3086935579776764,
+      "learning_rate": 0.00019537690763596487,
+      "loss": 0.9954,
+      "step": 3250
+    },
+    {
+      "epoch": 0.18775957544993077,
+      "grad_norm": 0.28824785351753235,
+      "learning_rate": 0.0001953466011434081,
+      "loss": 0.9979,
+      "step": 3255
+    },
+    {
+      "epoch": 0.18804799261652053,
+      "grad_norm": 0.2743071913719177,
+      "learning_rate": 0.00019531619800473952,
+      "loss": 0.9299,
+      "step": 3260
+    },
+    {
+      "epoch": 0.18833640978311028,
+      "grad_norm": 0.2896062433719635,
+      "learning_rate": 0.00019528569825077668,
+      "loss": 0.9861,
+      "step": 3265
+    },
+    {
+      "epoch": 0.18862482694970004,
+      "grad_norm": 0.29393669962882996,
+      "learning_rate": 0.00019525510191243498,
+      "loss": 1.0792,
+      "step": 3270
+    },
+    {
+      "epoch": 0.1889132441162898,
+      "grad_norm": 0.3489181399345398,
+      "learning_rate": 0.00019522440902072782,
+      "loss": 1.0056,
+      "step": 3275
+    },
+    {
+      "epoch": 0.18920166128287955,
+      "grad_norm": 0.31945231556892395,
+      "learning_rate": 0.0001951936196067664,
+      "loss": 1.0386,
+      "step": 3280
+    },
+    {
+      "epoch": 0.1894900784494693,
+      "grad_norm": 0.30114686489105225,
+      "learning_rate": 0.00019516273370175972,
+      "loss": 0.9667,
+      "step": 3285
+    },
+    {
+      "epoch": 0.18977849561605906,
+      "grad_norm": 0.3653857409954071,
+      "learning_rate": 0.00019513175133701474,
+      "loss": 0.9465,
+      "step": 3290
+    },
+    {
+      "epoch": 0.1900669127826488,
+      "grad_norm": 0.2919418513774872,
+      "learning_rate": 0.000195100672543936,
+      "loss": 0.9252,
+      "step": 3295
+    },
+    {
+      "epoch": 0.19035532994923857,
+      "grad_norm": 0.29241377115249634,
+      "learning_rate": 0.00019506949735402588,
+      "loss": 0.929,
+      "step": 3300
+    },
+    {
+      "epoch": 0.19064374711582832,
+      "grad_norm": 0.30068260431289673,
+      "learning_rate": 0.00019503822579888453,
+      "loss": 1.0254,
+      "step": 3305
+    },
+    {
+      "epoch": 0.19093216428241808,
+      "grad_norm": 0.2954903542995453,
+      "learning_rate": 0.00019500685791020968,
+      "loss": 0.9485,
+      "step": 3310
+    },
+    {
+      "epoch": 0.19122058144900783,
+      "grad_norm": 0.2899206876754761,
+      "learning_rate": 0.00019497539371979674,
+      "loss": 1.036,
+      "step": 3315
+    },
+    {
+      "epoch": 0.1915089986155976,
+      "grad_norm": 0.3165214955806732,
+      "learning_rate": 0.00019494383325953875,
+      "loss": 0.9616,
+      "step": 3320
+    },
+    {
+      "epoch": 0.19179741578218737,
+      "grad_norm": 0.3250178396701813,
+      "learning_rate": 0.0001949121765614263,
+      "loss": 0.9648,
+      "step": 3325
+    },
+    {
+      "epoch": 0.19208583294877712,
+      "grad_norm": 0.2635006904602051,
+      "learning_rate": 0.00019488042365754758,
+      "loss": 0.9789,
+      "step": 3330
+    },
+    {
+      "epoch": 0.19237425011536688,
+      "grad_norm": 0.2964721620082855,
+      "learning_rate": 0.0001948485745800882,
+      "loss": 0.9432,
+      "step": 3335
+    },
+    {
+      "epoch": 0.19266266728195663,
+      "grad_norm": 0.2993474006652832,
+      "learning_rate": 0.0001948166293613314,
+      "loss": 0.9556,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1929510844485464,
+      "grad_norm": 0.28304216265678406,
+      "learning_rate": 0.00019478458803365772,
+      "loss": 0.9445,
+      "step": 3345
+    },
+    {
+      "epoch": 0.19323950161513614,
+      "grad_norm": 0.2697024941444397,
+      "learning_rate": 0.00019475245062954523,
+      "loss": 1.0552,
+      "step": 3350
+    },
+    {
+      "epoch": 0.1935279187817259,
+      "grad_norm": 0.2875863015651703,
+      "learning_rate": 0.00019472021718156937,
+      "loss": 0.9319,
+      "step": 3355
+    },
+    {
+      "epoch": 0.19381633594831565,
+      "grad_norm": 0.3006811738014221,
+      "learning_rate": 0.00019468788772240286,
+      "loss": 1.0049,
+      "step": 3360
+    },
+    {
+      "epoch": 0.1941047531149054,
+      "grad_norm": 0.30004388093948364,
+      "learning_rate": 0.0001946554622848158,
+      "loss": 1.0181,
+      "step": 3365
+    },
+    {
+      "epoch": 0.19439317028149516,
+      "grad_norm": 0.3029836118221283,
+      "learning_rate": 0.00019462294090167554,
+      "loss": 1.045,
+      "step": 3370
+    },
+    {
+      "epoch": 0.19468158744808492,
+      "grad_norm": 0.2854270339012146,
+      "learning_rate": 0.00019459032360594677,
+      "loss": 0.9876,
+      "step": 3375
+    },
+    {
+      "epoch": 0.19497000461467467,
+      "grad_norm": 0.3001527786254883,
+      "learning_rate": 0.0001945576104306913,
+      "loss": 0.9083,
+      "step": 3380
+    },
+    {
+      "epoch": 0.19525842178126443,
+      "grad_norm": 0.2907600700855255,
+      "learning_rate": 0.00019452480140906819,
+      "loss": 0.9734,
+      "step": 3385
+    },
+    {
+      "epoch": 0.19554683894785418,
+      "grad_norm": 0.2804548442363739,
+      "learning_rate": 0.00019449189657433358,
+      "loss": 1.0032,
+      "step": 3390
+    },
+    {
+      "epoch": 0.19583525611444394,
+      "grad_norm": 0.29847756028175354,
+      "learning_rate": 0.0001944588959598408,
+      "loss": 0.9485,
+      "step": 3395
+    },
+    {
+      "epoch": 0.1961236732810337,
+      "grad_norm": 0.28965532779693604,
+      "learning_rate": 0.00019442579959904024,
+      "loss": 0.9713,
+      "step": 3400
+    },
+    {
+      "epoch": 0.19641209044762345,
+      "grad_norm": 0.295213520526886,
+      "learning_rate": 0.00019439260752547935,
+      "loss": 0.9486,
+      "step": 3405
+    },
+    {
+      "epoch": 0.1967005076142132,
+      "grad_norm": 0.2934512794017792,
+      "learning_rate": 0.0001943593197728026,
+      "loss": 1.0448,
+      "step": 3410
+    },
+    {
+      "epoch": 0.19698892478080296,
+      "grad_norm": 0.29289090633392334,
+      "learning_rate": 0.00019432593637475138,
+      "loss": 0.9959,
+      "step": 3415
+    },
+    {
+      "epoch": 0.19727734194739271,
+      "grad_norm": 0.2757977545261383,
+      "learning_rate": 0.00019429245736516415,
+      "loss": 0.9612,
+      "step": 3420
+    },
+    {
+      "epoch": 0.19756575911398247,
+      "grad_norm": 0.28514814376831055,
+      "learning_rate": 0.00019425888277797615,
+      "loss": 1.0246,
+      "step": 3425
+    },
+    {
+      "epoch": 0.19785417628057222,
+      "grad_norm": 0.32380256056785583,
+      "learning_rate": 0.00019422521264721962,
+      "loss": 0.9404,
+      "step": 3430
+    },
+    {
+      "epoch": 0.19814259344716198,
+      "grad_norm": 0.28507691621780396,
+      "learning_rate": 0.0001941914470070236,
+      "loss": 0.8902,
+      "step": 3435
+    },
+    {
+      "epoch": 0.19843101061375173,
+      "grad_norm": 0.3757873773574829,
+      "learning_rate": 0.00019415758589161385,
+      "loss": 1.0038,
+      "step": 3440
+    },
+    {
+      "epoch": 0.1987194277803415,
+      "grad_norm": 0.3061589300632477,
+      "learning_rate": 0.00019412362933531307,
+      "loss": 0.8961,
+      "step": 3445
+    },
+    {
+      "epoch": 0.19900784494693124,
+      "grad_norm": 0.29617950320243835,
+      "learning_rate": 0.0001940895773725406,
+      "loss": 0.9573,
+      "step": 3450
+    },
+    {
+      "epoch": 0.199296262113521,
+      "grad_norm": 0.27990731596946716,
+      "learning_rate": 0.00019405543003781251,
+      "loss": 1.044,
+      "step": 3455
+    },
+    {
+      "epoch": 0.19958467928011075,
+      "grad_norm": 0.29822319746017456,
+      "learning_rate": 0.00019402118736574155,
+      "loss": 0.9799,
+      "step": 3460
+    },
+    {
+      "epoch": 0.1998730964467005,
+      "grad_norm": 0.3118431866168976,
+      "learning_rate": 0.00019398684939103707,
+      "loss": 1.0417,
+      "step": 3465
+    },
+    {
+      "epoch": 0.20016151361329027,
+      "grad_norm": 0.3202954828739166,
+      "learning_rate": 0.00019395241614850504,
+      "loss": 0.9731,
+      "step": 3470
+    },
+    {
+      "epoch": 0.20044993077988002,
+      "grad_norm": 0.3098292052745819,
+      "learning_rate": 0.00019391788767304804,
+      "loss": 0.985,
+      "step": 3475
+    },
+    {
+      "epoch": 0.20073834794646978,
+      "grad_norm": 0.2931598722934723,
+      "learning_rate": 0.00019388326399966515,
+      "loss": 1.0129,
+      "step": 3480
+    },
+    {
+      "epoch": 0.20102676511305953,
+      "grad_norm": 0.2935352027416229,
+      "learning_rate": 0.0001938485451634519,
+      "loss": 0.9402,
+      "step": 3485
+    },
+    {
+      "epoch": 0.20131518227964929,
+      "grad_norm": 0.3236974775791168,
+      "learning_rate": 0.00019381373119960033,
+      "loss": 1.0507,
+      "step": 3490
+    },
+    {
+      "epoch": 0.20160359944623904,
+      "grad_norm": 0.3834960162639618,
+      "learning_rate": 0.00019377882214339893,
+      "loss": 0.9554,
+      "step": 3495
+    },
+    {
+      "epoch": 0.2018920166128288,
+      "grad_norm": 0.2892552316188812,
+      "learning_rate": 0.00019374381803023252,
+      "loss": 1.0119,
+      "step": 3500
+    },
+    {
+      "epoch": 0.20218043377941855,
+      "grad_norm": 0.29538676142692566,
+      "learning_rate": 0.0001937087188955823,
+      "loss": 0.9977,
+      "step": 3505
+    },
+    {
+      "epoch": 0.2024688509460083,
+      "grad_norm": 0.2964411973953247,
+      "learning_rate": 0.00019367352477502576,
+      "loss": 0.9636,
+      "step": 3510
+    },
+    {
+      "epoch": 0.20275726811259806,
+      "grad_norm": 0.3167349696159363,
+      "learning_rate": 0.00019363823570423675,
+      "loss": 0.9345,
+      "step": 3515
+    },
+    {
+      "epoch": 0.20304568527918782,
+      "grad_norm": 0.3199044466018677,
+      "learning_rate": 0.0001936028517189852,
+      "loss": 0.913,
+      "step": 3520
+    },
+    {
+      "epoch": 0.20333410244577757,
+      "grad_norm": 0.27600806951522827,
+      "learning_rate": 0.00019356737285513748,
+      "loss": 0.959,
+      "step": 3525
+    },
+    {
+      "epoch": 0.20362251961236733,
+      "grad_norm": 0.31621217727661133,
+      "learning_rate": 0.00019353179914865596,
+      "loss": 1.0437,
+      "step": 3530
+    },
+    {
+      "epoch": 0.20391093677895708,
+      "grad_norm": 0.30049943923950195,
+      "learning_rate": 0.00019349613063559916,
+      "loss": 0.9675,
+      "step": 3535
+    },
+    {
+      "epoch": 0.20419935394554684,
+      "grad_norm": 0.3039463460445404,
+      "learning_rate": 0.00019346036735212177,
+      "loss": 1.0542,
+      "step": 3540
+    },
+    {
+      "epoch": 0.2044877711121366,
+      "grad_norm": 0.3049977123737335,
+      "learning_rate": 0.00019342450933447448,
+      "loss": 0.8974,
+      "step": 3545
+    },
+    {
+      "epoch": 0.20477618827872635,
+      "grad_norm": 0.2853706181049347,
+      "learning_rate": 0.00019338855661900405,
+      "loss": 0.9711,
+      "step": 3550
+    },
+    {
+      "epoch": 0.2050646054453161,
+      "grad_norm": 0.2970394492149353,
+      "learning_rate": 0.00019335250924215318,
+      "loss": 0.9516,
+      "step": 3555
+    },
+    {
+      "epoch": 0.20535302261190586,
+      "grad_norm": 0.3310398459434509,
+      "learning_rate": 0.00019331636724046058,
+      "loss": 0.9293,
+      "step": 3560
+    },
+    {
+      "epoch": 0.2056414397784956,
+      "grad_norm": 0.2932792901992798,
+      "learning_rate": 0.0001932801306505608,
+      "loss": 1.0088,
+      "step": 3565
+    },
+    {
+      "epoch": 0.20592985694508537,
+      "grad_norm": 0.3343851566314697,
+      "learning_rate": 0.00019324379950918437,
+      "loss": 1.0363,
+      "step": 3570
+    },
+    {
+      "epoch": 0.20621827411167512,
+      "grad_norm": 0.30094677209854126,
+      "learning_rate": 0.00019320737385315756,
+      "loss": 1.0072,
+      "step": 3575
+    },
+    {
+      "epoch": 0.20650669127826488,
+      "grad_norm": 0.28837206959724426,
+      "learning_rate": 0.00019317085371940246,
+      "loss": 0.9139,
+      "step": 3580
+    },
+    {
+      "epoch": 0.20679510844485463,
+      "grad_norm": 0.29000407457351685,
+      "learning_rate": 0.00019313423914493703,
+      "loss": 0.9431,
+      "step": 3585
+    },
+    {
+      "epoch": 0.20708352561144439,
+      "grad_norm": 0.28823748230934143,
+      "learning_rate": 0.00019309753016687477,
+      "loss": 0.9281,
+      "step": 3590
+    },
+    {
+      "epoch": 0.20737194277803414,
+      "grad_norm": 0.30797070264816284,
+      "learning_rate": 0.00019306072682242505,
+      "loss": 0.9611,
+      "step": 3595
+    },
+    {
+      "epoch": 0.2076603599446239,
+      "grad_norm": 0.2971121370792389,
+      "learning_rate": 0.00019302382914889284,
+      "loss": 1.0199,
+      "step": 3600
+    },
+    {
+      "epoch": 0.20794877711121365,
+      "grad_norm": 0.2938947081565857,
+      "learning_rate": 0.00019298683718367864,
+      "loss": 0.9275,
+      "step": 3605
+    },
+    {
+      "epoch": 0.2082371942778034,
+      "grad_norm": 0.3001919686794281,
+      "learning_rate": 0.00019294975096427862,
+      "loss": 0.9963,
+      "step": 3610
+    },
+    {
+      "epoch": 0.20852561144439316,
+      "grad_norm": 0.3122607469558716,
+      "learning_rate": 0.00019291257052828447,
+      "loss": 1.0458,
+      "step": 3615
+    },
+    {
+      "epoch": 0.20881402861098292,
+      "grad_norm": 0.2895052433013916,
+      "learning_rate": 0.00019287529591338333,
+      "loss": 0.9592,
+      "step": 3620
+    },
+    {
+      "epoch": 0.20910244577757267,
+      "grad_norm": 0.2828371822834015,
+      "learning_rate": 0.0001928379271573579,
+      "loss": 0.9518,
+      "step": 3625
+    },
+    {
+      "epoch": 0.20939086294416243,
+      "grad_norm": 0.30132856965065,
+      "learning_rate": 0.0001928004642980862,
+      "loss": 0.9374,
+      "step": 3630
+    },
+    {
+      "epoch": 0.20967928011075218,
+      "grad_norm": 0.4656534194946289,
+      "learning_rate": 0.0001927629073735417,
+      "loss": 0.9824,
+      "step": 3635
+    },
+    {
+      "epoch": 0.20996769727734194,
+      "grad_norm": 0.2774214744567871,
+      "learning_rate": 0.00019272525642179323,
+      "loss": 0.9528,
+      "step": 3640
+    },
+    {
+      "epoch": 0.2102561144439317,
+      "grad_norm": 0.2919476330280304,
+      "learning_rate": 0.00019268751148100486,
+      "loss": 0.9404,
+      "step": 3645
+    },
+    {
+      "epoch": 0.21054453161052145,
+      "grad_norm": 0.3007878065109253,
+      "learning_rate": 0.00019264967258943595,
+      "loss": 0.96,
+      "step": 3650
+    },
+    {
+      "epoch": 0.2108329487771112,
+      "grad_norm": 0.30731719732284546,
+      "learning_rate": 0.0001926117397854412,
+      "loss": 0.9321,
+      "step": 3655
+    },
+    {
+      "epoch": 0.21112136594370096,
+      "grad_norm": 0.32939255237579346,
+      "learning_rate": 0.0001925737131074703,
+      "loss": 1.0182,
+      "step": 3660
+    },
+    {
+      "epoch": 0.2114097831102907,
+      "grad_norm": 0.29776227474212646,
+      "learning_rate": 0.0001925355925940683,
+      "loss": 1.0224,
+      "step": 3665
+    },
+    {
+      "epoch": 0.2116982002768805,
+      "grad_norm": 0.3057902753353119,
+      "learning_rate": 0.00019249737828387522,
+      "loss": 0.9812,
+      "step": 3670
+    },
+    {
+      "epoch": 0.21198661744347025,
+      "grad_norm": 0.3011026382446289,
+      "learning_rate": 0.0001924590702156262,
+      "loss": 0.9753,
+      "step": 3675
+    },
+    {
+      "epoch": 0.21227503461006,
+      "grad_norm": 0.2978782653808594,
+      "learning_rate": 0.00019242066842815146,
+      "loss": 1.0129,
+      "step": 3680
+    },
+    {
+      "epoch": 0.21256345177664976,
+      "grad_norm": 0.2966994047164917,
+      "learning_rate": 0.00019238217296037614,
+      "loss": 1.0068,
+      "step": 3685
+    },
+    {
+      "epoch": 0.21285186894323951,
+      "grad_norm": 0.2818816602230072,
+      "learning_rate": 0.00019234358385132038,
+      "loss": 1.0062,
+      "step": 3690
+    },
+    {
+      "epoch": 0.21314028610982927,
+      "grad_norm": 0.280269980430603,
+      "learning_rate": 0.00019230490114009928,
+      "loss": 0.9392,
+      "step": 3695
+    },
+    {
+      "epoch": 0.21342870327641902,
+      "grad_norm": 0.29371026158332825,
+      "learning_rate": 0.00019226612486592271,
+      "loss": 0.8971,
+      "step": 3700
+    },
+    {
+      "epoch": 0.21371712044300878,
+      "grad_norm": 0.3066560924053192,
+      "learning_rate": 0.00019222725506809547,
+      "loss": 0.9893,
+      "step": 3705
+    },
+    {
+      "epoch": 0.21400553760959853,
+      "grad_norm": 0.31458479166030884,
+      "learning_rate": 0.00019218829178601713,
+      "loss": 1.0389,
+      "step": 3710
+    },
+    {
+      "epoch": 0.2142939547761883,
+      "grad_norm": 0.3057044446468353,
+      "learning_rate": 0.00019214923505918202,
+      "loss": 1.0005,
+      "step": 3715
+    },
+    {
+      "epoch": 0.21458237194277804,
+      "grad_norm": 0.27441418170928955,
+      "learning_rate": 0.00019211008492717914,
+      "loss": 0.9777,
+      "step": 3720
+    },
+    {
+      "epoch": 0.2148707891093678,
+      "grad_norm": 0.2985784113407135,
+      "learning_rate": 0.00019207084142969225,
+      "loss": 1.0475,
+      "step": 3725
+    },
+    {
+      "epoch": 0.21515920627595755,
+      "grad_norm": 0.305512934923172,
+      "learning_rate": 0.0001920315046064997,
+      "loss": 0.9554,
+      "step": 3730
+    },
+    {
+      "epoch": 0.2154476234425473,
+      "grad_norm": 0.3009251356124878,
+      "learning_rate": 0.0001919920744974745,
+      "loss": 0.9912,
+      "step": 3735
+    },
+    {
+      "epoch": 0.21573604060913706,
+      "grad_norm": 0.29489755630493164,
+      "learning_rate": 0.00019195255114258408,
+      "loss": 0.9554,
+      "step": 3740
+    },
+    {
+      "epoch": 0.21602445777572682,
+      "grad_norm": 0.3059771955013275,
+      "learning_rate": 0.0001919129345818905,
+      "loss": 0.9819,
+      "step": 3745
+    },
+    {
+      "epoch": 0.21631287494231657,
+      "grad_norm": 0.3015615940093994,
+      "learning_rate": 0.00019187322485555031,
+      "loss": 0.9948,
+      "step": 3750
+    },
+    {
+      "epoch": 0.21660129210890633,
+      "grad_norm": 0.3108586072921753,
+      "learning_rate": 0.0001918334220038144,
+      "loss": 0.9818,
+      "step": 3755
+    },
+    {
+      "epoch": 0.21688970927549608,
+      "grad_norm": 0.30573326349258423,
+      "learning_rate": 0.00019179352606702813,
+      "loss": 0.9519,
+      "step": 3760
+    },
+    {
+      "epoch": 0.21717812644208584,
+      "grad_norm": 0.2957397997379303,
+      "learning_rate": 0.00019175353708563117,
+      "loss": 1.0094,
+      "step": 3765
+    },
+    {
+      "epoch": 0.2174665436086756,
+      "grad_norm": 0.2969014644622803,
+      "learning_rate": 0.00019171345510015758,
+      "loss": 1.0162,
+      "step": 3770
+    },
+    {
+      "epoch": 0.21775496077526535,
+      "grad_norm": 0.33074361085891724,
+      "learning_rate": 0.00019167328015123558,
+      "loss": 0.9382,
+      "step": 3775
+    },
+    {
+      "epoch": 0.2180433779418551,
+      "grad_norm": 0.2909998297691345,
+      "learning_rate": 0.0001916330122795877,
+      "loss": 0.9768,
+      "step": 3780
+    },
+    {
+      "epoch": 0.21833179510844486,
+      "grad_norm": 0.28647512197494507,
+      "learning_rate": 0.00019159265152603064,
+      "loss": 0.9658,
+      "step": 3785
+    },
+    {
+      "epoch": 0.21862021227503461,
+      "grad_norm": 0.3733946979045868,
+      "learning_rate": 0.00019155219793147522,
+      "loss": 1.037,
+      "step": 3790
+    },
+    {
+      "epoch": 0.21890862944162437,
+      "grad_norm": 0.2883405089378357,
+      "learning_rate": 0.00019151165153692644,
+      "loss": 0.9551,
+      "step": 3795
+    },
+    {
+      "epoch": 0.21919704660821412,
+      "grad_norm": 0.33625394105911255,
+      "learning_rate": 0.00019147101238348326,
+      "loss": 0.995,
+      "step": 3800
+    },
+    {
+      "epoch": 0.21948546377480388,
+      "grad_norm": 0.4042999744415283,
+      "learning_rate": 0.00019143028051233873,
+      "loss": 0.9512,
+      "step": 3805
+    },
+    {
+      "epoch": 0.21977388094139363,
+      "grad_norm": 0.277295857667923,
+      "learning_rate": 0.00019138945596477994,
+      "loss": 0.9281,
+      "step": 3810
+    },
+    {
+      "epoch": 0.2200622981079834,
+      "grad_norm": 0.3070628046989441,
+      "learning_rate": 0.0001913485387821877,
+      "loss": 0.938,
+      "step": 3815
+    },
+    {
+      "epoch": 0.22035071527457314,
+      "grad_norm": 0.2898661494255066,
+      "learning_rate": 0.00019130752900603702,
+      "loss": 1.0103,
+      "step": 3820
+    },
+    {
+      "epoch": 0.2206391324411629,
+      "grad_norm": 0.2981604039669037,
+      "learning_rate": 0.00019126642667789654,
+      "loss": 0.9787,
+      "step": 3825
+    },
+    {
+      "epoch": 0.22092754960775265,
+      "grad_norm": 0.2816370129585266,
+      "learning_rate": 0.00019122523183942879,
+      "loss": 1.039,
+      "step": 3830
+    },
+    {
+      "epoch": 0.2212159667743424,
+      "grad_norm": 0.306822806596756,
+      "learning_rate": 0.00019118394453239006,
+      "loss": 1.0161,
+      "step": 3835
+    },
+    {
+      "epoch": 0.22150438394093216,
+      "grad_norm": 0.29982468485832214,
+      "learning_rate": 0.00019114256479863038,
+      "loss": 0.959,
+      "step": 3840
+    },
+    {
+      "epoch": 0.22179280110752192,
+      "grad_norm": 0.2966124713420868,
+      "learning_rate": 0.00019110109268009347,
+      "loss": 0.9996,
+      "step": 3845
+    },
+    {
+      "epoch": 0.22208121827411167,
+      "grad_norm": 0.3192947208881378,
+      "learning_rate": 0.00019105952821881668,
+      "loss": 1.0132,
+      "step": 3850
+    },
+    {
+      "epoch": 0.22236963544070143,
+      "grad_norm": 0.2927592694759369,
+      "learning_rate": 0.00019101787145693098,
+      "loss": 0.9738,
+      "step": 3855
+    },
+    {
+      "epoch": 0.22265805260729118,
+      "grad_norm": 0.2782720923423767,
+      "learning_rate": 0.00019097612243666086,
+      "loss": 0.9538,
+      "step": 3860
+    },
+    {
+      "epoch": 0.22294646977388094,
+      "grad_norm": 0.32348090410232544,
+      "learning_rate": 0.0001909342812003244,
+      "loss": 0.9593,
+      "step": 3865
+    },
+    {
+      "epoch": 0.2232348869404707,
+      "grad_norm": 0.32968342304229736,
+      "learning_rate": 0.00019089234779033306,
+      "loss": 0.9899,
+      "step": 3870
+    },
+    {
+      "epoch": 0.22352330410706045,
+      "grad_norm": 0.29580381512641907,
+      "learning_rate": 0.00019085032224919177,
+      "loss": 0.9515,
+      "step": 3875
+    },
+    {
+      "epoch": 0.2238117212736502,
+      "grad_norm": 0.27999478578567505,
+      "learning_rate": 0.00019080820461949886,
+      "loss": 0.9596,
+      "step": 3880
+    },
+    {
+      "epoch": 0.22410013844023996,
+      "grad_norm": 0.31083959341049194,
+      "learning_rate": 0.00019076599494394602,
+      "loss": 1.0069,
+      "step": 3885
+    },
+    {
+      "epoch": 0.22438855560682971,
+      "grad_norm": 0.2649812400341034,
+      "learning_rate": 0.00019072369326531824,
+      "loss": 0.9238,
+      "step": 3890
+    },
+    {
+      "epoch": 0.22467697277341947,
+      "grad_norm": 0.2908613383769989,
+      "learning_rate": 0.00019068129962649365,
+      "loss": 0.9745,
+      "step": 3895
+    },
+    {
+      "epoch": 0.22496538994000922,
+      "grad_norm": 0.2983262538909912,
+      "learning_rate": 0.00019063881407044373,
+      "loss": 0.9155,
+      "step": 3900
+    },
+    {
+      "epoch": 0.22525380710659898,
+      "grad_norm": 0.3074907660484314,
+      "learning_rate": 0.00019059623664023311,
+      "loss": 1.0384,
+      "step": 3905
+    },
+    {
+      "epoch": 0.22554222427318874,
+      "grad_norm": 0.3024677336215973,
+      "learning_rate": 0.00019055356737901952,
+      "loss": 1.0626,
+      "step": 3910
+    },
+    {
+      "epoch": 0.2258306414397785,
+      "grad_norm": 0.324719101190567,
+      "learning_rate": 0.00019051080633005372,
+      "loss": 0.9757,
+      "step": 3915
+    },
+    {
+      "epoch": 0.22611905860636825,
+      "grad_norm": 0.31149742007255554,
+      "learning_rate": 0.00019046795353667965,
+      "loss": 1.0294,
+      "step": 3920
+    },
+    {
+      "epoch": 0.226407475772958,
+      "grad_norm": 0.3361373543739319,
+      "learning_rate": 0.00019042500904233408,
+      "loss": 0.949,
+      "step": 3925
+    },
+    {
+      "epoch": 0.22669589293954776,
+      "grad_norm": 0.3346847593784332,
+      "learning_rate": 0.00019038197289054684,
+      "loss": 0.9531,
+      "step": 3930
+    },
+    {
+      "epoch": 0.2269843101061375,
+      "grad_norm": 0.3011166453361511,
+      "learning_rate": 0.00019033884512494064,
+      "loss": 0.9515,
+      "step": 3935
+    },
+    {
+      "epoch": 0.22727272727272727,
+      "grad_norm": 0.350754052400589,
+      "learning_rate": 0.00019029562578923106,
+      "loss": 0.9878,
+      "step": 3940
+    },
+    {
+      "epoch": 0.22756114443931702,
+      "grad_norm": 0.3115714192390442,
+      "learning_rate": 0.00019025231492722643,
+      "loss": 0.9914,
+      "step": 3945
+    },
+    {
+      "epoch": 0.22784956160590678,
+      "grad_norm": 0.29641732573509216,
+      "learning_rate": 0.000190208912582828,
+      "loss": 0.9508,
+      "step": 3950
+    },
+    {
+      "epoch": 0.22813797877249653,
+      "grad_norm": 0.3013533353805542,
+      "learning_rate": 0.0001901654188000296,
+      "loss": 0.9551,
+      "step": 3955
+    },
+    {
+      "epoch": 0.22842639593908629,
+      "grad_norm": 0.3072235584259033,
+      "learning_rate": 0.0001901218336229178,
+      "loss": 1.0324,
+      "step": 3960
+    },
+    {
+      "epoch": 0.22871481310567604,
+      "grad_norm": 0.2967047691345215,
+      "learning_rate": 0.00019007815709567183,
+      "loss": 0.9767,
+      "step": 3965
+    },
+    {
+      "epoch": 0.2290032302722658,
+      "grad_norm": 0.3344308137893677,
+      "learning_rate": 0.0001900343892625635,
+      "loss": 1.053,
+      "step": 3970
+    },
+    {
+      "epoch": 0.22929164743885555,
+      "grad_norm": 0.279471218585968,
+      "learning_rate": 0.00018999053016795719,
+      "loss": 0.9597,
+      "step": 3975
+    },
+    {
+      "epoch": 0.2295800646054453,
+      "grad_norm": 0.3151692748069763,
+      "learning_rate": 0.00018994657985630972,
+      "loss": 0.981,
+      "step": 3980
+    },
+    {
+      "epoch": 0.22986848177203506,
+      "grad_norm": 0.29757049679756165,
+      "learning_rate": 0.00018990253837217042,
+      "loss": 0.9948,
+      "step": 3985
+    },
+    {
+      "epoch": 0.23015689893862482,
+      "grad_norm": 0.29068654775619507,
+      "learning_rate": 0.00018985840576018107,
+      "loss": 0.9492,
+      "step": 3990
+    },
+    {
+      "epoch": 0.23044531610521457,
+      "grad_norm": 0.29149913787841797,
+      "learning_rate": 0.00018981418206507575,
+      "loss": 0.9603,
+      "step": 3995
+    },
+    {
+      "epoch": 0.23073373327180433,
+      "grad_norm": 0.2850954830646515,
+      "learning_rate": 0.00018976986733168093,
+      "loss": 1.0198,
+      "step": 4000
+    },
+    {
+      "epoch": 0.23102215043839408,
+      "grad_norm": 0.3014662563800812,
+      "learning_rate": 0.00018972546160491528,
+      "loss": 1.0628,
+      "step": 4005
+    },
+    {
+      "epoch": 0.23131056760498384,
+      "grad_norm": 0.29958969354629517,
+      "learning_rate": 0.00018968096492978976,
+      "loss": 0.9891,
+      "step": 4010
+    },
+    {
+      "epoch": 0.2315989847715736,
+      "grad_norm": 0.29551297426223755,
+      "learning_rate": 0.0001896363773514075,
+      "loss": 0.9811,
+      "step": 4015
+    },
+    {
+      "epoch": 0.23188740193816337,
+      "grad_norm": 0.30971017479896545,
+      "learning_rate": 0.0001895916989149638,
+      "loss": 1.0459,
+      "step": 4020
+    },
+    {
+      "epoch": 0.23217581910475313,
+      "grad_norm": 0.3282906115055084,
+      "learning_rate": 0.000189546929665746,
+      "loss": 1.0698,
+      "step": 4025
+    },
+    {
+      "epoch": 0.23246423627134288,
+      "grad_norm": 0.3017507493495941,
+      "learning_rate": 0.00018950206964913355,
+      "loss": 0.9867,
+      "step": 4030
+    },
+    {
+      "epoch": 0.23275265343793264,
+      "grad_norm": 0.34195518493652344,
+      "learning_rate": 0.0001894571189105979,
+      "loss": 0.9247,
+      "step": 4035
+    },
+    {
+      "epoch": 0.2330410706045224,
+      "grad_norm": 0.33378762006759644,
+      "learning_rate": 0.00018941207749570237,
+      "loss": 1.0384,
+      "step": 4040
+    },
+    {
+      "epoch": 0.23332948777111215,
+      "grad_norm": 0.325948029756546,
+      "learning_rate": 0.00018936694545010232,
+      "loss": 0.9698,
+      "step": 4045
+    },
+    {
+      "epoch": 0.2336179049377019,
+      "grad_norm": 0.2848076820373535,
+      "learning_rate": 0.0001893217228195449,
+      "loss": 1.0036,
+      "step": 4050
+    },
+    {
+      "epoch": 0.23390632210429166,
+      "grad_norm": 0.30070775747299194,
+      "learning_rate": 0.0001892764096498691,
+      "loss": 1.0397,
+      "step": 4055
+    },
+    {
+      "epoch": 0.2341947392708814,
+      "grad_norm": 0.3177594244480133,
+      "learning_rate": 0.00018923100598700561,
+      "loss": 1.0136,
+      "step": 4060
+    },
+    {
+      "epoch": 0.23448315643747117,
+      "grad_norm": 0.31077563762664795,
+      "learning_rate": 0.00018918551187697703,
+      "loss": 0.9457,
+      "step": 4065
+    },
+    {
+      "epoch": 0.23477157360406092,
+      "grad_norm": 0.2947135865688324,
+      "learning_rate": 0.00018913992736589746,
+      "loss": 0.9988,
+      "step": 4070
+    },
+    {
+      "epoch": 0.23505999077065068,
+      "grad_norm": 0.26377373933792114,
+      "learning_rate": 0.00018909425249997267,
+      "loss": 0.9891,
+      "step": 4075
+    },
+    {
+      "epoch": 0.23534840793724043,
+      "grad_norm": 0.3427537977695465,
+      "learning_rate": 0.0001890484873255001,
+      "loss": 0.993,
+      "step": 4080
+    },
+    {
+      "epoch": 0.2356368251038302,
+      "grad_norm": 0.28606218099594116,
+      "learning_rate": 0.00018900263188886864,
+      "loss": 0.9609,
+      "step": 4085
+    },
+    {
+      "epoch": 0.23592524227041994,
+      "grad_norm": 0.31335821747779846,
+      "learning_rate": 0.00018895668623655873,
+      "loss": 0.9278,
+      "step": 4090
+    },
+    {
+      "epoch": 0.2362136594370097,
+      "grad_norm": 0.3148699104785919,
+      "learning_rate": 0.00018891065041514224,
+      "loss": 0.9486,
+      "step": 4095
+    },
+    {
+      "epoch": 0.23650207660359945,
+      "grad_norm": 0.30335333943367004,
+      "learning_rate": 0.0001888645244712824,
+      "loss": 0.9604,
+      "step": 4100
+    },
+    {
+      "epoch": 0.2367904937701892,
+      "grad_norm": 0.2990083396434784,
+      "learning_rate": 0.0001888183084517338,
+      "loss": 0.9277,
+      "step": 4105
+    },
+    {
+      "epoch": 0.23707891093677896,
+      "grad_norm": 0.3039418160915375,
+      "learning_rate": 0.00018877200240334236,
+      "loss": 1.0381,
+      "step": 4110
+    },
+    {
+      "epoch": 0.23736732810336872,
+      "grad_norm": 0.3109247386455536,
+      "learning_rate": 0.0001887256063730453,
+      "loss": 1.0214,
+      "step": 4115
+    },
+    {
+      "epoch": 0.23765574526995847,
+      "grad_norm": 0.29135051369667053,
+      "learning_rate": 0.00018867912040787096,
+      "loss": 1.0111,
+      "step": 4120
+    },
+    {
+      "epoch": 0.23794416243654823,
+      "grad_norm": 0.29950061440467834,
+      "learning_rate": 0.0001886325445549389,
+      "loss": 0.9879,
+      "step": 4125
+    },
+    {
+      "epoch": 0.23823257960313798,
+      "grad_norm": 0.3028976619243622,
+      "learning_rate": 0.00018858587886145975,
+      "loss": 0.9808,
+      "step": 4130
+    },
+    {
+      "epoch": 0.23852099676972774,
+      "grad_norm": 0.2960391342639923,
+      "learning_rate": 0.0001885391233747352,
+      "loss": 0.9033,
+      "step": 4135
+    },
+    {
+      "epoch": 0.2388094139363175,
+      "grad_norm": 0.28858163952827454,
+      "learning_rate": 0.00018849227814215805,
+      "loss": 0.8774,
+      "step": 4140
+    },
+    {
+      "epoch": 0.23909783110290725,
+      "grad_norm": 0.3187437653541565,
+      "learning_rate": 0.00018844534321121195,
+      "loss": 1.032,
+      "step": 4145
+    },
+    {
+      "epoch": 0.239386248269497,
+      "grad_norm": 0.30050045251846313,
+      "learning_rate": 0.00018839831862947152,
+      "loss": 0.9785,
+      "step": 4150
+    },
+    {
+      "epoch": 0.23967466543608676,
+      "grad_norm": 0.3172016739845276,
+      "learning_rate": 0.0001883512044446023,
+      "loss": 1.0049,
+      "step": 4155
+    },
+    {
+      "epoch": 0.23996308260267651,
+      "grad_norm": 0.2758901119232178,
+      "learning_rate": 0.00018830400070436057,
+      "loss": 0.8758,
+      "step": 4160
+    },
+    {
+      "epoch": 0.24025149976926627,
+      "grad_norm": 0.31265828013420105,
+      "learning_rate": 0.00018825670745659345,
+      "loss": 0.9875,
+      "step": 4165
+    },
+    {
+      "epoch": 0.24053991693585602,
+      "grad_norm": 0.2935623526573181,
+      "learning_rate": 0.00018820932474923873,
+      "loss": 0.9738,
+      "step": 4170
+    },
+    {
+      "epoch": 0.24082833410244578,
+      "grad_norm": 0.31961116194725037,
+      "learning_rate": 0.00018816185263032496,
+      "loss": 0.985,
+      "step": 4175
+    },
+    {
+      "epoch": 0.24111675126903553,
+      "grad_norm": 0.302990198135376,
+      "learning_rate": 0.00018811429114797123,
+      "loss": 0.9693,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2414051684356253,
+      "grad_norm": 0.3246656358242035,
+      "learning_rate": 0.00018806664035038727,
+      "loss": 0.9715,
+      "step": 4185
+    },
+    {
+      "epoch": 0.24169358560221504,
+      "grad_norm": 0.30691856145858765,
+      "learning_rate": 0.00018801890028587333,
+      "loss": 0.9967,
+      "step": 4190
+    },
+    {
+      "epoch": 0.2419820027688048,
+      "grad_norm": 0.3090788424015045,
+      "learning_rate": 0.00018797107100282015,
+      "loss": 1.0014,
+      "step": 4195
+    },
+    {
+      "epoch": 0.24227041993539455,
+      "grad_norm": 0.28349974751472473,
+      "learning_rate": 0.0001879231525497089,
+      "loss": 0.9426,
+      "step": 4200
+    },
+    {
+      "epoch": 0.2425588371019843,
+      "grad_norm": 0.3226814270019531,
+      "learning_rate": 0.00018787514497511104,
+      "loss": 1.0058,
+      "step": 4205
+    },
+    {
+      "epoch": 0.24284725426857406,
+      "grad_norm": 0.3090320825576782,
+      "learning_rate": 0.0001878270483276886,
+      "loss": 0.9565,
+      "step": 4210
+    },
+    {
+      "epoch": 0.24313567143516382,
+      "grad_norm": 0.29639485478401184,
+      "learning_rate": 0.00018777886265619365,
+      "loss": 0.9994,
+      "step": 4215
+    },
+    {
+      "epoch": 0.24342408860175357,
+      "grad_norm": 0.30157527327537537,
+      "learning_rate": 0.00018773058800946858,
+      "loss": 0.9349,
+      "step": 4220
+    },
+    {
+      "epoch": 0.24371250576834333,
+      "grad_norm": 0.2847401797771454,
+      "learning_rate": 0.0001876822244364461,
+      "loss": 0.9882,
+      "step": 4225
+    },
+    {
+      "epoch": 0.24400092293493308,
+      "grad_norm": 0.2939082086086273,
+      "learning_rate": 0.00018763377198614887,
+      "loss": 0.9545,
+      "step": 4230
+    },
+    {
+      "epoch": 0.24428934010152284,
+      "grad_norm": 0.30300137400627136,
+      "learning_rate": 0.00018758523070768973,
+      "loss": 0.9069,
+      "step": 4235
+    },
+    {
+      "epoch": 0.2445777572681126,
+      "grad_norm": 0.2980591952800751,
+      "learning_rate": 0.00018753660065027152,
+      "loss": 0.9992,
+      "step": 4240
+    },
+    {
+      "epoch": 0.24486617443470235,
+      "grad_norm": 0.31828731298446655,
+      "learning_rate": 0.00018748788186318712,
+      "loss": 0.9711,
+      "step": 4245
+    },
+    {
+      "epoch": 0.2451545916012921,
+      "grad_norm": 0.31123876571655273,
+      "learning_rate": 0.00018743907439581933,
+      "loss": 0.9393,
+      "step": 4250
+    },
+    {
+      "epoch": 0.24544300876788186,
+      "grad_norm": 0.29812201857566833,
+      "learning_rate": 0.00018739017829764082,
+      "loss": 0.9653,
+      "step": 4255
+    },
+    {
+      "epoch": 0.24573142593447161,
+      "grad_norm": 0.33146384358406067,
+      "learning_rate": 0.0001873411936182141,
+      "loss": 0.9758,
+      "step": 4260
+    },
+    {
+      "epoch": 0.24601984310106137,
+      "grad_norm": 0.3051407039165497,
+      "learning_rate": 0.0001872921204071915,
+      "loss": 1.0172,
+      "step": 4265
+    },
+    {
+      "epoch": 0.24630826026765112,
+      "grad_norm": 0.30195561051368713,
+      "learning_rate": 0.000187242958714315,
+      "loss": 0.9868,
+      "step": 4270
+    },
+    {
+      "epoch": 0.24659667743424088,
+      "grad_norm": 0.2948630750179291,
+      "learning_rate": 0.00018719370858941644,
+      "loss": 0.9771,
+      "step": 4275
+    },
+    {
+      "epoch": 0.24688509460083063,
+      "grad_norm": 0.3198891282081604,
+      "learning_rate": 0.00018714437008241709,
+      "loss": 1.04,
+      "step": 4280
+    },
+    {
+      "epoch": 0.2471735117674204,
+      "grad_norm": 0.3208988606929779,
+      "learning_rate": 0.000187094943243328,
+      "loss": 0.9666,
+      "step": 4285
+    },
+    {
+      "epoch": 0.24746192893401014,
+      "grad_norm": 0.3209957182407379,
+      "learning_rate": 0.00018704542812224956,
+      "loss": 0.9374,
+      "step": 4290
+    },
+    {
+      "epoch": 0.2477503461005999,
+      "grad_norm": 0.3006252348423004,
+      "learning_rate": 0.00018699582476937185,
+      "loss": 0.9798,
+      "step": 4295
+    },
+    {
+      "epoch": 0.24803876326718965,
+      "grad_norm": 0.3490176796913147,
+      "learning_rate": 0.00018694613323497422,
+      "loss": 1.0087,
+      "step": 4300
+    },
+    {
+      "epoch": 0.2483271804337794,
+      "grad_norm": 0.3163358271121979,
+      "learning_rate": 0.0001868963535694255,
+      "loss": 1.043,
+      "step": 4305
+    },
+    {
+      "epoch": 0.24861559760036916,
+      "grad_norm": 0.298026442527771,
+      "learning_rate": 0.0001868464858231838,
+      "loss": 1.0404,
+      "step": 4310
+    },
+    {
+      "epoch": 0.24890401476695892,
+      "grad_norm": 0.3209499418735504,
+      "learning_rate": 0.00018679653004679655,
+      "loss": 0.9687,
+      "step": 4315
+    },
+    {
+      "epoch": 0.24919243193354867,
+      "grad_norm": 0.3158719539642334,
+      "learning_rate": 0.0001867464862909004,
+      "loss": 0.9548,
+      "step": 4320
+    },
+    {
+      "epoch": 0.24948084910013843,
+      "grad_norm": 0.28783926367759705,
+      "learning_rate": 0.00018669635460622107,
+      "loss": 0.9042,
+      "step": 4325
+    },
+    {
+      "epoch": 0.24976926626672818,
+      "grad_norm": 0.2980654835700989,
+      "learning_rate": 0.00018664613504357366,
+      "loss": 0.97,
+      "step": 4330
+    },
+    {
+      "epoch": 0.25005768343331797,
+      "grad_norm": 0.2950812876224518,
+      "learning_rate": 0.00018659582765386204,
+      "loss": 1.0261,
+      "step": 4335
+    },
+    {
+      "epoch": 0.2503461005999077,
+      "grad_norm": 0.2984694540500641,
+      "learning_rate": 0.0001865454324880794,
+      "loss": 0.9859,
+      "step": 4340
+    },
+    {
+      "epoch": 0.2506345177664975,
+      "grad_norm": 0.3119395971298218,
+      "learning_rate": 0.00018649494959730765,
+      "loss": 1.03,
+      "step": 4345
+    },
+    {
+      "epoch": 0.2509229349330872,
+      "grad_norm": 0.3380660116672516,
+      "learning_rate": 0.00018644437903271778,
+      "loss": 1.0373,
+      "step": 4350
+    },
+    {
+      "epoch": 0.251211352099677,
+      "grad_norm": 0.310693621635437,
+      "learning_rate": 0.0001863937208455696,
+      "loss": 0.977,
+      "step": 4355
+    },
+    {
+      "epoch": 0.2514997692662667,
+      "grad_norm": 0.3119440972805023,
+      "learning_rate": 0.00018634297508721167,
+      "loss": 0.9384,
+      "step": 4360
+    },
+    {
+      "epoch": 0.2517881864328565,
+      "grad_norm": 0.3072355389595032,
+      "learning_rate": 0.00018629214180908144,
+      "loss": 1.0126,
+      "step": 4365
+    },
+    {
+      "epoch": 0.2520766035994462,
+      "grad_norm": 0.3056802749633789,
+      "learning_rate": 0.00018624122106270506,
+      "loss": 0.9496,
+      "step": 4370
+    },
+    {
+      "epoch": 0.252365020766036,
+      "grad_norm": 0.34883102774620056,
+      "learning_rate": 0.00018619021289969717,
+      "loss": 0.9626,
+      "step": 4375
+    },
+    {
+      "epoch": 0.25265343793262574,
+      "grad_norm": 0.2876664698123932,
+      "learning_rate": 0.00018613911737176125,
+      "loss": 0.9452,
+      "step": 4380
+    },
+    {
+      "epoch": 0.2529418550992155,
+      "grad_norm": 0.3051524758338928,
+      "learning_rate": 0.00018608793453068914,
+      "loss": 0.996,
+      "step": 4385
+    },
+    {
+      "epoch": 0.25323027226580525,
+      "grad_norm": 0.2734985053539276,
+      "learning_rate": 0.0001860366644283613,
+      "loss": 0.9395,
+      "step": 4390
+    },
+    {
+      "epoch": 0.25351868943239503,
+      "grad_norm": 0.30163031816482544,
+      "learning_rate": 0.00018598530711674667,
+      "loss": 0.9608,
+      "step": 4395
+    },
+    {
+      "epoch": 0.25380710659898476,
+      "grad_norm": 0.2709837555885315,
+      "learning_rate": 0.00018593386264790243,
+      "loss": 0.9611,
+      "step": 4400
+    },
+    {
+      "epoch": 0.25409552376557454,
+      "grad_norm": 0.3166120946407318,
+      "learning_rate": 0.00018588233107397429,
+      "loss": 0.8999,
+      "step": 4405
+    },
+    {
+      "epoch": 0.25438394093216427,
+      "grad_norm": 0.2956826090812683,
+      "learning_rate": 0.00018583071244719607,
+      "loss": 0.9097,
+      "step": 4410
+    },
+    {
+      "epoch": 0.25467235809875405,
+      "grad_norm": 0.31426194310188293,
+      "learning_rate": 0.00018577900681989,
+      "loss": 0.941,
+      "step": 4415
+    },
+    {
+      "epoch": 0.2549607752653438,
+      "grad_norm": 0.2746027410030365,
+      "learning_rate": 0.0001857272142444664,
+      "loss": 0.9168,
+      "step": 4420
+    },
+    {
+      "epoch": 0.25524919243193356,
+      "grad_norm": 0.2936379015445709,
+      "learning_rate": 0.00018567533477342377,
+      "loss": 0.9536,
+      "step": 4425
+    },
+    {
+      "epoch": 0.2555376095985233,
+      "grad_norm": 0.31358134746551514,
+      "learning_rate": 0.0001856233684593486,
+      "loss": 0.9569,
+      "step": 4430
+    },
+    {
+      "epoch": 0.25582602676511307,
+      "grad_norm": 0.31144851446151733,
+      "learning_rate": 0.0001855713153549155,
+      "loss": 0.9447,
+      "step": 4435
+    },
+    {
+      "epoch": 0.2561144439317028,
+      "grad_norm": 0.31088197231292725,
+      "learning_rate": 0.00018551917551288706,
+      "loss": 0.9873,
+      "step": 4440
+    },
+    {
+      "epoch": 0.2564028610982926,
+      "grad_norm": 0.31137150526046753,
+      "learning_rate": 0.0001854669489861137,
+      "loss": 0.9769,
+      "step": 4445
+    },
+    {
+      "epoch": 0.2566912782648823,
+      "grad_norm": 0.3470550775527954,
+      "learning_rate": 0.0001854146358275338,
+      "loss": 0.9824,
+      "step": 4450
+    },
+    {
+      "epoch": 0.2569796954314721,
+      "grad_norm": 0.305550754070282,
+      "learning_rate": 0.00018536223609017348,
+      "loss": 1.0573,
+      "step": 4455
+    },
+    {
+      "epoch": 0.2572681125980618,
+      "grad_norm": 0.30111902952194214,
+      "learning_rate": 0.00018530974982714667,
+      "loss": 0.9919,
+      "step": 4460
+    },
+    {
+      "epoch": 0.2575565297646516,
+      "grad_norm": 0.29458123445510864,
+      "learning_rate": 0.00018525717709165498,
+      "loss": 1.0249,
+      "step": 4465
+    },
+    {
+      "epoch": 0.2578449469312413,
+      "grad_norm": 0.2974050045013428,
+      "learning_rate": 0.0001852045179369877,
+      "loss": 1.0155,
+      "step": 4470
+    },
+    {
+      "epoch": 0.2581333640978311,
+      "grad_norm": 0.27646365761756897,
+      "learning_rate": 0.00018515177241652163,
+      "loss": 0.9477,
+      "step": 4475
+    },
+    {
+      "epoch": 0.25842178126442084,
+      "grad_norm": 0.3065283000469208,
+      "learning_rate": 0.0001850989405837212,
+      "loss": 0.9789,
+      "step": 4480
+    },
+    {
+      "epoch": 0.2587101984310106,
+      "grad_norm": 0.31208351254463196,
+      "learning_rate": 0.00018504602249213838,
+      "loss": 1.0209,
+      "step": 4485
+    },
+    {
+      "epoch": 0.25899861559760035,
+      "grad_norm": 0.27680978178977966,
+      "learning_rate": 0.0001849930181954124,
+      "loss": 0.9937,
+      "step": 4490
+    },
+    {
+      "epoch": 0.25928703276419013,
+      "grad_norm": 0.35537493228912354,
+      "learning_rate": 0.00018493992774727005,
+      "loss": 1.019,
+      "step": 4495
+    },
+    {
+      "epoch": 0.25957544993077986,
+      "grad_norm": 0.2992296814918518,
+      "learning_rate": 0.00018488675120152532,
+      "loss": 0.9409,
+      "step": 4500
+    },
+    {
+      "epoch": 0.25986386709736964,
+      "grad_norm": 0.2907122075557709,
+      "learning_rate": 0.00018483348861207953,
+      "loss": 0.9925,
+      "step": 4505
+    },
+    {
+      "epoch": 0.26015228426395937,
+      "grad_norm": 0.3083319664001465,
+      "learning_rate": 0.00018478014003292116,
+      "loss": 0.9494,
+      "step": 4510
+    },
+    {
+      "epoch": 0.26044070143054915,
+      "grad_norm": 0.2940841615200043,
+      "learning_rate": 0.00018472670551812596,
+      "loss": 1.0234,
+      "step": 4515
+    },
+    {
+      "epoch": 0.2607291185971389,
+      "grad_norm": 0.3526857793331146,
+      "learning_rate": 0.0001846731851218567,
+      "loss": 1.0047,
+      "step": 4520
+    },
+    {
+      "epoch": 0.26101753576372866,
+      "grad_norm": 0.2867284119129181,
+      "learning_rate": 0.00018461957889836324,
+      "loss": 0.953,
+      "step": 4525
+    },
+    {
+      "epoch": 0.2613059529303184,
+      "grad_norm": 0.28662440180778503,
+      "learning_rate": 0.00018456588690198236,
+      "loss": 0.9734,
+      "step": 4530
+    },
+    {
+      "epoch": 0.26159437009690817,
+      "grad_norm": 0.2874925136566162,
+      "learning_rate": 0.0001845121091871379,
+      "loss": 1.012,
+      "step": 4535
+    },
+    {
+      "epoch": 0.2618827872634979,
+      "grad_norm": 0.30890873074531555,
+      "learning_rate": 0.0001844582458083405,
+      "loss": 0.9317,
+      "step": 4540
+    },
+    {
+      "epoch": 0.2621712044300877,
+      "grad_norm": 0.2991410791873932,
+      "learning_rate": 0.0001844042968201877,
+      "loss": 0.9488,
+      "step": 4545
+    },
+    {
+      "epoch": 0.26245962159667746,
+      "grad_norm": 0.29846030473709106,
+      "learning_rate": 0.0001843502622773637,
+      "loss": 0.9722,
+      "step": 4550
+    },
+    {
+      "epoch": 0.2627480387632672,
+      "grad_norm": 0.30086445808410645,
+      "learning_rate": 0.0001842961422346396,
+      "loss": 0.9901,
+      "step": 4555
+    },
+    {
+      "epoch": 0.26303645592985697,
+      "grad_norm": 0.3020675778388977,
+      "learning_rate": 0.00018424193674687297,
+      "loss": 1.0275,
+      "step": 4560
+    },
+    {
+      "epoch": 0.2633248730964467,
+      "grad_norm": 0.3111262023448944,
+      "learning_rate": 0.00018418764586900817,
+      "loss": 0.9977,
+      "step": 4565
+    },
+    {
+      "epoch": 0.2636132902630365,
+      "grad_norm": 0.3167891204357147,
+      "learning_rate": 0.00018413326965607593,
+      "loss": 1.0266,
+      "step": 4570
+    },
+    {
+      "epoch": 0.2639017074296262,
+      "grad_norm": 0.28536850214004517,
+      "learning_rate": 0.00018407880816319363,
+      "loss": 0.9475,
+      "step": 4575
+    },
+    {
+      "epoch": 0.264190124596216,
+      "grad_norm": 0.30811807513237,
+      "learning_rate": 0.00018402426144556504,
+      "loss": 0.9549,
+      "step": 4580
+    },
+    {
+      "epoch": 0.2644785417628057,
+      "grad_norm": 0.2881765365600586,
+      "learning_rate": 0.0001839696295584803,
+      "loss": 1.0276,
+      "step": 4585
+    },
+    {
+      "epoch": 0.2647669589293955,
+      "grad_norm": 0.3339601159095764,
+      "learning_rate": 0.0001839149125573159,
+      "loss": 0.9772,
+      "step": 4590
+    },
+    {
+      "epoch": 0.26505537609598523,
+      "grad_norm": 0.2897505760192871,
+      "learning_rate": 0.0001838601104975346,
+      "loss": 1.0897,
+      "step": 4595
+    },
+    {
+      "epoch": 0.265343793262575,
+      "grad_norm": 0.3119150400161743,
+      "learning_rate": 0.00018380522343468532,
+      "loss": 0.9842,
+      "step": 4600
+    },
+    {
+      "epoch": 0.265343793262575,
+      "step": 4600,
+      "total_flos": 3.2343958172802744e+18,
+      "train_loss": 0.0,
+      "train_runtime": 0.0325,
+      "train_samples_per_second": 4262129.534,
+      "train_steps_per_second": 133192.508
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 4334,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.2343958172802744e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}