diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6489 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.265343793262575, + "eval_steps": 500, + "global_step": 4600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.7683433317951084e-05, + "grad_norm": 0.3952319025993347, + "learning_rate": 1.1534025374855825e-07, + "loss": 1.182, + "step": 1 + }, + { + "epoch": 0.0002884171665897554, + "grad_norm": 0.3334461748600006, + "learning_rate": 5.767012687427913e-07, + "loss": 1.0887, + "step": 5 + }, + { + "epoch": 0.0005768343331795108, + "grad_norm": 0.41704559326171875, + "learning_rate": 1.1534025374855826e-06, + "loss": 1.2132, + "step": 10 + }, + { + "epoch": 0.0008652514997692663, + "grad_norm": 0.4982852637767792, + "learning_rate": 1.7301038062283738e-06, + "loss": 1.1888, + "step": 15 + }, + { + "epoch": 0.0011536686663590216, + "grad_norm": 0.3702298104763031, + "learning_rate": 2.3068050749711653e-06, + "loss": 1.2105, + "step": 20 + }, + { + "epoch": 0.001442085832948777, + "grad_norm": 0.3640645444393158, + "learning_rate": 2.8835063437139563e-06, + "loss": 1.1714, + "step": 25 + }, + { + "epoch": 0.0017305029995385325, + "grad_norm": 0.31508558988571167, + "learning_rate": 3.4602076124567477e-06, + "loss": 1.0438, + "step": 30 + }, + { + "epoch": 0.0020189201661282878, + "grad_norm": 0.3910152018070221, + "learning_rate": 4.036908881199539e-06, + "loss": 1.212, + "step": 35 + }, + { + "epoch": 0.0023073373327180432, + "grad_norm": 0.32711583375930786, + "learning_rate": 4.6136101499423305e-06, + "loss": 1.1552, + "step": 40 + }, + { + "epoch": 0.0025957544993077987, + "grad_norm": 0.37455540895462036, + "learning_rate": 5.190311418685121e-06, + "loss": 1.1355, + "step": 45 + }, + { + "epoch": 0.002884171665897554, + "grad_norm": 0.32155269384384155, + "learning_rate": 5.7670126874279126e-06, + "loss": 1.1375, + "step": 50 + }, + { + "epoch": 0.0031725888324873096, + "grad_norm": 0.29815641045570374, + "learning_rate": 6.3437139561707036e-06, + "loss": 1.1193, + "step": 55 + }, + { + "epoch": 0.003461005999077065, + "grad_norm": 0.39492201805114746, + "learning_rate": 6.920415224913495e-06, + "loss": 1.1053, + "step": 60 + }, + { + "epoch": 0.0037494231656668205, + "grad_norm": 0.3298701345920563, + "learning_rate": 7.497116493656286e-06, + "loss": 1.107, + "step": 65 + }, + { + "epoch": 0.0040378403322565756, + "grad_norm": 0.3114672005176544, + "learning_rate": 8.073817762399077e-06, + "loss": 1.0677, + "step": 70 + }, + { + "epoch": 0.0043262574988463314, + "grad_norm": 0.3159383535385132, + "learning_rate": 8.650519031141868e-06, + "loss": 1.0959, + "step": 75 + }, + { + "epoch": 0.0046146746654360865, + "grad_norm": 0.2858622074127197, + "learning_rate": 9.227220299884661e-06, + "loss": 1.0435, + "step": 80 + }, + { + "epoch": 0.004903091832025842, + "grad_norm": 0.3337515890598297, + "learning_rate": 9.803921568627451e-06, + "loss": 0.9889, + "step": 85 + }, + { + "epoch": 0.005191508998615597, + "grad_norm": 0.3027825951576233, + "learning_rate": 1.0380622837370241e-05, + "loss": 1.1145, + "step": 90 + }, + { + "epoch": 0.005479926165205353, + "grad_norm": 0.34131115674972534, + "learning_rate": 1.0957324106113035e-05, + "loss": 1.0596, + "step": 95 + }, + { + "epoch": 0.005768343331795108, + "grad_norm": 0.3263566792011261, + "learning_rate": 1.1534025374855825e-05, + "loss": 0.9887, + "step": 100 + }, + { + "epoch": 0.006056760498384864, + "grad_norm": 0.325528085231781, + "learning_rate": 1.2110726643598615e-05, + "loss": 1.0143, + "step": 105 + }, + { + "epoch": 0.006345177664974619, + "grad_norm": 0.3773256242275238, + "learning_rate": 1.2687427912341407e-05, + "loss": 1.0, + "step": 110 + }, + { + "epoch": 0.006633594831564375, + "grad_norm": 0.2968287765979767, + "learning_rate": 1.3264129181084197e-05, + "loss": 0.9572, + "step": 115 + }, + { + "epoch": 0.00692201199815413, + "grad_norm": 0.29874077439308167, + "learning_rate": 1.384083044982699e-05, + "loss": 1.0344, + "step": 120 + }, + { + "epoch": 0.007210429164743885, + "grad_norm": 0.3251142203807831, + "learning_rate": 1.4417531718569783e-05, + "loss": 1.0183, + "step": 125 + }, + { + "epoch": 0.007498846331333641, + "grad_norm": 0.29589974880218506, + "learning_rate": 1.4994232987312573e-05, + "loss": 1.047, + "step": 130 + }, + { + "epoch": 0.007787263497923396, + "grad_norm": 0.3242173194885254, + "learning_rate": 1.5570934256055363e-05, + "loss": 1.0461, + "step": 135 + }, + { + "epoch": 0.008075680664513151, + "grad_norm": 0.31147414445877075, + "learning_rate": 1.6147635524798155e-05, + "loss": 1.047, + "step": 140 + }, + { + "epoch": 0.008364097831102908, + "grad_norm": 0.31779709458351135, + "learning_rate": 1.6724336793540947e-05, + "loss": 1.0784, + "step": 145 + }, + { + "epoch": 0.008652514997692663, + "grad_norm": 0.3391679525375366, + "learning_rate": 1.7301038062283735e-05, + "loss": 1.0576, + "step": 150 + }, + { + "epoch": 0.008940932164282418, + "grad_norm": 0.3228215277194977, + "learning_rate": 1.787773933102653e-05, + "loss": 1.0145, + "step": 155 + }, + { + "epoch": 0.009229349330872173, + "grad_norm": 0.30271971225738525, + "learning_rate": 1.8454440599769322e-05, + "loss": 0.9874, + "step": 160 + }, + { + "epoch": 0.00951776649746193, + "grad_norm": 0.30643004179000854, + "learning_rate": 1.903114186851211e-05, + "loss": 0.9733, + "step": 165 + }, + { + "epoch": 0.009806183664051685, + "grad_norm": 0.36777183413505554, + "learning_rate": 1.9607843137254903e-05, + "loss": 1.0242, + "step": 170 + }, + { + "epoch": 0.01009460083064144, + "grad_norm": 0.3419516086578369, + "learning_rate": 2.0184544405997694e-05, + "loss": 1.1211, + "step": 175 + }, + { + "epoch": 0.010383017997231195, + "grad_norm": 0.3591030538082123, + "learning_rate": 2.0761245674740483e-05, + "loss": 1.0323, + "step": 180 + }, + { + "epoch": 0.01067143516382095, + "grad_norm": 0.38365352153778076, + "learning_rate": 2.1337946943483278e-05, + "loss": 0.9613, + "step": 185 + }, + { + "epoch": 0.010959852330410707, + "grad_norm": 0.3436645269393921, + "learning_rate": 2.191464821222607e-05, + "loss": 1.0753, + "step": 190 + }, + { + "epoch": 0.011248269497000462, + "grad_norm": 0.341776967048645, + "learning_rate": 2.249134948096886e-05, + "loss": 1.064, + "step": 195 + }, + { + "epoch": 0.011536686663590217, + "grad_norm": 0.38297685980796814, + "learning_rate": 2.306805074971165e-05, + "loss": 1.0105, + "step": 200 + }, + { + "epoch": 0.011825103830179972, + "grad_norm": 0.3430030643939972, + "learning_rate": 2.3644752018454442e-05, + "loss": 1.0103, + "step": 205 + }, + { + "epoch": 0.012113520996769728, + "grad_norm": 0.3319534361362457, + "learning_rate": 2.422145328719723e-05, + "loss": 1.0671, + "step": 210 + }, + { + "epoch": 0.012401938163359483, + "grad_norm": 0.3615305423736572, + "learning_rate": 2.4798154555940022e-05, + "loss": 0.9236, + "step": 215 + }, + { + "epoch": 0.012690355329949238, + "grad_norm": 0.4457886517047882, + "learning_rate": 2.5374855824682814e-05, + "loss": 1.0461, + "step": 220 + }, + { + "epoch": 0.012978772496538993, + "grad_norm": 0.7715578675270081, + "learning_rate": 2.5951557093425606e-05, + "loss": 1.0131, + "step": 225 + }, + { + "epoch": 0.01326718966312875, + "grad_norm": 0.4368738830089569, + "learning_rate": 2.6528258362168395e-05, + "loss": 1.0255, + "step": 230 + }, + { + "epoch": 0.013555606829718505, + "grad_norm": 0.38978299498558044, + "learning_rate": 2.7104959630911193e-05, + "loss": 0.9773, + "step": 235 + }, + { + "epoch": 0.01384402399630826, + "grad_norm": 0.35930851101875305, + "learning_rate": 2.768166089965398e-05, + "loss": 1.0043, + "step": 240 + }, + { + "epoch": 0.014132441162898015, + "grad_norm": 0.37871646881103516, + "learning_rate": 2.8258362168396773e-05, + "loss": 1.0082, + "step": 245 + }, + { + "epoch": 0.01442085832948777, + "grad_norm": 0.3493201732635498, + "learning_rate": 2.8835063437139565e-05, + "loss": 0.9856, + "step": 250 + }, + { + "epoch": 0.014709275496077527, + "grad_norm": 0.364734947681427, + "learning_rate": 2.9411764705882354e-05, + "loss": 1.0379, + "step": 255 + }, + { + "epoch": 0.014997692662667282, + "grad_norm": 0.3644263446331024, + "learning_rate": 2.9988465974625146e-05, + "loss": 1.006, + "step": 260 + }, + { + "epoch": 0.015286109829257037, + "grad_norm": 0.3671714961528778, + "learning_rate": 3.0565167243367934e-05, + "loss": 0.9499, + "step": 265 + }, + { + "epoch": 0.015574526995846792, + "grad_norm": 0.384804904460907, + "learning_rate": 3.1141868512110726e-05, + "loss": 1.0438, + "step": 270 + }, + { + "epoch": 0.015862944162436547, + "grad_norm": 0.36940938234329224, + "learning_rate": 3.171856978085352e-05, + "loss": 0.9476, + "step": 275 + }, + { + "epoch": 0.016151361329026302, + "grad_norm": 0.38267725706100464, + "learning_rate": 3.229527104959631e-05, + "loss": 0.9689, + "step": 280 + }, + { + "epoch": 0.01643977849561606, + "grad_norm": 0.3497903347015381, + "learning_rate": 3.28719723183391e-05, + "loss": 0.9143, + "step": 285 + }, + { + "epoch": 0.016728195662205816, + "grad_norm": 0.3465529978275299, + "learning_rate": 3.344867358708189e-05, + "loss": 0.9616, + "step": 290 + }, + { + "epoch": 0.01701661282879557, + "grad_norm": 0.3548210859298706, + "learning_rate": 3.4025374855824685e-05, + "loss": 0.9695, + "step": 295 + }, + { + "epoch": 0.017305029995385326, + "grad_norm": 0.3769378662109375, + "learning_rate": 3.460207612456747e-05, + "loss": 0.963, + "step": 300 + }, + { + "epoch": 0.01759344716197508, + "grad_norm": 0.3663967549800873, + "learning_rate": 3.517877739331027e-05, + "loss": 1.0924, + "step": 305 + }, + { + "epoch": 0.017881864328564836, + "grad_norm": 0.38498544692993164, + "learning_rate": 3.575547866205306e-05, + "loss": 1.0481, + "step": 310 + }, + { + "epoch": 0.01817028149515459, + "grad_norm": 0.3465900123119354, + "learning_rate": 3.633217993079585e-05, + "loss": 1.0396, + "step": 315 + }, + { + "epoch": 0.018458698661744346, + "grad_norm": 0.3498382270336151, + "learning_rate": 3.6908881199538644e-05, + "loss": 1.0005, + "step": 320 + }, + { + "epoch": 0.0187471158283341, + "grad_norm": 0.3397336006164551, + "learning_rate": 3.748558246828143e-05, + "loss": 0.9682, + "step": 325 + }, + { + "epoch": 0.01903553299492386, + "grad_norm": 0.33760690689086914, + "learning_rate": 3.806228373702422e-05, + "loss": 0.9975, + "step": 330 + }, + { + "epoch": 0.019323950161513614, + "grad_norm": 0.32710301876068115, + "learning_rate": 3.863898500576701e-05, + "loss": 0.985, + "step": 335 + }, + { + "epoch": 0.01961236732810337, + "grad_norm": 0.40678462386131287, + "learning_rate": 3.9215686274509805e-05, + "loss": 0.9664, + "step": 340 + }, + { + "epoch": 0.019900784494693124, + "grad_norm": 0.38339948654174805, + "learning_rate": 3.97923875432526e-05, + "loss": 0.9962, + "step": 345 + }, + { + "epoch": 0.02018920166128288, + "grad_norm": 0.3516389727592468, + "learning_rate": 4.036908881199539e-05, + "loss": 0.9385, + "step": 350 + }, + { + "epoch": 0.020477618827872635, + "grad_norm": 0.3469911515712738, + "learning_rate": 4.094579008073818e-05, + "loss": 0.9795, + "step": 355 + }, + { + "epoch": 0.02076603599446239, + "grad_norm": 0.351566344499588, + "learning_rate": 4.1522491349480966e-05, + "loss": 1.0131, + "step": 360 + }, + { + "epoch": 0.021054453161052145, + "grad_norm": 0.3254294991493225, + "learning_rate": 4.209919261822376e-05, + "loss": 0.9784, + "step": 365 + }, + { + "epoch": 0.0213428703276419, + "grad_norm": 0.352115660905838, + "learning_rate": 4.2675893886966556e-05, + "loss": 1.0013, + "step": 370 + }, + { + "epoch": 0.021631287494231658, + "grad_norm": 0.35616523027420044, + "learning_rate": 4.325259515570935e-05, + "loss": 1.0209, + "step": 375 + }, + { + "epoch": 0.021919704660821413, + "grad_norm": 0.3402170240879059, + "learning_rate": 4.382929642445214e-05, + "loss": 0.976, + "step": 380 + }, + { + "epoch": 0.022208121827411168, + "grad_norm": 0.30762144923210144, + "learning_rate": 4.440599769319493e-05, + "loss": 0.8757, + "step": 385 + }, + { + "epoch": 0.022496538994000923, + "grad_norm": 0.33472269773483276, + "learning_rate": 4.498269896193772e-05, + "loss": 1.0687, + "step": 390 + }, + { + "epoch": 0.022784956160590678, + "grad_norm": 0.3568858802318573, + "learning_rate": 4.555940023068051e-05, + "loss": 1.0279, + "step": 395 + }, + { + "epoch": 0.023073373327180433, + "grad_norm": 0.3303862512111664, + "learning_rate": 4.61361014994233e-05, + "loss": 1.0061, + "step": 400 + }, + { + "epoch": 0.023361790493770188, + "grad_norm": 0.3586498498916626, + "learning_rate": 4.671280276816609e-05, + "loss": 1.0007, + "step": 405 + }, + { + "epoch": 0.023650207660359943, + "grad_norm": 0.34804537892341614, + "learning_rate": 4.7289504036908884e-05, + "loss": 0.9913, + "step": 410 + }, + { + "epoch": 0.0239386248269497, + "grad_norm": 0.33361154794692993, + "learning_rate": 4.7866205305651676e-05, + "loss": 0.9615, + "step": 415 + }, + { + "epoch": 0.024227041993539457, + "grad_norm": 0.30743229389190674, + "learning_rate": 4.844290657439446e-05, + "loss": 1.0062, + "step": 420 + }, + { + "epoch": 0.024515459160129212, + "grad_norm": 0.3414464294910431, + "learning_rate": 4.901960784313725e-05, + "loss": 1.0266, + "step": 425 + }, + { + "epoch": 0.024803876326718967, + "grad_norm": 0.311254620552063, + "learning_rate": 4.9596309111880045e-05, + "loss": 0.9525, + "step": 430 + }, + { + "epoch": 0.025092293493308722, + "grad_norm": 0.3211973011493683, + "learning_rate": 5.017301038062284e-05, + "loss": 1.0204, + "step": 435 + }, + { + "epoch": 0.025380710659898477, + "grad_norm": 0.32264503836631775, + "learning_rate": 5.074971164936563e-05, + "loss": 0.9187, + "step": 440 + }, + { + "epoch": 0.025669127826488232, + "grad_norm": 0.3149093985557556, + "learning_rate": 5.132641291810843e-05, + "loss": 1.0324, + "step": 445 + }, + { + "epoch": 0.025957544993077987, + "grad_norm": 0.31910112500190735, + "learning_rate": 5.190311418685121e-05, + "loss": 0.9924, + "step": 450 + }, + { + "epoch": 0.026245962159667742, + "grad_norm": 0.329057514667511, + "learning_rate": 5.2479815455594004e-05, + "loss": 1.0235, + "step": 455 + }, + { + "epoch": 0.0265343793262575, + "grad_norm": 0.32927969098091125, + "learning_rate": 5.305651672433679e-05, + "loss": 0.9986, + "step": 460 + }, + { + "epoch": 0.026822796492847256, + "grad_norm": 0.30113425850868225, + "learning_rate": 5.363321799307959e-05, + "loss": 0.9996, + "step": 465 + }, + { + "epoch": 0.02711121365943701, + "grad_norm": 0.31802427768707275, + "learning_rate": 5.4209919261822386e-05, + "loss": 0.903, + "step": 470 + }, + { + "epoch": 0.027399630826026766, + "grad_norm": 0.31492453813552856, + "learning_rate": 5.478662053056517e-05, + "loss": 0.9627, + "step": 475 + }, + { + "epoch": 0.02768804799261652, + "grad_norm": 0.32527875900268555, + "learning_rate": 5.536332179930796e-05, + "loss": 0.9842, + "step": 480 + }, + { + "epoch": 0.027976465159206276, + "grad_norm": 0.3000083267688751, + "learning_rate": 5.594002306805075e-05, + "loss": 0.9275, + "step": 485 + }, + { + "epoch": 0.02826488232579603, + "grad_norm": 0.30580878257751465, + "learning_rate": 5.651672433679355e-05, + "loss": 1.0111, + "step": 490 + }, + { + "epoch": 0.028553299492385786, + "grad_norm": 0.3029692769050598, + "learning_rate": 5.709342560553633e-05, + "loss": 0.9997, + "step": 495 + }, + { + "epoch": 0.02884171665897554, + "grad_norm": 0.29320913553237915, + "learning_rate": 5.767012687427913e-05, + "loss": 0.9728, + "step": 500 + }, + { + "epoch": 0.0291301338255653, + "grad_norm": 0.27277612686157227, + "learning_rate": 5.8246828143021916e-05, + "loss": 0.9481, + "step": 505 + }, + { + "epoch": 0.029418550992155054, + "grad_norm": 0.3065517544746399, + "learning_rate": 5.882352941176471e-05, + "loss": 1.0068, + "step": 510 + }, + { + "epoch": 0.02970696815874481, + "grad_norm": 0.30595871806144714, + "learning_rate": 5.940023068050749e-05, + "loss": 1.0394, + "step": 515 + }, + { + "epoch": 0.029995385325334564, + "grad_norm": 0.2905437648296356, + "learning_rate": 5.997693194925029e-05, + "loss": 0.8914, + "step": 520 + }, + { + "epoch": 0.03028380249192432, + "grad_norm": 0.30169710516929626, + "learning_rate": 6.0553633217993076e-05, + "loss": 1.0714, + "step": 525 + }, + { + "epoch": 0.030572219658514074, + "grad_norm": 0.30245259404182434, + "learning_rate": 6.113033448673587e-05, + "loss": 0.9748, + "step": 530 + }, + { + "epoch": 0.03086063682510383, + "grad_norm": 0.31071239709854126, + "learning_rate": 6.170703575547867e-05, + "loss": 1.0307, + "step": 535 + }, + { + "epoch": 0.031149053991693584, + "grad_norm": 0.301554799079895, + "learning_rate": 6.228373702422145e-05, + "loss": 0.9904, + "step": 540 + }, + { + "epoch": 0.03143747115828334, + "grad_norm": 0.29832157492637634, + "learning_rate": 6.286043829296425e-05, + "loss": 0.965, + "step": 545 + }, + { + "epoch": 0.031725888324873094, + "grad_norm": 0.2960033118724823, + "learning_rate": 6.343713956170704e-05, + "loss": 0.9661, + "step": 550 + }, + { + "epoch": 0.03201430549146285, + "grad_norm": 0.2793910503387451, + "learning_rate": 6.401384083044983e-05, + "loss": 0.9691, + "step": 555 + }, + { + "epoch": 0.032302722658052604, + "grad_norm": 0.2931232750415802, + "learning_rate": 6.459054209919262e-05, + "loss": 1.0152, + "step": 560 + }, + { + "epoch": 0.03259113982464236, + "grad_norm": 0.29276397824287415, + "learning_rate": 6.516724336793542e-05, + "loss": 0.9644, + "step": 565 + }, + { + "epoch": 0.03287955699123212, + "grad_norm": 0.2859160304069519, + "learning_rate": 6.57439446366782e-05, + "loss": 0.8926, + "step": 570 + }, + { + "epoch": 0.033167974157821876, + "grad_norm": 0.2981337308883667, + "learning_rate": 6.6320645905421e-05, + "loss": 0.9805, + "step": 575 + }, + { + "epoch": 0.03345639132441163, + "grad_norm": 0.28318145871162415, + "learning_rate": 6.689734717416379e-05, + "loss": 0.9828, + "step": 580 + }, + { + "epoch": 0.033744808491001387, + "grad_norm": 0.2922738194465637, + "learning_rate": 6.747404844290659e-05, + "loss": 0.9495, + "step": 585 + }, + { + "epoch": 0.03403322565759114, + "grad_norm": 0.3307567536830902, + "learning_rate": 6.805074971164937e-05, + "loss": 0.975, + "step": 590 + }, + { + "epoch": 0.0343216428241809, + "grad_norm": 0.2792339622974396, + "learning_rate": 6.862745098039216e-05, + "loss": 1.0021, + "step": 595 + }, + { + "epoch": 0.03461005999077065, + "grad_norm": 0.26365357637405396, + "learning_rate": 6.920415224913494e-05, + "loss": 1.0316, + "step": 600 + }, + { + "epoch": 0.03489847715736041, + "grad_norm": 0.285918265581131, + "learning_rate": 6.978085351787774e-05, + "loss": 1.0025, + "step": 605 + }, + { + "epoch": 0.03518689432395016, + "grad_norm": 0.290382444858551, + "learning_rate": 7.035755478662054e-05, + "loss": 1.0198, + "step": 610 + }, + { + "epoch": 0.03547531149053992, + "grad_norm": 0.2909998595714569, + "learning_rate": 7.093425605536332e-05, + "loss": 1.0522, + "step": 615 + }, + { + "epoch": 0.03576372865712967, + "grad_norm": 0.2691628038883209, + "learning_rate": 7.151095732410612e-05, + "loss": 1.0285, + "step": 620 + }, + { + "epoch": 0.03605214582371943, + "grad_norm": 0.2793739140033722, + "learning_rate": 7.20876585928489e-05, + "loss": 0.9431, + "step": 625 + }, + { + "epoch": 0.03634056299030918, + "grad_norm": 0.28252139687538147, + "learning_rate": 7.26643598615917e-05, + "loss": 0.954, + "step": 630 + }, + { + "epoch": 0.03662898015689894, + "grad_norm": 0.2551520764827728, + "learning_rate": 7.324106113033449e-05, + "loss": 0.9477, + "step": 635 + }, + { + "epoch": 0.03691739732348869, + "grad_norm": 0.2769528925418854, + "learning_rate": 7.381776239907729e-05, + "loss": 1.0228, + "step": 640 + }, + { + "epoch": 0.03720581449007845, + "grad_norm": 0.26769739389419556, + "learning_rate": 7.439446366782007e-05, + "loss": 0.9844, + "step": 645 + }, + { + "epoch": 0.0374942316566682, + "grad_norm": 0.2822119891643524, + "learning_rate": 7.497116493656286e-05, + "loss": 1.0532, + "step": 650 + }, + { + "epoch": 0.03778264882325796, + "grad_norm": 0.2787601053714752, + "learning_rate": 7.554786620530564e-05, + "loss": 1.0154, + "step": 655 + }, + { + "epoch": 0.03807106598984772, + "grad_norm": 0.27694109082221985, + "learning_rate": 7.612456747404844e-05, + "loss": 0.9775, + "step": 660 + }, + { + "epoch": 0.038359483156437474, + "grad_norm": 0.4112897217273712, + "learning_rate": 7.670126874279123e-05, + "loss": 1.0071, + "step": 665 + }, + { + "epoch": 0.03864790032302723, + "grad_norm": 0.26005199551582336, + "learning_rate": 7.727797001153403e-05, + "loss": 0.9632, + "step": 670 + }, + { + "epoch": 0.038936317489616984, + "grad_norm": 0.25056615471839905, + "learning_rate": 7.785467128027682e-05, + "loss": 0.9773, + "step": 675 + }, + { + "epoch": 0.03922473465620674, + "grad_norm": 0.27164942026138306, + "learning_rate": 7.843137254901961e-05, + "loss": 0.9927, + "step": 680 + }, + { + "epoch": 0.039513151822796494, + "grad_norm": 0.26238757371902466, + "learning_rate": 7.900807381776241e-05, + "loss": 0.9612, + "step": 685 + }, + { + "epoch": 0.03980156898938625, + "grad_norm": 0.28629186749458313, + "learning_rate": 7.95847750865052e-05, + "loss": 0.9579, + "step": 690 + }, + { + "epoch": 0.040089986155976004, + "grad_norm": 0.2650497555732727, + "learning_rate": 8.016147635524799e-05, + "loss": 0.9667, + "step": 695 + }, + { + "epoch": 0.04037840332256576, + "grad_norm": 0.26934972405433655, + "learning_rate": 8.073817762399078e-05, + "loss": 0.9257, + "step": 700 + }, + { + "epoch": 0.040666820489155514, + "grad_norm": 0.27391955256462097, + "learning_rate": 8.131487889273358e-05, + "loss": 1.0725, + "step": 705 + }, + { + "epoch": 0.04095523765574527, + "grad_norm": 0.2905539274215698, + "learning_rate": 8.189158016147636e-05, + "loss": 0.9979, + "step": 710 + }, + { + "epoch": 0.041243654822335024, + "grad_norm": 0.26050031185150146, + "learning_rate": 8.246828143021915e-05, + "loss": 0.9901, + "step": 715 + }, + { + "epoch": 0.04153207198892478, + "grad_norm": 0.4822568893432617, + "learning_rate": 8.304498269896193e-05, + "loss": 0.9753, + "step": 720 + }, + { + "epoch": 0.041820489155514534, + "grad_norm": 0.27065780758857727, + "learning_rate": 8.362168396770473e-05, + "loss": 0.961, + "step": 725 + }, + { + "epoch": 0.04210890632210429, + "grad_norm": 0.27039390802383423, + "learning_rate": 8.419838523644751e-05, + "loss": 1.0218, + "step": 730 + }, + { + "epoch": 0.042397323488694044, + "grad_norm": 0.267991304397583, + "learning_rate": 8.477508650519031e-05, + "loss": 0.8937, + "step": 735 + }, + { + "epoch": 0.0426857406552838, + "grad_norm": 0.2698671519756317, + "learning_rate": 8.535178777393311e-05, + "loss": 1.0203, + "step": 740 + }, + { + "epoch": 0.04297415782187356, + "grad_norm": 0.25605538487434387, + "learning_rate": 8.59284890426759e-05, + "loss": 1.0398, + "step": 745 + }, + { + "epoch": 0.043262574988463316, + "grad_norm": 0.26644793152809143, + "learning_rate": 8.65051903114187e-05, + "loss": 1.0212, + "step": 750 + }, + { + "epoch": 0.04355099215505307, + "grad_norm": 0.2879778742790222, + "learning_rate": 8.708189158016148e-05, + "loss": 0.9854, + "step": 755 + }, + { + "epoch": 0.043839409321642826, + "grad_norm": 0.26750192046165466, + "learning_rate": 8.765859284890428e-05, + "loss": 1.0168, + "step": 760 + }, + { + "epoch": 0.04412782648823258, + "grad_norm": 0.2743099331855774, + "learning_rate": 8.823529411764706e-05, + "loss": 0.9447, + "step": 765 + }, + { + "epoch": 0.044416243654822336, + "grad_norm": 0.27284887433052063, + "learning_rate": 8.881199538638986e-05, + "loss": 1.016, + "step": 770 + }, + { + "epoch": 0.04470466082141209, + "grad_norm": 0.26251500844955444, + "learning_rate": 8.938869665513265e-05, + "loss": 0.9275, + "step": 775 + }, + { + "epoch": 0.044993077988001846, + "grad_norm": 0.26898619532585144, + "learning_rate": 8.996539792387543e-05, + "loss": 0.9258, + "step": 780 + }, + { + "epoch": 0.0452814951545916, + "grad_norm": 0.2636859118938446, + "learning_rate": 9.054209919261822e-05, + "loss": 1.1368, + "step": 785 + }, + { + "epoch": 0.045569912321181356, + "grad_norm": 0.25750333070755005, + "learning_rate": 9.111880046136102e-05, + "loss": 0.9829, + "step": 790 + }, + { + "epoch": 0.04585832948777111, + "grad_norm": 0.26251962780952454, + "learning_rate": 9.16955017301038e-05, + "loss": 1.0722, + "step": 795 + }, + { + "epoch": 0.046146746654360866, + "grad_norm": 0.24186044931411743, + "learning_rate": 9.22722029988466e-05, + "loss": 0.9681, + "step": 800 + }, + { + "epoch": 0.04643516382095062, + "grad_norm": 0.2631891965866089, + "learning_rate": 9.28489042675894e-05, + "loss": 1.0082, + "step": 805 + }, + { + "epoch": 0.046723580987540377, + "grad_norm": 0.25769105553627014, + "learning_rate": 9.342560553633218e-05, + "loss": 0.9419, + "step": 810 + }, + { + "epoch": 0.04701199815413013, + "grad_norm": 0.26983222365379333, + "learning_rate": 9.400230680507498e-05, + "loss": 0.9698, + "step": 815 + }, + { + "epoch": 0.04730041532071989, + "grad_norm": 0.268951952457428, + "learning_rate": 9.457900807381777e-05, + "loss": 1.0199, + "step": 820 + }, + { + "epoch": 0.04758883248730964, + "grad_norm": 0.2618368864059448, + "learning_rate": 9.515570934256057e-05, + "loss": 1.0474, + "step": 825 + }, + { + "epoch": 0.0478772496538994, + "grad_norm": 0.2535788118839264, + "learning_rate": 9.573241061130335e-05, + "loss": 1.051, + "step": 830 + }, + { + "epoch": 0.04816566682048916, + "grad_norm": 0.24797338247299194, + "learning_rate": 9.630911188004614e-05, + "loss": 0.9787, + "step": 835 + }, + { + "epoch": 0.048454083987078914, + "grad_norm": 0.2542094886302948, + "learning_rate": 9.688581314878892e-05, + "loss": 1.0301, + "step": 840 + }, + { + "epoch": 0.04874250115366867, + "grad_norm": 0.34137168526649475, + "learning_rate": 9.746251441753172e-05, + "loss": 0.8916, + "step": 845 + }, + { + "epoch": 0.049030918320258424, + "grad_norm": 0.25905948877334595, + "learning_rate": 9.80392156862745e-05, + "loss": 1.0086, + "step": 850 + }, + { + "epoch": 0.04931933548684818, + "grad_norm": 0.24208292365074158, + "learning_rate": 9.86159169550173e-05, + "loss": 0.962, + "step": 855 + }, + { + "epoch": 0.049607752653437934, + "grad_norm": 0.2500937879085541, + "learning_rate": 9.919261822376009e-05, + "loss": 0.983, + "step": 860 + }, + { + "epoch": 0.04989616982002769, + "grad_norm": 0.2481968104839325, + "learning_rate": 9.976931949250289e-05, + "loss": 0.9798, + "step": 865 + }, + { + "epoch": 0.050184586986617444, + "grad_norm": 0.25975415110588074, + "learning_rate": 0.00010034602076124569, + "loss": 0.9621, + "step": 870 + }, + { + "epoch": 0.0504730041532072, + "grad_norm": 0.25389575958251953, + "learning_rate": 0.00010092272202998847, + "loss": 0.9959, + "step": 875 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 0.26200932264328003, + "learning_rate": 0.00010149942329873126, + "loss": 0.9432, + "step": 880 + }, + { + "epoch": 0.05104983848638671, + "grad_norm": 0.25433865189552307, + "learning_rate": 0.00010207612456747407, + "loss": 1.0272, + "step": 885 + }, + { + "epoch": 0.051338255652976464, + "grad_norm": 0.29402443766593933, + "learning_rate": 0.00010265282583621685, + "loss": 1.018, + "step": 890 + }, + { + "epoch": 0.05162667281956622, + "grad_norm": 0.2625313699245453, + "learning_rate": 0.00010322952710495964, + "loss": 1.0326, + "step": 895 + }, + { + "epoch": 0.051915089986155974, + "grad_norm": 0.2682657241821289, + "learning_rate": 0.00010380622837370242, + "loss": 1.0215, + "step": 900 + }, + { + "epoch": 0.05220350715274573, + "grad_norm": 0.27114447951316833, + "learning_rate": 0.00010438292964244522, + "loss": 0.9736, + "step": 905 + }, + { + "epoch": 0.052491924319335484, + "grad_norm": 0.2469518631696701, + "learning_rate": 0.00010495963091118801, + "loss": 0.93, + "step": 910 + }, + { + "epoch": 0.05278034148592524, + "grad_norm": 0.262253999710083, + "learning_rate": 0.00010553633217993079, + "loss": 0.9477, + "step": 915 + }, + { + "epoch": 0.053068758652515, + "grad_norm": 0.25354915857315063, + "learning_rate": 0.00010611303344867358, + "loss": 0.9926, + "step": 920 + }, + { + "epoch": 0.053357175819104756, + "grad_norm": 0.24856913089752197, + "learning_rate": 0.00010668973471741639, + "loss": 0.9726, + "step": 925 + }, + { + "epoch": 0.05364559298569451, + "grad_norm": 0.24939557909965515, + "learning_rate": 0.00010726643598615918, + "loss": 0.9575, + "step": 930 + }, + { + "epoch": 0.053934010152284266, + "grad_norm": 0.2722608745098114, + "learning_rate": 0.00010784313725490196, + "loss": 1.0017, + "step": 935 + }, + { + "epoch": 0.05422242731887402, + "grad_norm": 0.25203198194503784, + "learning_rate": 0.00010841983852364477, + "loss": 0.9141, + "step": 940 + }, + { + "epoch": 0.054510844485463776, + "grad_norm": 0.2586802840232849, + "learning_rate": 0.00010899653979238756, + "loss": 1.0066, + "step": 945 + }, + { + "epoch": 0.05479926165205353, + "grad_norm": 0.24033570289611816, + "learning_rate": 0.00010957324106113034, + "loss": 1.0113, + "step": 950 + }, + { + "epoch": 0.055087678818643286, + "grad_norm": 0.2373732328414917, + "learning_rate": 0.00011014994232987313, + "loss": 1.0172, + "step": 955 + }, + { + "epoch": 0.05537609598523304, + "grad_norm": 0.25045233964920044, + "learning_rate": 0.00011072664359861593, + "loss": 0.9548, + "step": 960 + }, + { + "epoch": 0.055664513151822796, + "grad_norm": 0.25307127833366394, + "learning_rate": 0.00011130334486735871, + "loss": 0.8803, + "step": 965 + }, + { + "epoch": 0.05595293031841255, + "grad_norm": 0.2580971121788025, + "learning_rate": 0.0001118800461361015, + "loss": 1.0257, + "step": 970 + }, + { + "epoch": 0.056241347485002306, + "grad_norm": 0.3492274284362793, + "learning_rate": 0.00011245674740484428, + "loss": 0.9915, + "step": 975 + }, + { + "epoch": 0.05652976465159206, + "grad_norm": 0.3969261944293976, + "learning_rate": 0.0001130334486735871, + "loss": 0.9871, + "step": 980 + }, + { + "epoch": 0.056818181818181816, + "grad_norm": 0.2512189447879791, + "learning_rate": 0.00011361014994232988, + "loss": 0.9999, + "step": 985 + }, + { + "epoch": 0.05710659898477157, + "grad_norm": 0.24583379924297333, + "learning_rate": 0.00011418685121107266, + "loss": 1.019, + "step": 990 + }, + { + "epoch": 0.057395016151361326, + "grad_norm": 0.23418952524662018, + "learning_rate": 0.00011476355247981545, + "loss": 0.9976, + "step": 995 + }, + { + "epoch": 0.05768343331795108, + "grad_norm": 0.24816179275512695, + "learning_rate": 0.00011534025374855826, + "loss": 0.9787, + "step": 1000 + }, + { + "epoch": 0.05797185048454084, + "grad_norm": 0.238878071308136, + "learning_rate": 0.00011591695501730105, + "loss": 0.9831, + "step": 1005 + }, + { + "epoch": 0.0582602676511306, + "grad_norm": 0.240176260471344, + "learning_rate": 0.00011649365628604383, + "loss": 0.9604, + "step": 1010 + }, + { + "epoch": 0.05854868481772035, + "grad_norm": 0.24366143345832825, + "learning_rate": 0.00011707035755478663, + "loss": 1.0633, + "step": 1015 + }, + { + "epoch": 0.05883710198431011, + "grad_norm": 0.24254244565963745, + "learning_rate": 0.00011764705882352942, + "loss": 1.0299, + "step": 1020 + }, + { + "epoch": 0.05912551915089986, + "grad_norm": 0.2483944445848465, + "learning_rate": 0.0001182237600922722, + "loss": 1.0325, + "step": 1025 + }, + { + "epoch": 0.05941393631748962, + "grad_norm": 0.23639345169067383, + "learning_rate": 0.00011880046136101499, + "loss": 0.9192, + "step": 1030 + }, + { + "epoch": 0.059702353484079373, + "grad_norm": 0.26320794224739075, + "learning_rate": 0.0001193771626297578, + "loss": 0.973, + "step": 1035 + }, + { + "epoch": 0.05999077065066913, + "grad_norm": 0.26271867752075195, + "learning_rate": 0.00011995386389850058, + "loss": 1.0339, + "step": 1040 + }, + { + "epoch": 0.060279187817258884, + "grad_norm": 0.2515929043292999, + "learning_rate": 0.00012053056516724337, + "loss": 0.9777, + "step": 1045 + }, + { + "epoch": 0.06056760498384864, + "grad_norm": 0.24450047314167023, + "learning_rate": 0.00012110726643598615, + "loss": 0.9781, + "step": 1050 + }, + { + "epoch": 0.060856022150438394, + "grad_norm": 0.247002974152565, + "learning_rate": 0.00012168396770472896, + "loss": 0.9742, + "step": 1055 + }, + { + "epoch": 0.06114443931702815, + "grad_norm": 0.22039633989334106, + "learning_rate": 0.00012226066897347174, + "loss": 0.9602, + "step": 1060 + }, + { + "epoch": 0.061432856483617904, + "grad_norm": 0.25299662351608276, + "learning_rate": 0.00012283737024221453, + "loss": 0.9429, + "step": 1065 + }, + { + "epoch": 0.06172127365020766, + "grad_norm": 0.24021919071674347, + "learning_rate": 0.00012341407151095733, + "loss": 1.0543, + "step": 1070 + }, + { + "epoch": 0.062009690816797414, + "grad_norm": 0.2851802408695221, + "learning_rate": 0.00012399077277970013, + "loss": 1.0169, + "step": 1075 + }, + { + "epoch": 0.06229810798338717, + "grad_norm": 0.2532206177711487, + "learning_rate": 0.0001245674740484429, + "loss": 0.9388, + "step": 1080 + }, + { + "epoch": 0.06258652514997692, + "grad_norm": 0.2355235517024994, + "learning_rate": 0.0001251441753171857, + "loss": 0.9283, + "step": 1085 + }, + { + "epoch": 0.06287494231656668, + "grad_norm": 0.2673757076263428, + "learning_rate": 0.0001257208765859285, + "loss": 1.0022, + "step": 1090 + }, + { + "epoch": 0.06316335948315643, + "grad_norm": 0.22847038507461548, + "learning_rate": 0.0001262975778546713, + "loss": 0.9481, + "step": 1095 + }, + { + "epoch": 0.06345177664974619, + "grad_norm": 0.25772714614868164, + "learning_rate": 0.00012687427912341407, + "loss": 0.9909, + "step": 1100 + }, + { + "epoch": 0.06374019381633594, + "grad_norm": 0.238713800907135, + "learning_rate": 0.00012745098039215687, + "loss": 0.9379, + "step": 1105 + }, + { + "epoch": 0.0640286109829257, + "grad_norm": 0.24460141360759735, + "learning_rate": 0.00012802768166089967, + "loss": 0.9398, + "step": 1110 + }, + { + "epoch": 0.06431702814951545, + "grad_norm": 0.23570501804351807, + "learning_rate": 0.00012860438292964244, + "loss": 0.9292, + "step": 1115 + }, + { + "epoch": 0.06460544531610521, + "grad_norm": 0.26408931612968445, + "learning_rate": 0.00012918108419838524, + "loss": 1.026, + "step": 1120 + }, + { + "epoch": 0.06489386248269496, + "grad_norm": 0.2372530698776245, + "learning_rate": 0.00012975778546712804, + "loss": 0.9906, + "step": 1125 + }, + { + "epoch": 0.06518227964928472, + "grad_norm": 0.2314678579568863, + "learning_rate": 0.00013033448673587084, + "loss": 0.9447, + "step": 1130 + }, + { + "epoch": 0.06547069681587447, + "grad_norm": 0.25254136323928833, + "learning_rate": 0.0001309111880046136, + "loss": 1.0364, + "step": 1135 + }, + { + "epoch": 0.06575911398246424, + "grad_norm": 0.23922473192214966, + "learning_rate": 0.0001314878892733564, + "loss": 1.0091, + "step": 1140 + }, + { + "epoch": 0.066047531149054, + "grad_norm": 0.24500273168087006, + "learning_rate": 0.0001320645905420992, + "loss": 0.9951, + "step": 1145 + }, + { + "epoch": 0.06633594831564375, + "grad_norm": 0.23815661668777466, + "learning_rate": 0.000132641291810842, + "loss": 1.0065, + "step": 1150 + }, + { + "epoch": 0.06662436548223351, + "grad_norm": 0.26173415780067444, + "learning_rate": 0.00013321799307958477, + "loss": 1.0159, + "step": 1155 + }, + { + "epoch": 0.06691278264882326, + "grad_norm": 0.22709496319293976, + "learning_rate": 0.00013379469434832757, + "loss": 0.9121, + "step": 1160 + }, + { + "epoch": 0.06720119981541302, + "grad_norm": 0.2595439553260803, + "learning_rate": 0.00013437139561707037, + "loss": 1.0136, + "step": 1165 + }, + { + "epoch": 0.06748961698200277, + "grad_norm": 0.23945558071136475, + "learning_rate": 0.00013494809688581317, + "loss": 0.9508, + "step": 1170 + }, + { + "epoch": 0.06777803414859253, + "grad_norm": 0.2526959478855133, + "learning_rate": 0.00013552479815455594, + "loss": 0.9304, + "step": 1175 + }, + { + "epoch": 0.06806645131518228, + "grad_norm": 0.2385508418083191, + "learning_rate": 0.00013610149942329874, + "loss": 1.012, + "step": 1180 + }, + { + "epoch": 0.06835486848177204, + "grad_norm": 0.25558724999427795, + "learning_rate": 0.00013667820069204154, + "loss": 1.0289, + "step": 1185 + }, + { + "epoch": 0.0686432856483618, + "grad_norm": 0.26076334714889526, + "learning_rate": 0.0001372549019607843, + "loss": 0.9564, + "step": 1190 + }, + { + "epoch": 0.06893170281495155, + "grad_norm": 0.24157829582691193, + "learning_rate": 0.0001378316032295271, + "loss": 1.0265, + "step": 1195 + }, + { + "epoch": 0.0692201199815413, + "grad_norm": 0.2505204379558563, + "learning_rate": 0.00013840830449826988, + "loss": 0.965, + "step": 1200 + }, + { + "epoch": 0.06950853714813106, + "grad_norm": 0.2583898603916168, + "learning_rate": 0.0001389850057670127, + "loss": 1.0161, + "step": 1205 + }, + { + "epoch": 0.06979695431472081, + "grad_norm": 0.24660265445709229, + "learning_rate": 0.00013956170703575548, + "loss": 1.0086, + "step": 1210 + }, + { + "epoch": 0.07008537148131057, + "grad_norm": 0.2303483486175537, + "learning_rate": 0.00014013840830449828, + "loss": 1.0004, + "step": 1215 + }, + { + "epoch": 0.07037378864790032, + "grad_norm": 0.25441575050354004, + "learning_rate": 0.00014071510957324108, + "loss": 1.0218, + "step": 1220 + }, + { + "epoch": 0.07066220581449008, + "grad_norm": 0.2441866099834442, + "learning_rate": 0.00014129181084198387, + "loss": 0.9947, + "step": 1225 + }, + { + "epoch": 0.07095062298107983, + "grad_norm": 0.2431473582983017, + "learning_rate": 0.00014186851211072665, + "loss": 0.977, + "step": 1230 + }, + { + "epoch": 0.07123904014766959, + "grad_norm": 0.22348998486995697, + "learning_rate": 0.00014244521337946944, + "loss": 0.9626, + "step": 1235 + }, + { + "epoch": 0.07152745731425934, + "grad_norm": 0.25038719177246094, + "learning_rate": 0.00014302191464821224, + "loss": 1.0234, + "step": 1240 + }, + { + "epoch": 0.0718158744808491, + "grad_norm": 0.24543331563472748, + "learning_rate": 0.00014359861591695501, + "loss": 0.9782, + "step": 1245 + }, + { + "epoch": 0.07210429164743885, + "grad_norm": 0.2646369934082031, + "learning_rate": 0.0001441753171856978, + "loss": 1.0049, + "step": 1250 + }, + { + "epoch": 0.07239270881402861, + "grad_norm": 0.24707183241844177, + "learning_rate": 0.00014475201845444058, + "loss": 1.0426, + "step": 1255 + }, + { + "epoch": 0.07268112598061836, + "grad_norm": 0.24609191715717316, + "learning_rate": 0.0001453287197231834, + "loss": 0.9978, + "step": 1260 + }, + { + "epoch": 0.07296954314720812, + "grad_norm": 0.2498229593038559, + "learning_rate": 0.00014590542099192618, + "loss": 1.0299, + "step": 1265 + }, + { + "epoch": 0.07325796031379787, + "grad_norm": 0.24294817447662354, + "learning_rate": 0.00014648212226066898, + "loss": 0.9387, + "step": 1270 + }, + { + "epoch": 0.07354637748038763, + "grad_norm": 0.22789110243320465, + "learning_rate": 0.00014705882352941178, + "loss": 0.9859, + "step": 1275 + }, + { + "epoch": 0.07383479464697738, + "grad_norm": 0.2392035871744156, + "learning_rate": 0.00014763552479815458, + "loss": 0.9821, + "step": 1280 + }, + { + "epoch": 0.07412321181356714, + "grad_norm": 0.24138358235359192, + "learning_rate": 0.00014821222606689735, + "loss": 0.9644, + "step": 1285 + }, + { + "epoch": 0.0744116289801569, + "grad_norm": 0.2574746012687683, + "learning_rate": 0.00014878892733564015, + "loss": 0.9894, + "step": 1290 + }, + { + "epoch": 0.07470004614674665, + "grad_norm": 0.2577558755874634, + "learning_rate": 0.00014936562860438295, + "loss": 1.0049, + "step": 1295 + }, + { + "epoch": 0.0749884633133364, + "grad_norm": 0.2638446092605591, + "learning_rate": 0.00014994232987312572, + "loss": 0.9866, + "step": 1300 + }, + { + "epoch": 0.07527688047992616, + "grad_norm": 0.2279583364725113, + "learning_rate": 0.00015051903114186852, + "loss": 0.9697, + "step": 1305 + }, + { + "epoch": 0.07556529764651591, + "grad_norm": 0.25132206082344055, + "learning_rate": 0.0001510957324106113, + "loss": 0.9654, + "step": 1310 + }, + { + "epoch": 0.07585371481310568, + "grad_norm": 0.24250829219818115, + "learning_rate": 0.00015167243367935411, + "loss": 0.9594, + "step": 1315 + }, + { + "epoch": 0.07614213197969544, + "grad_norm": 0.24679099023342133, + "learning_rate": 0.00015224913494809689, + "loss": 0.9514, + "step": 1320 + }, + { + "epoch": 0.07643054914628519, + "grad_norm": 0.26517555117607117, + "learning_rate": 0.00015282583621683968, + "loss": 0.9575, + "step": 1325 + }, + { + "epoch": 0.07671896631287495, + "grad_norm": 0.23794426023960114, + "learning_rate": 0.00015340253748558246, + "loss": 0.9982, + "step": 1330 + }, + { + "epoch": 0.0770073834794647, + "grad_norm": 0.2488831728696823, + "learning_rate": 0.00015397923875432528, + "loss": 0.9454, + "step": 1335 + }, + { + "epoch": 0.07729580064605446, + "grad_norm": 0.26782914996147156, + "learning_rate": 0.00015455594002306805, + "loss": 1.0235, + "step": 1340 + }, + { + "epoch": 0.07758421781264421, + "grad_norm": 0.25021234154701233, + "learning_rate": 0.00015513264129181085, + "loss": 0.9243, + "step": 1345 + }, + { + "epoch": 0.07787263497923397, + "grad_norm": 0.2522822618484497, + "learning_rate": 0.00015570934256055365, + "loss": 1.0428, + "step": 1350 + }, + { + "epoch": 0.07816105214582372, + "grad_norm": 0.27001574635505676, + "learning_rate": 0.00015628604382929645, + "loss": 0.9755, + "step": 1355 + }, + { + "epoch": 0.07844946931241348, + "grad_norm": 0.24071645736694336, + "learning_rate": 0.00015686274509803922, + "loss": 1.013, + "step": 1360 + }, + { + "epoch": 0.07873788647900323, + "grad_norm": 0.24303098022937775, + "learning_rate": 0.00015743944636678202, + "loss": 0.9862, + "step": 1365 + }, + { + "epoch": 0.07902630364559299, + "grad_norm": 0.2542005479335785, + "learning_rate": 0.00015801614763552482, + "loss": 0.9709, + "step": 1370 + }, + { + "epoch": 0.07931472081218274, + "grad_norm": 0.2585870325565338, + "learning_rate": 0.0001585928489042676, + "loss": 1.0085, + "step": 1375 + }, + { + "epoch": 0.0796031379787725, + "grad_norm": 0.2629243731498718, + "learning_rate": 0.0001591695501730104, + "loss": 0.985, + "step": 1380 + }, + { + "epoch": 0.07989155514536225, + "grad_norm": 0.24008338153362274, + "learning_rate": 0.00015974625144175316, + "loss": 0.9839, + "step": 1385 + }, + { + "epoch": 0.08017997231195201, + "grad_norm": 0.2442033439874649, + "learning_rate": 0.00016032295271049598, + "loss": 0.8798, + "step": 1390 + }, + { + "epoch": 0.08046838947854176, + "grad_norm": 0.250362366437912, + "learning_rate": 0.00016089965397923876, + "loss": 0.9301, + "step": 1395 + }, + { + "epoch": 0.08075680664513152, + "grad_norm": 0.2477293759584427, + "learning_rate": 0.00016147635524798155, + "loss": 0.9561, + "step": 1400 + }, + { + "epoch": 0.08104522381172127, + "grad_norm": 0.23329582810401917, + "learning_rate": 0.00016205305651672435, + "loss": 0.9505, + "step": 1405 + }, + { + "epoch": 0.08133364097831103, + "grad_norm": 0.24549901485443115, + "learning_rate": 0.00016262975778546715, + "loss": 1.0284, + "step": 1410 + }, + { + "epoch": 0.08162205814490078, + "grad_norm": 0.24419653415679932, + "learning_rate": 0.00016320645905420992, + "loss": 0.9114, + "step": 1415 + }, + { + "epoch": 0.08191047531149054, + "grad_norm": 0.24551044404506683, + "learning_rate": 0.00016378316032295272, + "loss": 0.9574, + "step": 1420 + }, + { + "epoch": 0.0821988924780803, + "grad_norm": 0.29641515016555786, + "learning_rate": 0.00016435986159169552, + "loss": 0.9821, + "step": 1425 + }, + { + "epoch": 0.08248730964467005, + "grad_norm": 0.24953129887580872, + "learning_rate": 0.0001649365628604383, + "loss": 0.9966, + "step": 1430 + }, + { + "epoch": 0.0827757268112598, + "grad_norm": 0.25181591510772705, + "learning_rate": 0.0001655132641291811, + "loss": 1.023, + "step": 1435 + }, + { + "epoch": 0.08306414397784956, + "grad_norm": 0.2478877305984497, + "learning_rate": 0.00016608996539792386, + "loss": 0.9762, + "step": 1440 + }, + { + "epoch": 0.08335256114443931, + "grad_norm": 0.24414442479610443, + "learning_rate": 0.0001666666666666667, + "loss": 0.9339, + "step": 1445 + }, + { + "epoch": 0.08364097831102907, + "grad_norm": 0.24295495450496674, + "learning_rate": 0.00016724336793540946, + "loss": 1.0144, + "step": 1450 + }, + { + "epoch": 0.08392939547761882, + "grad_norm": 0.25291165709495544, + "learning_rate": 0.00016782006920415226, + "loss": 0.916, + "step": 1455 + }, + { + "epoch": 0.08421781264420858, + "grad_norm": 0.23744194209575653, + "learning_rate": 0.00016839677047289503, + "loss": 0.952, + "step": 1460 + }, + { + "epoch": 0.08450622981079833, + "grad_norm": 0.24316394329071045, + "learning_rate": 0.00016897347174163786, + "loss": 0.9725, + "step": 1465 + }, + { + "epoch": 0.08479464697738809, + "grad_norm": 0.23748493194580078, + "learning_rate": 0.00016955017301038063, + "loss": 0.9831, + "step": 1470 + }, + { + "epoch": 0.08508306414397784, + "grad_norm": 0.25356602668762207, + "learning_rate": 0.00017012687427912343, + "loss": 0.9632, + "step": 1475 + }, + { + "epoch": 0.0853714813105676, + "grad_norm": 0.24660415947437286, + "learning_rate": 0.00017070357554786622, + "loss": 0.9319, + "step": 1480 + }, + { + "epoch": 0.08565989847715735, + "grad_norm": 0.25426214933395386, + "learning_rate": 0.000171280276816609, + "loss": 1.0245, + "step": 1485 + }, + { + "epoch": 0.08594831564374712, + "grad_norm": 0.23765899240970612, + "learning_rate": 0.0001718569780853518, + "loss": 0.9202, + "step": 1490 + }, + { + "epoch": 0.08623673281033688, + "grad_norm": 0.24204228818416595, + "learning_rate": 0.00017243367935409457, + "loss": 0.9974, + "step": 1495 + }, + { + "epoch": 0.08652514997692663, + "grad_norm": 0.23034018278121948, + "learning_rate": 0.0001730103806228374, + "loss": 0.9251, + "step": 1500 + }, + { + "epoch": 0.08681356714351639, + "grad_norm": 0.24768561124801636, + "learning_rate": 0.00017358708189158016, + "loss": 0.957, + "step": 1505 + }, + { + "epoch": 0.08710198431010614, + "grad_norm": 0.24252378940582275, + "learning_rate": 0.00017416378316032296, + "loss": 0.9347, + "step": 1510 + }, + { + "epoch": 0.0873904014766959, + "grad_norm": 0.24422116577625275, + "learning_rate": 0.00017474048442906573, + "loss": 0.956, + "step": 1515 + }, + { + "epoch": 0.08767881864328565, + "grad_norm": 0.25470009446144104, + "learning_rate": 0.00017531718569780856, + "loss": 0.9355, + "step": 1520 + }, + { + "epoch": 0.08796723580987541, + "grad_norm": 0.240427628159523, + "learning_rate": 0.00017589388696655133, + "loss": 1.0345, + "step": 1525 + }, + { + "epoch": 0.08825565297646516, + "grad_norm": 0.2679055631160736, + "learning_rate": 0.00017647058823529413, + "loss": 1.0215, + "step": 1530 + }, + { + "epoch": 0.08854407014305492, + "grad_norm": 0.2706778943538666, + "learning_rate": 0.00017704728950403693, + "loss": 0.9951, + "step": 1535 + }, + { + "epoch": 0.08883248730964467, + "grad_norm": 0.24882011115550995, + "learning_rate": 0.00017762399077277973, + "loss": 1.0267, + "step": 1540 + }, + { + "epoch": 0.08912090447623443, + "grad_norm": 0.24369126558303833, + "learning_rate": 0.0001782006920415225, + "loss": 1.046, + "step": 1545 + }, + { + "epoch": 0.08940932164282418, + "grad_norm": 0.27035751938819885, + "learning_rate": 0.0001787773933102653, + "loss": 1.0522, + "step": 1550 + }, + { + "epoch": 0.08969773880941394, + "grad_norm": 0.25707873702049255, + "learning_rate": 0.0001793540945790081, + "loss": 0.9507, + "step": 1555 + }, + { + "epoch": 0.08998615597600369, + "grad_norm": 0.26456013321876526, + "learning_rate": 0.00017993079584775087, + "loss": 0.9941, + "step": 1560 + }, + { + "epoch": 0.09027457314259345, + "grad_norm": 0.26937803626060486, + "learning_rate": 0.00018050749711649367, + "loss": 1.0267, + "step": 1565 + }, + { + "epoch": 0.0905629903091832, + "grad_norm": 0.2615615725517273, + "learning_rate": 0.00018108419838523644, + "loss": 0.984, + "step": 1570 + }, + { + "epoch": 0.09085140747577296, + "grad_norm": 0.23720060288906097, + "learning_rate": 0.00018166089965397926, + "loss": 0.9401, + "step": 1575 + }, + { + "epoch": 0.09113982464236271, + "grad_norm": 0.24640457332134247, + "learning_rate": 0.00018223760092272203, + "loss": 1.086, + "step": 1580 + }, + { + "epoch": 0.09142824180895247, + "grad_norm": 0.2521013915538788, + "learning_rate": 0.00018281430219146483, + "loss": 0.9619, + "step": 1585 + }, + { + "epoch": 0.09171665897554222, + "grad_norm": 0.23948408663272858, + "learning_rate": 0.0001833910034602076, + "loss": 0.9835, + "step": 1590 + }, + { + "epoch": 0.09200507614213198, + "grad_norm": 0.25325456261634827, + "learning_rate": 0.00018396770472895043, + "loss": 1.0552, + "step": 1595 + }, + { + "epoch": 0.09229349330872173, + "grad_norm": 0.24731087684631348, + "learning_rate": 0.0001845444059976932, + "loss": 0.9253, + "step": 1600 + }, + { + "epoch": 0.09258191047531149, + "grad_norm": 0.26164206862449646, + "learning_rate": 0.000185121107266436, + "loss": 0.9396, + "step": 1605 + }, + { + "epoch": 0.09287032764190124, + "grad_norm": 0.25318196415901184, + "learning_rate": 0.0001856978085351788, + "loss": 0.9431, + "step": 1610 + }, + { + "epoch": 0.093158744808491, + "grad_norm": 0.2592536211013794, + "learning_rate": 0.00018627450980392157, + "loss": 0.9955, + "step": 1615 + }, + { + "epoch": 0.09344716197508075, + "grad_norm": 0.2497592270374298, + "learning_rate": 0.00018685121107266437, + "loss": 0.9844, + "step": 1620 + }, + { + "epoch": 0.09373557914167051, + "grad_norm": 0.2648375630378723, + "learning_rate": 0.00018742791234140714, + "loss": 0.9655, + "step": 1625 + }, + { + "epoch": 0.09402399630826026, + "grad_norm": 0.25172188878059387, + "learning_rate": 0.00018800461361014997, + "loss": 1.0322, + "step": 1630 + }, + { + "epoch": 0.09431241347485002, + "grad_norm": 0.24844340980052948, + "learning_rate": 0.00018858131487889274, + "loss": 0.9636, + "step": 1635 + }, + { + "epoch": 0.09460083064143977, + "grad_norm": 0.25023674964904785, + "learning_rate": 0.00018915801614763554, + "loss": 0.9601, + "step": 1640 + }, + { + "epoch": 0.09488924780802953, + "grad_norm": 0.2417484074831009, + "learning_rate": 0.0001897347174163783, + "loss": 0.9748, + "step": 1645 + }, + { + "epoch": 0.09517766497461928, + "grad_norm": 0.2597021162509918, + "learning_rate": 0.00019031141868512113, + "loss": 0.9672, + "step": 1650 + }, + { + "epoch": 0.09546608214120904, + "grad_norm": 0.25209182500839233, + "learning_rate": 0.0001908881199538639, + "loss": 0.9766, + "step": 1655 + }, + { + "epoch": 0.0957544993077988, + "grad_norm": 0.2704354226589203, + "learning_rate": 0.0001914648212226067, + "loss": 0.9658, + "step": 1660 + }, + { + "epoch": 0.09604291647438856, + "grad_norm": 0.2553963363170624, + "learning_rate": 0.00019204152249134948, + "loss": 0.972, + "step": 1665 + }, + { + "epoch": 0.09633133364097832, + "grad_norm": 0.25183454155921936, + "learning_rate": 0.00019261822376009227, + "loss": 0.9312, + "step": 1670 + }, + { + "epoch": 0.09661975080756807, + "grad_norm": 0.27272742986679077, + "learning_rate": 0.00019319492502883507, + "loss": 1.0585, + "step": 1675 + }, + { + "epoch": 0.09690816797415783, + "grad_norm": 0.25347381830215454, + "learning_rate": 0.00019377162629757784, + "loss": 1.0013, + "step": 1680 + }, + { + "epoch": 0.09719658514074758, + "grad_norm": 0.26412150263786316, + "learning_rate": 0.00019434832756632067, + "loss": 0.9175, + "step": 1685 + }, + { + "epoch": 0.09748500230733734, + "grad_norm": 0.2841266393661499, + "learning_rate": 0.00019492502883506344, + "loss": 0.8907, + "step": 1690 + }, + { + "epoch": 0.09777341947392709, + "grad_norm": 0.2843879163265228, + "learning_rate": 0.00019550173010380624, + "loss": 0.9952, + "step": 1695 + }, + { + "epoch": 0.09806183664051685, + "grad_norm": 0.24573901295661926, + "learning_rate": 0.000196078431372549, + "loss": 1.0093, + "step": 1700 + }, + { + "epoch": 0.0983502538071066, + "grad_norm": 0.25996410846710205, + "learning_rate": 0.00019665513264129184, + "loss": 1.0403, + "step": 1705 + }, + { + "epoch": 0.09863867097369636, + "grad_norm": 0.26386144757270813, + "learning_rate": 0.0001972318339100346, + "loss": 1.0211, + "step": 1710 + }, + { + "epoch": 0.09892708814028611, + "grad_norm": 0.26584669947624207, + "learning_rate": 0.0001978085351787774, + "loss": 0.9985, + "step": 1715 + }, + { + "epoch": 0.09921550530687587, + "grad_norm": 0.25835517048835754, + "learning_rate": 0.00019838523644752018, + "loss": 0.9615, + "step": 1720 + }, + { + "epoch": 0.09950392247346562, + "grad_norm": 0.2537446618080139, + "learning_rate": 0.000198961937716263, + "loss": 0.9851, + "step": 1725 + }, + { + "epoch": 0.09979233964005538, + "grad_norm": 0.2637675702571869, + "learning_rate": 0.00019953863898500578, + "loss": 0.9991, + "step": 1730 + }, + { + "epoch": 0.10008075680664513, + "grad_norm": 0.2486466020345688, + "learning_rate": 0.00019999999797274117, + "loss": 0.928, + "step": 1735 + }, + { + "epoch": 0.10036917397323489, + "grad_norm": 0.31705260276794434, + "learning_rate": 0.0001999999270186907, + "loss": 0.9909, + "step": 1740 + }, + { + "epoch": 0.10065759113982464, + "grad_norm": 0.2822314500808716, + "learning_rate": 0.0001999997547017808, + "loss": 0.9688, + "step": 1745 + }, + { + "epoch": 0.1009460083064144, + "grad_norm": 0.2564781606197357, + "learning_rate": 0.0001999994810221862, + "loss": 0.9515, + "step": 1750 + }, + { + "epoch": 0.10123442547300415, + "grad_norm": 0.2958817183971405, + "learning_rate": 0.00019999910598018426, + "loss": 0.9859, + "step": 1755 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 0.25060567259788513, + "learning_rate": 0.00019999862957615513, + "loss": 1.0043, + "step": 1760 + }, + { + "epoch": 0.10181125980618366, + "grad_norm": 0.2674092650413513, + "learning_rate": 0.00019999805181058176, + "loss": 0.9626, + "step": 1765 + }, + { + "epoch": 0.10209967697277342, + "grad_norm": 0.2575248181819916, + "learning_rate": 0.00019999737268404973, + "loss": 1.0265, + "step": 1770 + }, + { + "epoch": 0.10238809413936317, + "grad_norm": 0.2554805278778076, + "learning_rate": 0.00019999659219724749, + "loss": 0.9661, + "step": 1775 + }, + { + "epoch": 0.10267651130595293, + "grad_norm": 0.26680126786231995, + "learning_rate": 0.00019999571035096608, + "loss": 1.0231, + "step": 1780 + }, + { + "epoch": 0.10296492847254268, + "grad_norm": 0.25776219367980957, + "learning_rate": 0.00019999472714609943, + "loss": 0.9058, + "step": 1785 + }, + { + "epoch": 0.10325334563913244, + "grad_norm": 0.2542843818664551, + "learning_rate": 0.00019999364258364413, + "loss": 0.9773, + "step": 1790 + }, + { + "epoch": 0.10354176280572219, + "grad_norm": 0.2621992826461792, + "learning_rate": 0.0001999924566646995, + "loss": 0.9559, + "step": 1795 + }, + { + "epoch": 0.10383017997231195, + "grad_norm": 0.2683923840522766, + "learning_rate": 0.00019999116939046764, + "loss": 1.0355, + "step": 1800 + }, + { + "epoch": 0.1041185971389017, + "grad_norm": 0.24701032042503357, + "learning_rate": 0.0001999897807622534, + "loss": 1.0906, + "step": 1805 + }, + { + "epoch": 0.10440701430549146, + "grad_norm": 0.25396963953971863, + "learning_rate": 0.0001999882907814643, + "loss": 1.0226, + "step": 1810 + }, + { + "epoch": 0.10469543147208121, + "grad_norm": 0.28205832839012146, + "learning_rate": 0.00019998669944961062, + "loss": 0.9224, + "step": 1815 + }, + { + "epoch": 0.10498384863867097, + "grad_norm": 0.26078683137893677, + "learning_rate": 0.0001999850067683054, + "loss": 0.9427, + "step": 1820 + }, + { + "epoch": 0.10527226580526072, + "grad_norm": 0.25481727719306946, + "learning_rate": 0.00019998321273926437, + "loss": 1.0042, + "step": 1825 + }, + { + "epoch": 0.10556068297185048, + "grad_norm": 0.25570574402809143, + "learning_rate": 0.00019998131736430604, + "loss": 0.9722, + "step": 1830 + }, + { + "epoch": 0.10584910013844025, + "grad_norm": 0.2734397351741791, + "learning_rate": 0.00019997932064535158, + "loss": 1.001, + "step": 1835 + }, + { + "epoch": 0.10613751730503, + "grad_norm": 0.27242162823677063, + "learning_rate": 0.00019997722258442499, + "loss": 0.9647, + "step": 1840 + }, + { + "epoch": 0.10642593447161976, + "grad_norm": 0.2732183635234833, + "learning_rate": 0.00019997502318365286, + "loss": 0.9697, + "step": 1845 + }, + { + "epoch": 0.10671435163820951, + "grad_norm": 0.26898330450057983, + "learning_rate": 0.00019997272244526456, + "loss": 0.9284, + "step": 1850 + }, + { + "epoch": 0.10700276880479927, + "grad_norm": 0.2656812071800232, + "learning_rate": 0.00019997032037159224, + "loss": 1.0368, + "step": 1855 + }, + { + "epoch": 0.10729118597138902, + "grad_norm": 0.2728678584098816, + "learning_rate": 0.00019996781696507069, + "loss": 1.0147, + "step": 1860 + }, + { + "epoch": 0.10757960313797878, + "grad_norm": 0.2543455958366394, + "learning_rate": 0.00019996521222823743, + "loss": 0.954, + "step": 1865 + }, + { + "epoch": 0.10786802030456853, + "grad_norm": 0.27658751606941223, + "learning_rate": 0.00019996250616373268, + "loss": 0.9796, + "step": 1870 + }, + { + "epoch": 0.10815643747115829, + "grad_norm": 0.27136722207069397, + "learning_rate": 0.00019995969877429945, + "loss": 0.9125, + "step": 1875 + }, + { + "epoch": 0.10844485463774804, + "grad_norm": 0.2712014317512512, + "learning_rate": 0.0001999567900627833, + "loss": 1.0053, + "step": 1880 + }, + { + "epoch": 0.1087332718043378, + "grad_norm": 0.2740635573863983, + "learning_rate": 0.0001999537800321327, + "loss": 0.9951, + "step": 1885 + }, + { + "epoch": 0.10902168897092755, + "grad_norm": 0.26667481660842896, + "learning_rate": 0.0001999506686853986, + "loss": 1.0062, + "step": 1890 + }, + { + "epoch": 0.10931010613751731, + "grad_norm": 0.2604423463344574, + "learning_rate": 0.0001999474560257348, + "loss": 0.9852, + "step": 1895 + }, + { + "epoch": 0.10959852330410706, + "grad_norm": 0.27640554308891296, + "learning_rate": 0.00019994414205639775, + "loss": 0.959, + "step": 1900 + }, + { + "epoch": 0.10988694047069682, + "grad_norm": 0.25489839911460876, + "learning_rate": 0.00019994072678074655, + "loss": 0.9957, + "step": 1905 + }, + { + "epoch": 0.11017535763728657, + "grad_norm": 0.2796529233455658, + "learning_rate": 0.00019993721020224308, + "loss": 0.9418, + "step": 1910 + }, + { + "epoch": 0.11046377480387633, + "grad_norm": 0.2622373402118683, + "learning_rate": 0.00019993359232445176, + "loss": 0.9573, + "step": 1915 + }, + { + "epoch": 0.11075219197046608, + "grad_norm": 0.2514156997203827, + "learning_rate": 0.0001999298731510399, + "loss": 0.9373, + "step": 1920 + }, + { + "epoch": 0.11104060913705584, + "grad_norm": 0.2672327160835266, + "learning_rate": 0.00019992605268577727, + "loss": 0.9097, + "step": 1925 + }, + { + "epoch": 0.11132902630364559, + "grad_norm": 0.26772674918174744, + "learning_rate": 0.00019992213093253643, + "loss": 1.0108, + "step": 1930 + }, + { + "epoch": 0.11161744347023535, + "grad_norm": 0.2462950050830841, + "learning_rate": 0.00019991810789529257, + "loss": 1.0006, + "step": 1935 + }, + { + "epoch": 0.1119058606368251, + "grad_norm": 0.26759883761405945, + "learning_rate": 0.0001999139835781236, + "loss": 0.9758, + "step": 1940 + }, + { + "epoch": 0.11219427780341486, + "grad_norm": 0.2841535806655884, + "learning_rate": 0.00019990975798521, + "loss": 1.0408, + "step": 1945 + }, + { + "epoch": 0.11248269497000461, + "grad_norm": 0.2822214365005493, + "learning_rate": 0.00019990543112083503, + "loss": 0.9317, + "step": 1950 + }, + { + "epoch": 0.11277111213659437, + "grad_norm": 0.2670351564884186, + "learning_rate": 0.00019990100298938442, + "loss": 0.9536, + "step": 1955 + }, + { + "epoch": 0.11305952930318412, + "grad_norm": 0.27470991015434265, + "learning_rate": 0.00019989647359534672, + "loss": 1.0404, + "step": 1960 + }, + { + "epoch": 0.11334794646977388, + "grad_norm": 0.2892574071884155, + "learning_rate": 0.00019989184294331308, + "loss": 0.9912, + "step": 1965 + }, + { + "epoch": 0.11363636363636363, + "grad_norm": 0.28786224126815796, + "learning_rate": 0.0001998871110379772, + "loss": 1.048, + "step": 1970 + }, + { + "epoch": 0.11392478080295339, + "grad_norm": 0.2730783522129059, + "learning_rate": 0.0001998822778841355, + "loss": 1.0148, + "step": 1975 + }, + { + "epoch": 0.11421319796954314, + "grad_norm": 0.25908493995666504, + "learning_rate": 0.00019987734348668706, + "loss": 0.9237, + "step": 1980 + }, + { + "epoch": 0.1145016151361329, + "grad_norm": 0.2924931049346924, + "learning_rate": 0.00019987230785063344, + "loss": 1.0084, + "step": 1985 + }, + { + "epoch": 0.11479003230272265, + "grad_norm": 0.2685001790523529, + "learning_rate": 0.00019986717098107896, + "loss": 0.977, + "step": 1990 + }, + { + "epoch": 0.11507844946931241, + "grad_norm": 0.26407670974731445, + "learning_rate": 0.0001998619328832305, + "loss": 1.0132, + "step": 1995 + }, + { + "epoch": 0.11536686663590216, + "grad_norm": 0.2581160366535187, + "learning_rate": 0.00019985659356239758, + "loss": 1.0553, + "step": 2000 + }, + { + "epoch": 0.11565528380249192, + "grad_norm": 0.2579261064529419, + "learning_rate": 0.0001998511530239922, + "loss": 0.992, + "step": 2005 + }, + { + "epoch": 0.11594370096908169, + "grad_norm": 0.27874529361724854, + "learning_rate": 0.00019984561127352914, + "loss": 1.0208, + "step": 2010 + }, + { + "epoch": 0.11623211813567144, + "grad_norm": 0.2448752522468567, + "learning_rate": 0.00019983996831662566, + "loss": 1.0272, + "step": 2015 + }, + { + "epoch": 0.1165205353022612, + "grad_norm": 0.2515913248062134, + "learning_rate": 0.00019983422415900158, + "loss": 1.0251, + "step": 2020 + }, + { + "epoch": 0.11680895246885095, + "grad_norm": 0.2612157464027405, + "learning_rate": 0.0001998283788064794, + "loss": 0.9298, + "step": 2025 + }, + { + "epoch": 0.1170973696354407, + "grad_norm": 0.2781950533390045, + "learning_rate": 0.00019982243226498411, + "loss": 1.0191, + "step": 2030 + }, + { + "epoch": 0.11738578680203046, + "grad_norm": 0.27393776178359985, + "learning_rate": 0.00019981638454054333, + "loss": 0.8712, + "step": 2035 + }, + { + "epoch": 0.11767420396862022, + "grad_norm": 0.271932452917099, + "learning_rate": 0.00019981023563928716, + "loss": 0.9644, + "step": 2040 + }, + { + "epoch": 0.11796262113520997, + "grad_norm": 0.2659457325935364, + "learning_rate": 0.00019980398556744837, + "loss": 0.9295, + "step": 2045 + }, + { + "epoch": 0.11825103830179973, + "grad_norm": 0.2813827395439148, + "learning_rate": 0.00019979763433136216, + "loss": 0.975, + "step": 2050 + }, + { + "epoch": 0.11853945546838948, + "grad_norm": 0.24046528339385986, + "learning_rate": 0.00019979118193746637, + "loss": 0.9836, + "step": 2055 + }, + { + "epoch": 0.11882787263497924, + "grad_norm": 0.27069780230522156, + "learning_rate": 0.00019978462839230133, + "loss": 1.0503, + "step": 2060 + }, + { + "epoch": 0.11911628980156899, + "grad_norm": 0.2609676718711853, + "learning_rate": 0.00019977797370250986, + "loss": 0.959, + "step": 2065 + }, + { + "epoch": 0.11940470696815875, + "grad_norm": 0.2760465145111084, + "learning_rate": 0.0001997712178748374, + "loss": 1.0014, + "step": 2070 + }, + { + "epoch": 0.1196931241347485, + "grad_norm": 0.2539708614349365, + "learning_rate": 0.00019976436091613184, + "loss": 1.0215, + "step": 2075 + }, + { + "epoch": 0.11998154130133826, + "grad_norm": 0.27062153816223145, + "learning_rate": 0.0001997574028333436, + "loss": 0.964, + "step": 2080 + }, + { + "epoch": 0.12026995846792801, + "grad_norm": 0.26900675892829895, + "learning_rate": 0.00019975034363352556, + "loss": 0.935, + "step": 2085 + }, + { + "epoch": 0.12055837563451777, + "grad_norm": 0.27462172508239746, + "learning_rate": 0.0001997431833238332, + "loss": 0.974, + "step": 2090 + }, + { + "epoch": 0.12084679280110752, + "grad_norm": 0.3665010333061218, + "learning_rate": 0.00019973592191152437, + "loss": 1.0159, + "step": 2095 + }, + { + "epoch": 0.12113520996769728, + "grad_norm": 0.28900420665740967, + "learning_rate": 0.00019972855940395947, + "loss": 1.0202, + "step": 2100 + }, + { + "epoch": 0.12142362713428703, + "grad_norm": 0.2706412374973297, + "learning_rate": 0.00019972109580860132, + "loss": 0.9766, + "step": 2105 + }, + { + "epoch": 0.12171204430087679, + "grad_norm": 0.28748854994773865, + "learning_rate": 0.00019971353113301527, + "loss": 1.095, + "step": 2110 + }, + { + "epoch": 0.12200046146746654, + "grad_norm": 0.2745112180709839, + "learning_rate": 0.0001997058653848691, + "loss": 0.9995, + "step": 2115 + }, + { + "epoch": 0.1222888786340563, + "grad_norm": 0.27372869849205017, + "learning_rate": 0.00019969809857193306, + "loss": 0.9582, + "step": 2120 + }, + { + "epoch": 0.12257729580064605, + "grad_norm": 0.2714395821094513, + "learning_rate": 0.00019969023070207973, + "loss": 0.9423, + "step": 2125 + }, + { + "epoch": 0.12286571296723581, + "grad_norm": 0.26695722341537476, + "learning_rate": 0.0001996822617832843, + "loss": 0.9192, + "step": 2130 + }, + { + "epoch": 0.12315413013382556, + "grad_norm": 0.2779480814933777, + "learning_rate": 0.00019967419182362429, + "loss": 0.9577, + "step": 2135 + }, + { + "epoch": 0.12344254730041532, + "grad_norm": 0.279851496219635, + "learning_rate": 0.0001996660208312796, + "loss": 0.9946, + "step": 2140 + }, + { + "epoch": 0.12373096446700507, + "grad_norm": 0.2676329016685486, + "learning_rate": 0.00019965774881453263, + "loss": 1.0293, + "step": 2145 + }, + { + "epoch": 0.12401938163359483, + "grad_norm": 0.2577393054962158, + "learning_rate": 0.00019964937578176816, + "loss": 0.9845, + "step": 2150 + }, + { + "epoch": 0.12430779880018458, + "grad_norm": 0.2870205342769623, + "learning_rate": 0.00019964090174147327, + "loss": 0.9747, + "step": 2155 + }, + { + "epoch": 0.12459621596677434, + "grad_norm": 0.2597945034503937, + "learning_rate": 0.00019963232670223752, + "loss": 0.9896, + "step": 2160 + }, + { + "epoch": 0.12488463313336409, + "grad_norm": 0.3189765512943268, + "learning_rate": 0.00019962365067275286, + "loss": 0.9538, + "step": 2165 + }, + { + "epoch": 0.12517305029995385, + "grad_norm": 0.27205929160118103, + "learning_rate": 0.00019961487366181355, + "loss": 0.9626, + "step": 2170 + }, + { + "epoch": 0.1254614674665436, + "grad_norm": 0.26647019386291504, + "learning_rate": 0.0001996059956783162, + "loss": 1.0142, + "step": 2175 + }, + { + "epoch": 0.12574988463313336, + "grad_norm": 0.2724989652633667, + "learning_rate": 0.00019959701673125983, + "loss": 1.0228, + "step": 2180 + }, + { + "epoch": 0.1260383017997231, + "grad_norm": 0.27627307176589966, + "learning_rate": 0.00019958793682974574, + "loss": 0.9744, + "step": 2185 + }, + { + "epoch": 0.12632671896631287, + "grad_norm": 0.2836136221885681, + "learning_rate": 0.00019957875598297759, + "loss": 1.0011, + "step": 2190 + }, + { + "epoch": 0.12661513613290262, + "grad_norm": 0.26454490423202515, + "learning_rate": 0.00019956947420026136, + "loss": 1.0463, + "step": 2195 + }, + { + "epoch": 0.12690355329949238, + "grad_norm": 0.29074445366859436, + "learning_rate": 0.00019956009149100533, + "loss": 0.9643, + "step": 2200 + }, + { + "epoch": 0.12719197046608213, + "grad_norm": 0.2764613926410675, + "learning_rate": 0.00019955060786472012, + "loss": 0.9245, + "step": 2205 + }, + { + "epoch": 0.1274803876326719, + "grad_norm": 0.2702649235725403, + "learning_rate": 0.00019954102333101856, + "loss": 0.9734, + "step": 2210 + }, + { + "epoch": 0.12776880479926164, + "grad_norm": 0.28136304020881653, + "learning_rate": 0.00019953133789961584, + "loss": 0.9782, + "step": 2215 + }, + { + "epoch": 0.1280572219658514, + "grad_norm": 0.29559558629989624, + "learning_rate": 0.0001995215515803294, + "loss": 0.9708, + "step": 2220 + }, + { + "epoch": 0.12834563913244115, + "grad_norm": 0.2811656892299652, + "learning_rate": 0.00019951166438307894, + "loss": 0.9839, + "step": 2225 + }, + { + "epoch": 0.1286340562990309, + "grad_norm": 0.27432867884635925, + "learning_rate": 0.00019950167631788642, + "loss": 0.9697, + "step": 2230 + }, + { + "epoch": 0.12892247346562066, + "grad_norm": 0.28106796741485596, + "learning_rate": 0.000199491587394876, + "loss": 0.9526, + "step": 2235 + }, + { + "epoch": 0.12921089063221042, + "grad_norm": 0.2755594253540039, + "learning_rate": 0.00019948139762427416, + "loss": 0.9943, + "step": 2240 + }, + { + "epoch": 0.12949930779880017, + "grad_norm": 0.27341076731681824, + "learning_rate": 0.00019947110701640952, + "loss": 0.9661, + "step": 2245 + }, + { + "epoch": 0.12978772496538993, + "grad_norm": 0.2582038938999176, + "learning_rate": 0.000199460715581713, + "loss": 0.9083, + "step": 2250 + }, + { + "epoch": 0.13007614213197968, + "grad_norm": 0.2739073932170868, + "learning_rate": 0.00019945022333071752, + "loss": 1.0518, + "step": 2255 + }, + { + "epoch": 0.13036455929856944, + "grad_norm": 0.2646303176879883, + "learning_rate": 0.0001994396302740585, + "loss": 0.9709, + "step": 2260 + }, + { + "epoch": 0.1306529764651592, + "grad_norm": 0.2723826766014099, + "learning_rate": 0.00019942893642247326, + "loss": 0.9845, + "step": 2265 + }, + { + "epoch": 0.13094139363174895, + "grad_norm": 0.27351605892181396, + "learning_rate": 0.00019941814178680144, + "loss": 1.0138, + "step": 2270 + }, + { + "epoch": 0.13122981079833873, + "grad_norm": 0.2802083492279053, + "learning_rate": 0.00019940724637798477, + "loss": 0.9364, + "step": 2275 + }, + { + "epoch": 0.13151822796492849, + "grad_norm": 0.27607461810112, + "learning_rate": 0.00019939625020706724, + "loss": 0.9931, + "step": 2280 + }, + { + "epoch": 0.13180664513151824, + "grad_norm": 0.270385205745697, + "learning_rate": 0.0001993851532851948, + "loss": 0.9763, + "step": 2285 + }, + { + "epoch": 0.132095062298108, + "grad_norm": 0.2873282730579376, + "learning_rate": 0.00019937395562361564, + "loss": 1.0417, + "step": 2290 + }, + { + "epoch": 0.13238347946469775, + "grad_norm": 0.2726912796497345, + "learning_rate": 0.0001993626572336801, + "loss": 0.9555, + "step": 2295 + }, + { + "epoch": 0.1326718966312875, + "grad_norm": 0.2793363332748413, + "learning_rate": 0.00019935125812684047, + "loss": 0.9883, + "step": 2300 + }, + { + "epoch": 0.13296031379787726, + "grad_norm": 0.2792257070541382, + "learning_rate": 0.0001993397583146513, + "loss": 1.0003, + "step": 2305 + }, + { + "epoch": 0.13324873096446702, + "grad_norm": 0.27051353454589844, + "learning_rate": 0.00019932815780876904, + "loss": 0.9726, + "step": 2310 + }, + { + "epoch": 0.13353714813105677, + "grad_norm": 0.28619712591171265, + "learning_rate": 0.00019931645662095237, + "loss": 0.9621, + "step": 2315 + }, + { + "epoch": 0.13382556529764653, + "grad_norm": 0.27812543511390686, + "learning_rate": 0.00019930465476306197, + "loss": 0.9909, + "step": 2320 + }, + { + "epoch": 0.13411398246423628, + "grad_norm": 0.27520883083343506, + "learning_rate": 0.0001992927522470605, + "loss": 1.0185, + "step": 2325 + }, + { + "epoch": 0.13440239963082604, + "grad_norm": 0.27513301372528076, + "learning_rate": 0.00019928074908501272, + "loss": 0.9595, + "step": 2330 + }, + { + "epoch": 0.1346908167974158, + "grad_norm": 0.29639777541160583, + "learning_rate": 0.0001992686452890854, + "loss": 0.9819, + "step": 2335 + }, + { + "epoch": 0.13497923396400555, + "grad_norm": 0.2893521189689636, + "learning_rate": 0.00019925644087154734, + "loss": 0.9894, + "step": 2340 + }, + { + "epoch": 0.1352676511305953, + "grad_norm": 0.267421156167984, + "learning_rate": 0.0001992441358447692, + "loss": 0.9882, + "step": 2345 + }, + { + "epoch": 0.13555606829718506, + "grad_norm": 0.2774795591831207, + "learning_rate": 0.00019923173022122378, + "loss": 0.9404, + "step": 2350 + }, + { + "epoch": 0.1358444854637748, + "grad_norm": 0.30167555809020996, + "learning_rate": 0.00019921922401348576, + "loss": 0.9631, + "step": 2355 + }, + { + "epoch": 0.13613290263036457, + "grad_norm": 0.2823658287525177, + "learning_rate": 0.00019920661723423183, + "loss": 0.9271, + "step": 2360 + }, + { + "epoch": 0.13642131979695432, + "grad_norm": 0.2752264142036438, + "learning_rate": 0.00019919390989624054, + "loss": 0.981, + "step": 2365 + }, + { + "epoch": 0.13670973696354408, + "grad_norm": 0.284186989068985, + "learning_rate": 0.00019918110201239247, + "loss": 1.0279, + "step": 2370 + }, + { + "epoch": 0.13699815413013383, + "grad_norm": 0.2601034343242645, + "learning_rate": 0.00019916819359567001, + "loss": 1.0219, + "step": 2375 + }, + { + "epoch": 0.1372865712967236, + "grad_norm": 0.3391975164413452, + "learning_rate": 0.00019915518465915758, + "loss": 0.9432, + "step": 2380 + }, + { + "epoch": 0.13757498846331334, + "grad_norm": 0.3057229816913605, + "learning_rate": 0.0001991420752160414, + "loss": 1.0415, + "step": 2385 + }, + { + "epoch": 0.1378634056299031, + "grad_norm": 0.2857256829738617, + "learning_rate": 0.00019912886527960954, + "loss": 0.9896, + "step": 2390 + }, + { + "epoch": 0.13815182279649285, + "grad_norm": 0.4211989641189575, + "learning_rate": 0.00019911555486325203, + "loss": 1.0471, + "step": 2395 + }, + { + "epoch": 0.1384402399630826, + "grad_norm": 0.26847025752067566, + "learning_rate": 0.0001991021439804607, + "loss": 1.0071, + "step": 2400 + }, + { + "epoch": 0.13872865712967236, + "grad_norm": 0.27097341418266296, + "learning_rate": 0.00019908863264482917, + "loss": 0.9493, + "step": 2405 + }, + { + "epoch": 0.13901707429626212, + "grad_norm": 0.2873136103153229, + "learning_rate": 0.00019907502087005297, + "loss": 1.0064, + "step": 2410 + }, + { + "epoch": 0.13930549146285187, + "grad_norm": 0.2804831564426422, + "learning_rate": 0.00019906130866992935, + "loss": 0.9483, + "step": 2415 + }, + { + "epoch": 0.13959390862944163, + "grad_norm": 0.27144983410835266, + "learning_rate": 0.00019904749605835742, + "loss": 0.9541, + "step": 2420 + }, + { + "epoch": 0.13988232579603138, + "grad_norm": 0.2791461944580078, + "learning_rate": 0.00019903358304933805, + "loss": 1.0228, + "step": 2425 + }, + { + "epoch": 0.14017074296262114, + "grad_norm": 0.2839184105396271, + "learning_rate": 0.00019901956965697387, + "loss": 0.9853, + "step": 2430 + }, + { + "epoch": 0.1404591601292109, + "grad_norm": 0.2938236594200134, + "learning_rate": 0.0001990054558954693, + "loss": 1.0175, + "step": 2435 + }, + { + "epoch": 0.14074757729580065, + "grad_norm": 0.26195093989372253, + "learning_rate": 0.00019899124177913041, + "loss": 0.9927, + "step": 2440 + }, + { + "epoch": 0.1410359944623904, + "grad_norm": 0.282997727394104, + "learning_rate": 0.0001989769273223651, + "loss": 0.9148, + "step": 2445 + }, + { + "epoch": 0.14132441162898016, + "grad_norm": 0.2869815230369568, + "learning_rate": 0.00019896251253968288, + "loss": 0.9978, + "step": 2450 + }, + { + "epoch": 0.1416128287955699, + "grad_norm": 0.30306002497673035, + "learning_rate": 0.000198947997445695, + "loss": 0.9793, + "step": 2455 + }, + { + "epoch": 0.14190124596215967, + "grad_norm": 0.2726587951183319, + "learning_rate": 0.0001989333820551144, + "loss": 0.8918, + "step": 2460 + }, + { + "epoch": 0.14218966312874942, + "grad_norm": 0.3028129041194916, + "learning_rate": 0.00019891866638275564, + "loss": 1.0184, + "step": 2465 + }, + { + "epoch": 0.14247808029533918, + "grad_norm": 0.27245384454727173, + "learning_rate": 0.00019890385044353501, + "loss": 0.9187, + "step": 2470 + }, + { + "epoch": 0.14276649746192893, + "grad_norm": 0.26684272289276123, + "learning_rate": 0.00019888893425247032, + "loss": 0.94, + "step": 2475 + }, + { + "epoch": 0.1430549146285187, + "grad_norm": 0.26761725544929504, + "learning_rate": 0.00019887391782468113, + "loss": 0.9606, + "step": 2480 + }, + { + "epoch": 0.14334333179510844, + "grad_norm": 0.2789659798145294, + "learning_rate": 0.00019885880117538846, + "loss": 0.9361, + "step": 2485 + }, + { + "epoch": 0.1436317489616982, + "grad_norm": 0.2568376362323761, + "learning_rate": 0.000198843584319915, + "loss": 1.0155, + "step": 2490 + }, + { + "epoch": 0.14392016612828795, + "grad_norm": 0.29699787497520447, + "learning_rate": 0.00019882826727368508, + "loss": 1.0136, + "step": 2495 + }, + { + "epoch": 0.1442085832948777, + "grad_norm": 0.3011142313480377, + "learning_rate": 0.0001988128500522244, + "loss": 0.9967, + "step": 2500 + }, + { + "epoch": 0.14449700046146746, + "grad_norm": 0.27386248111724854, + "learning_rate": 0.00019879733267116035, + "loss": 1.0263, + "step": 2505 + }, + { + "epoch": 0.14478541762805722, + "grad_norm": 0.31453463435173035, + "learning_rate": 0.00019878171514622187, + "loss": 0.9307, + "step": 2510 + }, + { + "epoch": 0.14507383479464697, + "grad_norm": 0.2672314941883087, + "learning_rate": 0.0001987659974932392, + "loss": 0.9441, + "step": 2515 + }, + { + "epoch": 0.14536225196123673, + "grad_norm": 0.2847091257572174, + "learning_rate": 0.00019875017972814435, + "loss": 0.9868, + "step": 2520 + }, + { + "epoch": 0.14565066912782648, + "grad_norm": 0.28868651390075684, + "learning_rate": 0.0001987342618669706, + "loss": 0.9296, + "step": 2525 + }, + { + "epoch": 0.14593908629441624, + "grad_norm": 0.29168251156806946, + "learning_rate": 0.00019871824392585276, + "loss": 0.9317, + "step": 2530 + }, + { + "epoch": 0.146227503461006, + "grad_norm": 0.2743743062019348, + "learning_rate": 0.00019870212592102711, + "loss": 1.0277, + "step": 2535 + }, + { + "epoch": 0.14651592062759575, + "grad_norm": 0.2812393605709076, + "learning_rate": 0.00019868590786883134, + "loss": 1.0553, + "step": 2540 + }, + { + "epoch": 0.1468043377941855, + "grad_norm": 0.2678181231021881, + "learning_rate": 0.00019866958978570452, + "loss": 0.8821, + "step": 2545 + }, + { + "epoch": 0.14709275496077526, + "grad_norm": 0.3037974238395691, + "learning_rate": 0.00019865317168818713, + "loss": 0.9625, + "step": 2550 + }, + { + "epoch": 0.147381172127365, + "grad_norm": 0.2820071578025818, + "learning_rate": 0.00019863665359292108, + "loss": 1.0259, + "step": 2555 + }, + { + "epoch": 0.14766958929395477, + "grad_norm": 0.2591807544231415, + "learning_rate": 0.0001986200355166495, + "loss": 0.9521, + "step": 2560 + }, + { + "epoch": 0.14795800646054452, + "grad_norm": 0.26036834716796875, + "learning_rate": 0.0001986033174762171, + "loss": 0.94, + "step": 2565 + }, + { + "epoch": 0.14824642362713428, + "grad_norm": 0.27297431230545044, + "learning_rate": 0.0001985864994885697, + "loss": 0.9859, + "step": 2570 + }, + { + "epoch": 0.14853484079372403, + "grad_norm": 0.27806761860847473, + "learning_rate": 0.00019856958157075445, + "loss": 1.0, + "step": 2575 + }, + { + "epoch": 0.1488232579603138, + "grad_norm": 0.2749041020870209, + "learning_rate": 0.00019855256373991993, + "loss": 0.9111, + "step": 2580 + }, + { + "epoch": 0.14911167512690354, + "grad_norm": 0.28046393394470215, + "learning_rate": 0.0001985354460133159, + "loss": 0.9089, + "step": 2585 + }, + { + "epoch": 0.1494000922934933, + "grad_norm": 0.2683013379573822, + "learning_rate": 0.00019851822840829338, + "loss": 0.9122, + "step": 2590 + }, + { + "epoch": 0.14968850946008305, + "grad_norm": 0.28444692492485046, + "learning_rate": 0.0001985009109423046, + "loss": 0.9987, + "step": 2595 + }, + { + "epoch": 0.1499769266266728, + "grad_norm": 0.28526070713996887, + "learning_rate": 0.0001984834936329031, + "loss": 1.0177, + "step": 2600 + }, + { + "epoch": 0.15026534379326256, + "grad_norm": 0.2751544415950775, + "learning_rate": 0.00019846597649774358, + "loss": 1.0602, + "step": 2605 + }, + { + "epoch": 0.15055376095985232, + "grad_norm": 0.29558390378952026, + "learning_rate": 0.00019844835955458193, + "loss": 1.0015, + "step": 2610 + }, + { + "epoch": 0.15084217812644207, + "grad_norm": 0.27498286962509155, + "learning_rate": 0.00019843064282127511, + "loss": 0.9561, + "step": 2615 + }, + { + "epoch": 0.15113059529303183, + "grad_norm": 0.292961061000824, + "learning_rate": 0.00019841282631578145, + "loss": 0.9914, + "step": 2620 + }, + { + "epoch": 0.1514190124596216, + "grad_norm": 0.3029356896877289, + "learning_rate": 0.0001983949100561602, + "loss": 0.9801, + "step": 2625 + }, + { + "epoch": 0.15170742962621137, + "grad_norm": 0.2864689230918884, + "learning_rate": 0.00019837689406057183, + "loss": 0.9578, + "step": 2630 + }, + { + "epoch": 0.15199584679280112, + "grad_norm": 0.2750813961029053, + "learning_rate": 0.00019835877834727787, + "loss": 0.9483, + "step": 2635 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 0.27926185727119446, + "learning_rate": 0.00019834056293464093, + "loss": 1.0165, + "step": 2640 + }, + { + "epoch": 0.15257268112598063, + "grad_norm": 0.27533864974975586, + "learning_rate": 0.00019832224784112473, + "loss": 1.0241, + "step": 2645 + }, + { + "epoch": 0.15286109829257039, + "grad_norm": 0.276993989944458, + "learning_rate": 0.00019830383308529393, + "loss": 1.0444, + "step": 2650 + }, + { + "epoch": 0.15314951545916014, + "grad_norm": 0.2960858643054962, + "learning_rate": 0.0001982853186858143, + "loss": 0.9928, + "step": 2655 + }, + { + "epoch": 0.1534379326257499, + "grad_norm": 0.29162392020225525, + "learning_rate": 0.00019826670466145262, + "loss": 0.8887, + "step": 2660 + }, + { + "epoch": 0.15372634979233965, + "grad_norm": 0.2606879472732544, + "learning_rate": 0.0001982479910310765, + "loss": 0.9832, + "step": 2665 + }, + { + "epoch": 0.1540147669589294, + "grad_norm": 0.29048001766204834, + "learning_rate": 0.00019822917781365474, + "loss": 1.01, + "step": 2670 + }, + { + "epoch": 0.15430318412551916, + "grad_norm": 0.2942920923233032, + "learning_rate": 0.00019821026502825687, + "loss": 1.0289, + "step": 2675 + }, + { + "epoch": 0.15459160129210892, + "grad_norm": 0.2862975597381592, + "learning_rate": 0.00019819125269405352, + "loss": 0.9961, + "step": 2680 + }, + { + "epoch": 0.15488001845869867, + "grad_norm": 0.2896837890148163, + "learning_rate": 0.00019817214083031614, + "loss": 1.0002, + "step": 2685 + }, + { + "epoch": 0.15516843562528843, + "grad_norm": 0.26825401186943054, + "learning_rate": 0.00019815292945641705, + "loss": 0.9874, + "step": 2690 + }, + { + "epoch": 0.15545685279187818, + "grad_norm": 0.2813914120197296, + "learning_rate": 0.00019813361859182945, + "loss": 0.9919, + "step": 2695 + }, + { + "epoch": 0.15574526995846794, + "grad_norm": 0.284069687128067, + "learning_rate": 0.0001981142082561274, + "loss": 0.8997, + "step": 2700 + }, + { + "epoch": 0.1560336871250577, + "grad_norm": 0.2858209013938904, + "learning_rate": 0.00019809469846898586, + "loss": 0.9546, + "step": 2705 + }, + { + "epoch": 0.15632210429164745, + "grad_norm": 0.2836093604564667, + "learning_rate": 0.0001980750892501804, + "loss": 0.9254, + "step": 2710 + }, + { + "epoch": 0.1566105214582372, + "grad_norm": 0.32628414034843445, + "learning_rate": 0.00019805538061958765, + "loss": 0.94, + "step": 2715 + }, + { + "epoch": 0.15689893862482696, + "grad_norm": 0.2873879373073578, + "learning_rate": 0.0001980355725971847, + "loss": 0.9598, + "step": 2720 + }, + { + "epoch": 0.1571873557914167, + "grad_norm": 0.27270689606666565, + "learning_rate": 0.00019801566520304963, + "loss": 0.9622, + "step": 2725 + }, + { + "epoch": 0.15747577295800647, + "grad_norm": 0.25972458720207214, + "learning_rate": 0.0001979956584573612, + "loss": 0.9895, + "step": 2730 + }, + { + "epoch": 0.15776419012459622, + "grad_norm": 0.2917114198207855, + "learning_rate": 0.00019797555238039872, + "loss": 0.9528, + "step": 2735 + }, + { + "epoch": 0.15805260729118598, + "grad_norm": 0.26294592022895813, + "learning_rate": 0.00019795534699254238, + "loss": 0.9309, + "step": 2740 + }, + { + "epoch": 0.15834102445777573, + "grad_norm": 0.28122779726982117, + "learning_rate": 0.0001979350423142729, + "loss": 0.9853, + "step": 2745 + }, + { + "epoch": 0.15862944162436549, + "grad_norm": 0.29183605313301086, + "learning_rate": 0.00019791463836617176, + "loss": 0.9382, + "step": 2750 + }, + { + "epoch": 0.15891785879095524, + "grad_norm": 0.28074556589126587, + "learning_rate": 0.00019789413516892098, + "loss": 1.01, + "step": 2755 + }, + { + "epoch": 0.159206275957545, + "grad_norm": 0.2814944088459015, + "learning_rate": 0.00019787353274330313, + "loss": 1.0161, + "step": 2760 + }, + { + "epoch": 0.15949469312413475, + "grad_norm": 0.2898254990577698, + "learning_rate": 0.00019785283111020156, + "loss": 1.0388, + "step": 2765 + }, + { + "epoch": 0.1597831102907245, + "grad_norm": 0.2777402400970459, + "learning_rate": 0.00019783203029059997, + "loss": 0.9589, + "step": 2770 + }, + { + "epoch": 0.16007152745731426, + "grad_norm": 0.2646116316318512, + "learning_rate": 0.00019781113030558267, + "loss": 0.9569, + "step": 2775 + }, + { + "epoch": 0.16035994462390402, + "grad_norm": 0.3243483304977417, + "learning_rate": 0.00019779013117633454, + "loss": 0.9622, + "step": 2780 + }, + { + "epoch": 0.16064836179049377, + "grad_norm": 0.2765612304210663, + "learning_rate": 0.0001977690329241409, + "loss": 1.0068, + "step": 2785 + }, + { + "epoch": 0.16093677895708353, + "grad_norm": 0.30408522486686707, + "learning_rate": 0.00019774783557038755, + "loss": 0.969, + "step": 2790 + }, + { + "epoch": 0.16122519612367328, + "grad_norm": 0.26990190148353577, + "learning_rate": 0.00019772653913656076, + "loss": 1.025, + "step": 2795 + }, + { + "epoch": 0.16151361329026304, + "grad_norm": 0.31291985511779785, + "learning_rate": 0.00019770514364424725, + "loss": 1.0174, + "step": 2800 + }, + { + "epoch": 0.1618020304568528, + "grad_norm": 0.31198903918266296, + "learning_rate": 0.00019768364911513405, + "loss": 0.9603, + "step": 2805 + }, + { + "epoch": 0.16209044762344255, + "grad_norm": 0.28119274973869324, + "learning_rate": 0.00019766205557100868, + "loss": 0.9689, + "step": 2810 + }, + { + "epoch": 0.1623788647900323, + "grad_norm": 0.27684643864631653, + "learning_rate": 0.000197640363033759, + "loss": 0.9272, + "step": 2815 + }, + { + "epoch": 0.16266728195662206, + "grad_norm": 0.2740548253059387, + "learning_rate": 0.0001976185715253732, + "loss": 1.0165, + "step": 2820 + }, + { + "epoch": 0.1629556991232118, + "grad_norm": 0.3126582205295563, + "learning_rate": 0.00019759668106793975, + "loss": 0.9915, + "step": 2825 + }, + { + "epoch": 0.16324411628980157, + "grad_norm": 0.27744656801223755, + "learning_rate": 0.0001975746916836475, + "loss": 0.9971, + "step": 2830 + }, + { + "epoch": 0.16353253345639132, + "grad_norm": 0.280280202627182, + "learning_rate": 0.00019755260339478556, + "loss": 0.9637, + "step": 2835 + }, + { + "epoch": 0.16382095062298108, + "grad_norm": 0.2840816378593445, + "learning_rate": 0.0001975304162237432, + "loss": 0.9603, + "step": 2840 + }, + { + "epoch": 0.16410936778957083, + "grad_norm": 0.2826577126979828, + "learning_rate": 0.00019750813019301004, + "loss": 1.0331, + "step": 2845 + }, + { + "epoch": 0.1643977849561606, + "grad_norm": 0.2963692545890808, + "learning_rate": 0.00019748574532517586, + "loss": 0.999, + "step": 2850 + }, + { + "epoch": 0.16468620212275034, + "grad_norm": 0.2895634174346924, + "learning_rate": 0.00019746326164293056, + "loss": 0.9637, + "step": 2855 + }, + { + "epoch": 0.1649746192893401, + "grad_norm": 0.287422776222229, + "learning_rate": 0.0001974406791690643, + "loss": 0.9696, + "step": 2860 + }, + { + "epoch": 0.16526303645592985, + "grad_norm": 0.31378328800201416, + "learning_rate": 0.00019741799792646734, + "loss": 1.0066, + "step": 2865 + }, + { + "epoch": 0.1655514536225196, + "grad_norm": 0.28587618470191956, + "learning_rate": 0.00019739521793813006, + "loss": 0.9224, + "step": 2870 + }, + { + "epoch": 0.16583987078910936, + "grad_norm": 0.28385454416275024, + "learning_rate": 0.0001973723392271429, + "loss": 0.9961, + "step": 2875 + }, + { + "epoch": 0.16612828795569912, + "grad_norm": 0.27586954832077026, + "learning_rate": 0.00019734936181669638, + "loss": 1.065, + "step": 2880 + }, + { + "epoch": 0.16641670512228887, + "grad_norm": 0.30055347084999084, + "learning_rate": 0.00019732628573008114, + "loss": 1.0089, + "step": 2885 + }, + { + "epoch": 0.16670512228887863, + "grad_norm": 0.30119630694389343, + "learning_rate": 0.00019730311099068771, + "loss": 1.017, + "step": 2890 + }, + { + "epoch": 0.16699353945546838, + "grad_norm": 0.29206573963165283, + "learning_rate": 0.00019727983762200677, + "loss": 0.9635, + "step": 2895 + }, + { + "epoch": 0.16728195662205814, + "grad_norm": 0.2570163905620575, + "learning_rate": 0.00019725646564762878, + "loss": 0.9791, + "step": 2900 + }, + { + "epoch": 0.1675703737886479, + "grad_norm": 0.3360570967197418, + "learning_rate": 0.00019723299509124433, + "loss": 0.9498, + "step": 2905 + }, + { + "epoch": 0.16785879095523765, + "grad_norm": 0.29323843121528625, + "learning_rate": 0.00019720942597664385, + "loss": 0.986, + "step": 2910 + }, + { + "epoch": 0.1681472081218274, + "grad_norm": 0.30418166518211365, + "learning_rate": 0.00019718575832771768, + "loss": 0.9756, + "step": 2915 + }, + { + "epoch": 0.16843562528841716, + "grad_norm": 0.31183257699012756, + "learning_rate": 0.00019716199216845604, + "loss": 0.9997, + "step": 2920 + }, + { + "epoch": 0.1687240424550069, + "grad_norm": 0.26834046840667725, + "learning_rate": 0.000197138127522949, + "loss": 0.9315, + "step": 2925 + }, + { + "epoch": 0.16901245962159667, + "grad_norm": 0.27434879541397095, + "learning_rate": 0.00019711416441538652, + "loss": 1.0105, + "step": 2930 + }, + { + "epoch": 0.16930087678818642, + "grad_norm": 0.28828758001327515, + "learning_rate": 0.00019709010287005825, + "loss": 1.0128, + "step": 2935 + }, + { + "epoch": 0.16958929395477618, + "grad_norm": 0.2850480079650879, + "learning_rate": 0.00019706594291135366, + "loss": 0.9618, + "step": 2940 + }, + { + "epoch": 0.16987771112136593, + "grad_norm": 0.2937301993370056, + "learning_rate": 0.00019704168456376205, + "loss": 1.0175, + "step": 2945 + }, + { + "epoch": 0.1701661282879557, + "grad_norm": 0.28153088688850403, + "learning_rate": 0.0001970173278518724, + "loss": 0.9541, + "step": 2950 + }, + { + "epoch": 0.17045454545454544, + "grad_norm": 0.2839425802230835, + "learning_rate": 0.00019699287280037332, + "loss": 1.0139, + "step": 2955 + }, + { + "epoch": 0.1707429626211352, + "grad_norm": 0.28864094614982605, + "learning_rate": 0.00019696831943405324, + "loss": 1.0833, + "step": 2960 + }, + { + "epoch": 0.17103137978772495, + "grad_norm": 0.2697494626045227, + "learning_rate": 0.0001969436677778001, + "loss": 0.9827, + "step": 2965 + }, + { + "epoch": 0.1713197969543147, + "grad_norm": 0.2844550907611847, + "learning_rate": 0.0001969189178566016, + "loss": 1.005, + "step": 2970 + }, + { + "epoch": 0.1716082141209045, + "grad_norm": 0.30949264764785767, + "learning_rate": 0.000196894069695545, + "loss": 0.9696, + "step": 2975 + }, + { + "epoch": 0.17189663128749424, + "grad_norm": 0.2768407464027405, + "learning_rate": 0.00019686912331981702, + "loss": 0.9931, + "step": 2980 + }, + { + "epoch": 0.172185048454084, + "grad_norm": 0.28683245182037354, + "learning_rate": 0.00019684407875470415, + "loss": 1.0018, + "step": 2985 + }, + { + "epoch": 0.17247346562067375, + "grad_norm": 0.3155616223812103, + "learning_rate": 0.00019681893602559224, + "loss": 0.9813, + "step": 2990 + }, + { + "epoch": 0.1727618827872635, + "grad_norm": 0.3154447376728058, + "learning_rate": 0.0001967936951579667, + "loss": 0.9915, + "step": 2995 + }, + { + "epoch": 0.17305029995385326, + "grad_norm": 0.277576744556427, + "learning_rate": 0.00019676835617741249, + "loss": 0.9668, + "step": 3000 + }, + { + "epoch": 0.17333871712044302, + "grad_norm": 0.28618210554122925, + "learning_rate": 0.0001967429191096138, + "loss": 0.9745, + "step": 3005 + }, + { + "epoch": 0.17362713428703277, + "grad_norm": 0.27911707758903503, + "learning_rate": 0.0001967173839803545, + "loss": 0.9732, + "step": 3010 + }, + { + "epoch": 0.17391555145362253, + "grad_norm": 0.28373172879219055, + "learning_rate": 0.00019669175081551773, + "loss": 0.9797, + "step": 3015 + }, + { + "epoch": 0.17420396862021229, + "grad_norm": 0.29749229550361633, + "learning_rate": 0.00019666601964108598, + "loss": 0.94, + "step": 3020 + }, + { + "epoch": 0.17449238578680204, + "grad_norm": 0.31651487946510315, + "learning_rate": 0.00019664019048314116, + "loss": 0.9829, + "step": 3025 + }, + { + "epoch": 0.1747808029533918, + "grad_norm": 0.2834007740020752, + "learning_rate": 0.00019661426336786445, + "loss": 0.9336, + "step": 3030 + }, + { + "epoch": 0.17506922011998155, + "grad_norm": 0.2876712381839752, + "learning_rate": 0.00019658823832153632, + "loss": 0.9174, + "step": 3035 + }, + { + "epoch": 0.1753576372865713, + "grad_norm": 0.3259499669075012, + "learning_rate": 0.00019656211537053654, + "loss": 1.0362, + "step": 3040 + }, + { + "epoch": 0.17564605445316106, + "grad_norm": 0.26136502623558044, + "learning_rate": 0.00019653589454134406, + "loss": 0.9399, + "step": 3045 + }, + { + "epoch": 0.17593447161975082, + "grad_norm": 0.28630778193473816, + "learning_rate": 0.00019650957586053716, + "loss": 0.9861, + "step": 3050 + }, + { + "epoch": 0.17622288878634057, + "grad_norm": 0.2615172266960144, + "learning_rate": 0.00019648315935479315, + "loss": 1.0378, + "step": 3055 + }, + { + "epoch": 0.17651130595293033, + "grad_norm": 0.28133901953697205, + "learning_rate": 0.00019645664505088864, + "loss": 0.9746, + "step": 3060 + }, + { + "epoch": 0.17679972311952008, + "grad_norm": 0.3203901946544647, + "learning_rate": 0.00019643003297569923, + "loss": 0.9894, + "step": 3065 + }, + { + "epoch": 0.17708814028610984, + "grad_norm": 0.2845044434070587, + "learning_rate": 0.00019640332315619977, + "loss": 1.0024, + "step": 3070 + }, + { + "epoch": 0.1773765574526996, + "grad_norm": 0.28776776790618896, + "learning_rate": 0.0001963765156194641, + "loss": 1.0035, + "step": 3075 + }, + { + "epoch": 0.17766497461928935, + "grad_norm": 0.2923831343650818, + "learning_rate": 0.00019634961039266506, + "loss": 1.0253, + "step": 3080 + }, + { + "epoch": 0.1779533917858791, + "grad_norm": 0.29954782128334045, + "learning_rate": 0.00019632260750307467, + "loss": 0.9984, + "step": 3085 + }, + { + "epoch": 0.17824180895246886, + "grad_norm": 0.30335840582847595, + "learning_rate": 0.0001962955069780638, + "loss": 0.9339, + "step": 3090 + }, + { + "epoch": 0.1785302261190586, + "grad_norm": 0.28872916102409363, + "learning_rate": 0.00019626830884510236, + "loss": 1.0417, + "step": 3095 + }, + { + "epoch": 0.17881864328564837, + "grad_norm": 0.3210926949977875, + "learning_rate": 0.00019624101313175918, + "loss": 1.0293, + "step": 3100 + }, + { + "epoch": 0.17910706045223812, + "grad_norm": 0.29229721426963806, + "learning_rate": 0.00019621361986570194, + "loss": 0.9386, + "step": 3105 + }, + { + "epoch": 0.17939547761882788, + "grad_norm": 0.3137836754322052, + "learning_rate": 0.00019618612907469732, + "loss": 0.9874, + "step": 3110 + }, + { + "epoch": 0.17968389478541763, + "grad_norm": 0.27663466334342957, + "learning_rate": 0.00019615854078661077, + "loss": 0.9902, + "step": 3115 + }, + { + "epoch": 0.17997231195200739, + "grad_norm": 0.30164676904678345, + "learning_rate": 0.00019613085502940658, + "loss": 1.1187, + "step": 3120 + }, + { + "epoch": 0.18026072911859714, + "grad_norm": 0.2817506790161133, + "learning_rate": 0.00019610307183114787, + "loss": 0.9643, + "step": 3125 + }, + { + "epoch": 0.1805491462851869, + "grad_norm": 0.28451189398765564, + "learning_rate": 0.00019607519121999647, + "loss": 0.9553, + "step": 3130 + }, + { + "epoch": 0.18083756345177665, + "grad_norm": 0.3148361146450043, + "learning_rate": 0.00019604721322421303, + "loss": 0.9596, + "step": 3135 + }, + { + "epoch": 0.1811259806183664, + "grad_norm": 0.3131537437438965, + "learning_rate": 0.00019601913787215683, + "loss": 0.9841, + "step": 3140 + }, + { + "epoch": 0.18141439778495616, + "grad_norm": 0.301500141620636, + "learning_rate": 0.00019599096519228585, + "loss": 0.9387, + "step": 3145 + }, + { + "epoch": 0.18170281495154592, + "grad_norm": 0.2999275028705597, + "learning_rate": 0.0001959626952131568, + "loss": 0.8649, + "step": 3150 + }, + { + "epoch": 0.18199123211813567, + "grad_norm": 0.3055667281150818, + "learning_rate": 0.00019593432796342496, + "loss": 1.0364, + "step": 3155 + }, + { + "epoch": 0.18227964928472543, + "grad_norm": 0.30451443791389465, + "learning_rate": 0.00019590586347184417, + "loss": 1.0552, + "step": 3160 + }, + { + "epoch": 0.18256806645131518, + "grad_norm": 0.3046397566795349, + "learning_rate": 0.00019587730176726686, + "loss": 0.9897, + "step": 3165 + }, + { + "epoch": 0.18285648361790494, + "grad_norm": 0.3132875859737396, + "learning_rate": 0.00019584864287864408, + "loss": 0.953, + "step": 3170 + }, + { + "epoch": 0.1831449007844947, + "grad_norm": 0.2684531807899475, + "learning_rate": 0.00019581988683502525, + "loss": 1.0479, + "step": 3175 + }, + { + "epoch": 0.18343331795108445, + "grad_norm": 0.3220478594303131, + "learning_rate": 0.0001957910336655584, + "loss": 0.9818, + "step": 3180 + }, + { + "epoch": 0.1837217351176742, + "grad_norm": 0.29744499921798706, + "learning_rate": 0.00019576208339948988, + "loss": 0.985, + "step": 3185 + }, + { + "epoch": 0.18401015228426396, + "grad_norm": 0.26757848262786865, + "learning_rate": 0.00019573303606616459, + "loss": 0.9966, + "step": 3190 + }, + { + "epoch": 0.1842985694508537, + "grad_norm": 0.2966987192630768, + "learning_rate": 0.00019570389169502569, + "loss": 0.9853, + "step": 3195 + }, + { + "epoch": 0.18458698661744347, + "grad_norm": 0.2907325327396393, + "learning_rate": 0.00019567465031561487, + "loss": 1.0468, + "step": 3200 + }, + { + "epoch": 0.18487540378403322, + "grad_norm": 0.2841055989265442, + "learning_rate": 0.00019564531195757193, + "loss": 0.9837, + "step": 3205 + }, + { + "epoch": 0.18516382095062298, + "grad_norm": 0.2998584806919098, + "learning_rate": 0.0001956158766506352, + "loss": 1.0282, + "step": 3210 + }, + { + "epoch": 0.18545223811721273, + "grad_norm": 0.3043042719364166, + "learning_rate": 0.00019558634442464113, + "loss": 0.911, + "step": 3215 + }, + { + "epoch": 0.18574065528380249, + "grad_norm": 0.30067190527915955, + "learning_rate": 0.00019555671530952445, + "loss": 0.9701, + "step": 3220 + }, + { + "epoch": 0.18602907245039224, + "grad_norm": 0.297343373298645, + "learning_rate": 0.00019552698933531808, + "loss": 0.9935, + "step": 3225 + }, + { + "epoch": 0.186317489616982, + "grad_norm": 0.2842741310596466, + "learning_rate": 0.00019549716653215318, + "loss": 0.999, + "step": 3230 + }, + { + "epoch": 0.18660590678357175, + "grad_norm": 0.27844905853271484, + "learning_rate": 0.00019546724693025896, + "loss": 0.9668, + "step": 3235 + }, + { + "epoch": 0.1868943239501615, + "grad_norm": 0.29974377155303955, + "learning_rate": 0.00019543723055996282, + "loss": 0.9864, + "step": 3240 + }, + { + "epoch": 0.18718274111675126, + "grad_norm": 0.2982295751571655, + "learning_rate": 0.0001954071174516903, + "loss": 0.9902, + "step": 3245 + }, + { + "epoch": 0.18747115828334102, + "grad_norm": 0.3086935579776764, + "learning_rate": 0.00019537690763596487, + "loss": 0.9954, + "step": 3250 + }, + { + "epoch": 0.18775957544993077, + "grad_norm": 0.28824785351753235, + "learning_rate": 0.0001953466011434081, + "loss": 0.9979, + "step": 3255 + }, + { + "epoch": 0.18804799261652053, + "grad_norm": 0.2743071913719177, + "learning_rate": 0.00019531619800473952, + "loss": 0.9299, + "step": 3260 + }, + { + "epoch": 0.18833640978311028, + "grad_norm": 0.2896062433719635, + "learning_rate": 0.00019528569825077668, + "loss": 0.9861, + "step": 3265 + }, + { + "epoch": 0.18862482694970004, + "grad_norm": 0.29393669962882996, + "learning_rate": 0.00019525510191243498, + "loss": 1.0792, + "step": 3270 + }, + { + "epoch": 0.1889132441162898, + "grad_norm": 0.3489181399345398, + "learning_rate": 0.00019522440902072782, + "loss": 1.0056, + "step": 3275 + }, + { + "epoch": 0.18920166128287955, + "grad_norm": 0.31945231556892395, + "learning_rate": 0.0001951936196067664, + "loss": 1.0386, + "step": 3280 + }, + { + "epoch": 0.1894900784494693, + "grad_norm": 0.30114686489105225, + "learning_rate": 0.00019516273370175972, + "loss": 0.9667, + "step": 3285 + }, + { + "epoch": 0.18977849561605906, + "grad_norm": 0.3653857409954071, + "learning_rate": 0.00019513175133701474, + "loss": 0.9465, + "step": 3290 + }, + { + "epoch": 0.1900669127826488, + "grad_norm": 0.2919418513774872, + "learning_rate": 0.000195100672543936, + "loss": 0.9252, + "step": 3295 + }, + { + "epoch": 0.19035532994923857, + "grad_norm": 0.29241377115249634, + "learning_rate": 0.00019506949735402588, + "loss": 0.929, + "step": 3300 + }, + { + "epoch": 0.19064374711582832, + "grad_norm": 0.30068260431289673, + "learning_rate": 0.00019503822579888453, + "loss": 1.0254, + "step": 3305 + }, + { + "epoch": 0.19093216428241808, + "grad_norm": 0.2954903542995453, + "learning_rate": 0.00019500685791020968, + "loss": 0.9485, + "step": 3310 + }, + { + "epoch": 0.19122058144900783, + "grad_norm": 0.2899206876754761, + "learning_rate": 0.00019497539371979674, + "loss": 1.036, + "step": 3315 + }, + { + "epoch": 0.1915089986155976, + "grad_norm": 0.3165214955806732, + "learning_rate": 0.00019494383325953875, + "loss": 0.9616, + "step": 3320 + }, + { + "epoch": 0.19179741578218737, + "grad_norm": 0.3250178396701813, + "learning_rate": 0.0001949121765614263, + "loss": 0.9648, + "step": 3325 + }, + { + "epoch": 0.19208583294877712, + "grad_norm": 0.2635006904602051, + "learning_rate": 0.00019488042365754758, + "loss": 0.9789, + "step": 3330 + }, + { + "epoch": 0.19237425011536688, + "grad_norm": 0.2964721620082855, + "learning_rate": 0.0001948485745800882, + "loss": 0.9432, + "step": 3335 + }, + { + "epoch": 0.19266266728195663, + "grad_norm": 0.2993474006652832, + "learning_rate": 0.0001948166293613314, + "loss": 0.9556, + "step": 3340 + }, + { + "epoch": 0.1929510844485464, + "grad_norm": 0.28304216265678406, + "learning_rate": 0.00019478458803365772, + "loss": 0.9445, + "step": 3345 + }, + { + "epoch": 0.19323950161513614, + "grad_norm": 0.2697024941444397, + "learning_rate": 0.00019475245062954523, + "loss": 1.0552, + "step": 3350 + }, + { + "epoch": 0.1935279187817259, + "grad_norm": 0.2875863015651703, + "learning_rate": 0.00019472021718156937, + "loss": 0.9319, + "step": 3355 + }, + { + "epoch": 0.19381633594831565, + "grad_norm": 0.3006811738014221, + "learning_rate": 0.00019468788772240286, + "loss": 1.0049, + "step": 3360 + }, + { + "epoch": 0.1941047531149054, + "grad_norm": 0.30004388093948364, + "learning_rate": 0.0001946554622848158, + "loss": 1.0181, + "step": 3365 + }, + { + "epoch": 0.19439317028149516, + "grad_norm": 0.3029836118221283, + "learning_rate": 0.00019462294090167554, + "loss": 1.045, + "step": 3370 + }, + { + "epoch": 0.19468158744808492, + "grad_norm": 0.2854270339012146, + "learning_rate": 0.00019459032360594677, + "loss": 0.9876, + "step": 3375 + }, + { + "epoch": 0.19497000461467467, + "grad_norm": 0.3001527786254883, + "learning_rate": 0.0001945576104306913, + "loss": 0.9083, + "step": 3380 + }, + { + "epoch": 0.19525842178126443, + "grad_norm": 0.2907600700855255, + "learning_rate": 0.00019452480140906819, + "loss": 0.9734, + "step": 3385 + }, + { + "epoch": 0.19554683894785418, + "grad_norm": 0.2804548442363739, + "learning_rate": 0.00019449189657433358, + "loss": 1.0032, + "step": 3390 + }, + { + "epoch": 0.19583525611444394, + "grad_norm": 0.29847756028175354, + "learning_rate": 0.0001944588959598408, + "loss": 0.9485, + "step": 3395 + }, + { + "epoch": 0.1961236732810337, + "grad_norm": 0.28965532779693604, + "learning_rate": 0.00019442579959904024, + "loss": 0.9713, + "step": 3400 + }, + { + "epoch": 0.19641209044762345, + "grad_norm": 0.295213520526886, + "learning_rate": 0.00019439260752547935, + "loss": 0.9486, + "step": 3405 + }, + { + "epoch": 0.1967005076142132, + "grad_norm": 0.2934512794017792, + "learning_rate": 0.0001943593197728026, + "loss": 1.0448, + "step": 3410 + }, + { + "epoch": 0.19698892478080296, + "grad_norm": 0.29289090633392334, + "learning_rate": 0.00019432593637475138, + "loss": 0.9959, + "step": 3415 + }, + { + "epoch": 0.19727734194739271, + "grad_norm": 0.2757977545261383, + "learning_rate": 0.00019429245736516415, + "loss": 0.9612, + "step": 3420 + }, + { + "epoch": 0.19756575911398247, + "grad_norm": 0.28514814376831055, + "learning_rate": 0.00019425888277797615, + "loss": 1.0246, + "step": 3425 + }, + { + "epoch": 0.19785417628057222, + "grad_norm": 0.32380256056785583, + "learning_rate": 0.00019422521264721962, + "loss": 0.9404, + "step": 3430 + }, + { + "epoch": 0.19814259344716198, + "grad_norm": 0.28507691621780396, + "learning_rate": 0.0001941914470070236, + "loss": 0.8902, + "step": 3435 + }, + { + "epoch": 0.19843101061375173, + "grad_norm": 0.3757873773574829, + "learning_rate": 0.00019415758589161385, + "loss": 1.0038, + "step": 3440 + }, + { + "epoch": 0.1987194277803415, + "grad_norm": 0.3061589300632477, + "learning_rate": 0.00019412362933531307, + "loss": 0.8961, + "step": 3445 + }, + { + "epoch": 0.19900784494693124, + "grad_norm": 0.29617950320243835, + "learning_rate": 0.0001940895773725406, + "loss": 0.9573, + "step": 3450 + }, + { + "epoch": 0.199296262113521, + "grad_norm": 0.27990731596946716, + "learning_rate": 0.00019405543003781251, + "loss": 1.044, + "step": 3455 + }, + { + "epoch": 0.19958467928011075, + "grad_norm": 0.29822319746017456, + "learning_rate": 0.00019402118736574155, + "loss": 0.9799, + "step": 3460 + }, + { + "epoch": 0.1998730964467005, + "grad_norm": 0.3118431866168976, + "learning_rate": 0.00019398684939103707, + "loss": 1.0417, + "step": 3465 + }, + { + "epoch": 0.20016151361329027, + "grad_norm": 0.3202954828739166, + "learning_rate": 0.00019395241614850504, + "loss": 0.9731, + "step": 3470 + }, + { + "epoch": 0.20044993077988002, + "grad_norm": 0.3098292052745819, + "learning_rate": 0.00019391788767304804, + "loss": 0.985, + "step": 3475 + }, + { + "epoch": 0.20073834794646978, + "grad_norm": 0.2931598722934723, + "learning_rate": 0.00019388326399966515, + "loss": 1.0129, + "step": 3480 + }, + { + "epoch": 0.20102676511305953, + "grad_norm": 0.2935352027416229, + "learning_rate": 0.0001938485451634519, + "loss": 0.9402, + "step": 3485 + }, + { + "epoch": 0.20131518227964929, + "grad_norm": 0.3236974775791168, + "learning_rate": 0.00019381373119960033, + "loss": 1.0507, + "step": 3490 + }, + { + "epoch": 0.20160359944623904, + "grad_norm": 0.3834960162639618, + "learning_rate": 0.00019377882214339893, + "loss": 0.9554, + "step": 3495 + }, + { + "epoch": 0.2018920166128288, + "grad_norm": 0.2892552316188812, + "learning_rate": 0.00019374381803023252, + "loss": 1.0119, + "step": 3500 + }, + { + "epoch": 0.20218043377941855, + "grad_norm": 0.29538676142692566, + "learning_rate": 0.0001937087188955823, + "loss": 0.9977, + "step": 3505 + }, + { + "epoch": 0.2024688509460083, + "grad_norm": 0.2964411973953247, + "learning_rate": 0.00019367352477502576, + "loss": 0.9636, + "step": 3510 + }, + { + "epoch": 0.20275726811259806, + "grad_norm": 0.3167349696159363, + "learning_rate": 0.00019363823570423675, + "loss": 0.9345, + "step": 3515 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 0.3199044466018677, + "learning_rate": 0.0001936028517189852, + "loss": 0.913, + "step": 3520 + }, + { + "epoch": 0.20333410244577757, + "grad_norm": 0.27600806951522827, + "learning_rate": 0.00019356737285513748, + "loss": 0.959, + "step": 3525 + }, + { + "epoch": 0.20362251961236733, + "grad_norm": 0.31621217727661133, + "learning_rate": 0.00019353179914865596, + "loss": 1.0437, + "step": 3530 + }, + { + "epoch": 0.20391093677895708, + "grad_norm": 0.30049943923950195, + "learning_rate": 0.00019349613063559916, + "loss": 0.9675, + "step": 3535 + }, + { + "epoch": 0.20419935394554684, + "grad_norm": 0.3039463460445404, + "learning_rate": 0.00019346036735212177, + "loss": 1.0542, + "step": 3540 + }, + { + "epoch": 0.2044877711121366, + "grad_norm": 0.3049977123737335, + "learning_rate": 0.00019342450933447448, + "loss": 0.8974, + "step": 3545 + }, + { + "epoch": 0.20477618827872635, + "grad_norm": 0.2853706181049347, + "learning_rate": 0.00019338855661900405, + "loss": 0.9711, + "step": 3550 + }, + { + "epoch": 0.2050646054453161, + "grad_norm": 0.2970394492149353, + "learning_rate": 0.00019335250924215318, + "loss": 0.9516, + "step": 3555 + }, + { + "epoch": 0.20535302261190586, + "grad_norm": 0.3310398459434509, + "learning_rate": 0.00019331636724046058, + "loss": 0.9293, + "step": 3560 + }, + { + "epoch": 0.2056414397784956, + "grad_norm": 0.2932792901992798, + "learning_rate": 0.0001932801306505608, + "loss": 1.0088, + "step": 3565 + }, + { + "epoch": 0.20592985694508537, + "grad_norm": 0.3343851566314697, + "learning_rate": 0.00019324379950918437, + "loss": 1.0363, + "step": 3570 + }, + { + "epoch": 0.20621827411167512, + "grad_norm": 0.30094677209854126, + "learning_rate": 0.00019320737385315756, + "loss": 1.0072, + "step": 3575 + }, + { + "epoch": 0.20650669127826488, + "grad_norm": 0.28837206959724426, + "learning_rate": 0.00019317085371940246, + "loss": 0.9139, + "step": 3580 + }, + { + "epoch": 0.20679510844485463, + "grad_norm": 0.29000407457351685, + "learning_rate": 0.00019313423914493703, + "loss": 0.9431, + "step": 3585 + }, + { + "epoch": 0.20708352561144439, + "grad_norm": 0.28823748230934143, + "learning_rate": 0.00019309753016687477, + "loss": 0.9281, + "step": 3590 + }, + { + "epoch": 0.20737194277803414, + "grad_norm": 0.30797070264816284, + "learning_rate": 0.00019306072682242505, + "loss": 0.9611, + "step": 3595 + }, + { + "epoch": 0.2076603599446239, + "grad_norm": 0.2971121370792389, + "learning_rate": 0.00019302382914889284, + "loss": 1.0199, + "step": 3600 + }, + { + "epoch": 0.20794877711121365, + "grad_norm": 0.2938947081565857, + "learning_rate": 0.00019298683718367864, + "loss": 0.9275, + "step": 3605 + }, + { + "epoch": 0.2082371942778034, + "grad_norm": 0.3001919686794281, + "learning_rate": 0.00019294975096427862, + "loss": 0.9963, + "step": 3610 + }, + { + "epoch": 0.20852561144439316, + "grad_norm": 0.3122607469558716, + "learning_rate": 0.00019291257052828447, + "loss": 1.0458, + "step": 3615 + }, + { + "epoch": 0.20881402861098292, + "grad_norm": 0.2895052433013916, + "learning_rate": 0.00019287529591338333, + "loss": 0.9592, + "step": 3620 + }, + { + "epoch": 0.20910244577757267, + "grad_norm": 0.2828371822834015, + "learning_rate": 0.0001928379271573579, + "loss": 0.9518, + "step": 3625 + }, + { + "epoch": 0.20939086294416243, + "grad_norm": 0.30132856965065, + "learning_rate": 0.0001928004642980862, + "loss": 0.9374, + "step": 3630 + }, + { + "epoch": 0.20967928011075218, + "grad_norm": 0.4656534194946289, + "learning_rate": 0.0001927629073735417, + "loss": 0.9824, + "step": 3635 + }, + { + "epoch": 0.20996769727734194, + "grad_norm": 0.2774214744567871, + "learning_rate": 0.00019272525642179323, + "loss": 0.9528, + "step": 3640 + }, + { + "epoch": 0.2102561144439317, + "grad_norm": 0.2919476330280304, + "learning_rate": 0.00019268751148100486, + "loss": 0.9404, + "step": 3645 + }, + { + "epoch": 0.21054453161052145, + "grad_norm": 0.3007878065109253, + "learning_rate": 0.00019264967258943595, + "loss": 0.96, + "step": 3650 + }, + { + "epoch": 0.2108329487771112, + "grad_norm": 0.30731719732284546, + "learning_rate": 0.0001926117397854412, + "loss": 0.9321, + "step": 3655 + }, + { + "epoch": 0.21112136594370096, + "grad_norm": 0.32939255237579346, + "learning_rate": 0.0001925737131074703, + "loss": 1.0182, + "step": 3660 + }, + { + "epoch": 0.2114097831102907, + "grad_norm": 0.29776227474212646, + "learning_rate": 0.0001925355925940683, + "loss": 1.0224, + "step": 3665 + }, + { + "epoch": 0.2116982002768805, + "grad_norm": 0.3057902753353119, + "learning_rate": 0.00019249737828387522, + "loss": 0.9812, + "step": 3670 + }, + { + "epoch": 0.21198661744347025, + "grad_norm": 0.3011026382446289, + "learning_rate": 0.0001924590702156262, + "loss": 0.9753, + "step": 3675 + }, + { + "epoch": 0.21227503461006, + "grad_norm": 0.2978782653808594, + "learning_rate": 0.00019242066842815146, + "loss": 1.0129, + "step": 3680 + }, + { + "epoch": 0.21256345177664976, + "grad_norm": 0.2966994047164917, + "learning_rate": 0.00019238217296037614, + "loss": 1.0068, + "step": 3685 + }, + { + "epoch": 0.21285186894323951, + "grad_norm": 0.2818816602230072, + "learning_rate": 0.00019234358385132038, + "loss": 1.0062, + "step": 3690 + }, + { + "epoch": 0.21314028610982927, + "grad_norm": 0.280269980430603, + "learning_rate": 0.00019230490114009928, + "loss": 0.9392, + "step": 3695 + }, + { + "epoch": 0.21342870327641902, + "grad_norm": 0.29371026158332825, + "learning_rate": 0.00019226612486592271, + "loss": 0.8971, + "step": 3700 + }, + { + "epoch": 0.21371712044300878, + "grad_norm": 0.3066560924053192, + "learning_rate": 0.00019222725506809547, + "loss": 0.9893, + "step": 3705 + }, + { + "epoch": 0.21400553760959853, + "grad_norm": 0.31458479166030884, + "learning_rate": 0.00019218829178601713, + "loss": 1.0389, + "step": 3710 + }, + { + "epoch": 0.2142939547761883, + "grad_norm": 0.3057044446468353, + "learning_rate": 0.00019214923505918202, + "loss": 1.0005, + "step": 3715 + }, + { + "epoch": 0.21458237194277804, + "grad_norm": 0.27441418170928955, + "learning_rate": 0.00019211008492717914, + "loss": 0.9777, + "step": 3720 + }, + { + "epoch": 0.2148707891093678, + "grad_norm": 0.2985784113407135, + "learning_rate": 0.00019207084142969225, + "loss": 1.0475, + "step": 3725 + }, + { + "epoch": 0.21515920627595755, + "grad_norm": 0.305512934923172, + "learning_rate": 0.0001920315046064997, + "loss": 0.9554, + "step": 3730 + }, + { + "epoch": 0.2154476234425473, + "grad_norm": 0.3009251356124878, + "learning_rate": 0.0001919920744974745, + "loss": 0.9912, + "step": 3735 + }, + { + "epoch": 0.21573604060913706, + "grad_norm": 0.29489755630493164, + "learning_rate": 0.00019195255114258408, + "loss": 0.9554, + "step": 3740 + }, + { + "epoch": 0.21602445777572682, + "grad_norm": 0.3059771955013275, + "learning_rate": 0.0001919129345818905, + "loss": 0.9819, + "step": 3745 + }, + { + "epoch": 0.21631287494231657, + "grad_norm": 0.3015615940093994, + "learning_rate": 0.00019187322485555031, + "loss": 0.9948, + "step": 3750 + }, + { + "epoch": 0.21660129210890633, + "grad_norm": 0.3108586072921753, + "learning_rate": 0.0001918334220038144, + "loss": 0.9818, + "step": 3755 + }, + { + "epoch": 0.21688970927549608, + "grad_norm": 0.30573326349258423, + "learning_rate": 0.00019179352606702813, + "loss": 0.9519, + "step": 3760 + }, + { + "epoch": 0.21717812644208584, + "grad_norm": 0.2957397997379303, + "learning_rate": 0.00019175353708563117, + "loss": 1.0094, + "step": 3765 + }, + { + "epoch": 0.2174665436086756, + "grad_norm": 0.2969014644622803, + "learning_rate": 0.00019171345510015758, + "loss": 1.0162, + "step": 3770 + }, + { + "epoch": 0.21775496077526535, + "grad_norm": 0.33074361085891724, + "learning_rate": 0.00019167328015123558, + "loss": 0.9382, + "step": 3775 + }, + { + "epoch": 0.2180433779418551, + "grad_norm": 0.2909998297691345, + "learning_rate": 0.0001916330122795877, + "loss": 0.9768, + "step": 3780 + }, + { + "epoch": 0.21833179510844486, + "grad_norm": 0.28647512197494507, + "learning_rate": 0.00019159265152603064, + "loss": 0.9658, + "step": 3785 + }, + { + "epoch": 0.21862021227503461, + "grad_norm": 0.3733946979045868, + "learning_rate": 0.00019155219793147522, + "loss": 1.037, + "step": 3790 + }, + { + "epoch": 0.21890862944162437, + "grad_norm": 0.2883405089378357, + "learning_rate": 0.00019151165153692644, + "loss": 0.9551, + "step": 3795 + }, + { + "epoch": 0.21919704660821412, + "grad_norm": 0.33625394105911255, + "learning_rate": 0.00019147101238348326, + "loss": 0.995, + "step": 3800 + }, + { + "epoch": 0.21948546377480388, + "grad_norm": 0.4042999744415283, + "learning_rate": 0.00019143028051233873, + "loss": 0.9512, + "step": 3805 + }, + { + "epoch": 0.21977388094139363, + "grad_norm": 0.277295857667923, + "learning_rate": 0.00019138945596477994, + "loss": 0.9281, + "step": 3810 + }, + { + "epoch": 0.2200622981079834, + "grad_norm": 0.3070628046989441, + "learning_rate": 0.0001913485387821877, + "loss": 0.938, + "step": 3815 + }, + { + "epoch": 0.22035071527457314, + "grad_norm": 0.2898661494255066, + "learning_rate": 0.00019130752900603702, + "loss": 1.0103, + "step": 3820 + }, + { + "epoch": 0.2206391324411629, + "grad_norm": 0.2981604039669037, + "learning_rate": 0.00019126642667789654, + "loss": 0.9787, + "step": 3825 + }, + { + "epoch": 0.22092754960775265, + "grad_norm": 0.2816370129585266, + "learning_rate": 0.00019122523183942879, + "loss": 1.039, + "step": 3830 + }, + { + "epoch": 0.2212159667743424, + "grad_norm": 0.306822806596756, + "learning_rate": 0.00019118394453239006, + "loss": 1.0161, + "step": 3835 + }, + { + "epoch": 0.22150438394093216, + "grad_norm": 0.29982468485832214, + "learning_rate": 0.00019114256479863038, + "loss": 0.959, + "step": 3840 + }, + { + "epoch": 0.22179280110752192, + "grad_norm": 0.2966124713420868, + "learning_rate": 0.00019110109268009347, + "loss": 0.9996, + "step": 3845 + }, + { + "epoch": 0.22208121827411167, + "grad_norm": 0.3192947208881378, + "learning_rate": 0.00019105952821881668, + "loss": 1.0132, + "step": 3850 + }, + { + "epoch": 0.22236963544070143, + "grad_norm": 0.2927592694759369, + "learning_rate": 0.00019101787145693098, + "loss": 0.9738, + "step": 3855 + }, + { + "epoch": 0.22265805260729118, + "grad_norm": 0.2782720923423767, + "learning_rate": 0.00019097612243666086, + "loss": 0.9538, + "step": 3860 + }, + { + "epoch": 0.22294646977388094, + "grad_norm": 0.32348090410232544, + "learning_rate": 0.0001909342812003244, + "loss": 0.9593, + "step": 3865 + }, + { + "epoch": 0.2232348869404707, + "grad_norm": 0.32968342304229736, + "learning_rate": 0.00019089234779033306, + "loss": 0.9899, + "step": 3870 + }, + { + "epoch": 0.22352330410706045, + "grad_norm": 0.29580381512641907, + "learning_rate": 0.00019085032224919177, + "loss": 0.9515, + "step": 3875 + }, + { + "epoch": 0.2238117212736502, + "grad_norm": 0.27999478578567505, + "learning_rate": 0.00019080820461949886, + "loss": 0.9596, + "step": 3880 + }, + { + "epoch": 0.22410013844023996, + "grad_norm": 0.31083959341049194, + "learning_rate": 0.00019076599494394602, + "loss": 1.0069, + "step": 3885 + }, + { + "epoch": 0.22438855560682971, + "grad_norm": 0.2649812400341034, + "learning_rate": 0.00019072369326531824, + "loss": 0.9238, + "step": 3890 + }, + { + "epoch": 0.22467697277341947, + "grad_norm": 0.2908613383769989, + "learning_rate": 0.00019068129962649365, + "loss": 0.9745, + "step": 3895 + }, + { + "epoch": 0.22496538994000922, + "grad_norm": 0.2983262538909912, + "learning_rate": 0.00019063881407044373, + "loss": 0.9155, + "step": 3900 + }, + { + "epoch": 0.22525380710659898, + "grad_norm": 0.3074907660484314, + "learning_rate": 0.00019059623664023311, + "loss": 1.0384, + "step": 3905 + }, + { + "epoch": 0.22554222427318874, + "grad_norm": 0.3024677336215973, + "learning_rate": 0.00019055356737901952, + "loss": 1.0626, + "step": 3910 + }, + { + "epoch": 0.2258306414397785, + "grad_norm": 0.324719101190567, + "learning_rate": 0.00019051080633005372, + "loss": 0.9757, + "step": 3915 + }, + { + "epoch": 0.22611905860636825, + "grad_norm": 0.31149742007255554, + "learning_rate": 0.00019046795353667965, + "loss": 1.0294, + "step": 3920 + }, + { + "epoch": 0.226407475772958, + "grad_norm": 0.3361373543739319, + "learning_rate": 0.00019042500904233408, + "loss": 0.949, + "step": 3925 + }, + { + "epoch": 0.22669589293954776, + "grad_norm": 0.3346847593784332, + "learning_rate": 0.00019038197289054684, + "loss": 0.9531, + "step": 3930 + }, + { + "epoch": 0.2269843101061375, + "grad_norm": 0.3011166453361511, + "learning_rate": 0.00019033884512494064, + "loss": 0.9515, + "step": 3935 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.350754052400589, + "learning_rate": 0.00019029562578923106, + "loss": 0.9878, + "step": 3940 + }, + { + "epoch": 0.22756114443931702, + "grad_norm": 0.3115714192390442, + "learning_rate": 0.00019025231492722643, + "loss": 0.9914, + "step": 3945 + }, + { + "epoch": 0.22784956160590678, + "grad_norm": 0.29641732573509216, + "learning_rate": 0.000190208912582828, + "loss": 0.9508, + "step": 3950 + }, + { + "epoch": 0.22813797877249653, + "grad_norm": 0.3013533353805542, + "learning_rate": 0.0001901654188000296, + "loss": 0.9551, + "step": 3955 + }, + { + "epoch": 0.22842639593908629, + "grad_norm": 0.3072235584259033, + "learning_rate": 0.0001901218336229178, + "loss": 1.0324, + "step": 3960 + }, + { + "epoch": 0.22871481310567604, + "grad_norm": 0.2967047691345215, + "learning_rate": 0.00019007815709567183, + "loss": 0.9767, + "step": 3965 + }, + { + "epoch": 0.2290032302722658, + "grad_norm": 0.3344308137893677, + "learning_rate": 0.0001900343892625635, + "loss": 1.053, + "step": 3970 + }, + { + "epoch": 0.22929164743885555, + "grad_norm": 0.279471218585968, + "learning_rate": 0.00018999053016795719, + "loss": 0.9597, + "step": 3975 + }, + { + "epoch": 0.2295800646054453, + "grad_norm": 0.3151692748069763, + "learning_rate": 0.00018994657985630972, + "loss": 0.981, + "step": 3980 + }, + { + "epoch": 0.22986848177203506, + "grad_norm": 0.29757049679756165, + "learning_rate": 0.00018990253837217042, + "loss": 0.9948, + "step": 3985 + }, + { + "epoch": 0.23015689893862482, + "grad_norm": 0.29068654775619507, + "learning_rate": 0.00018985840576018107, + "loss": 0.9492, + "step": 3990 + }, + { + "epoch": 0.23044531610521457, + "grad_norm": 0.29149913787841797, + "learning_rate": 0.00018981418206507575, + "loss": 0.9603, + "step": 3995 + }, + { + "epoch": 0.23073373327180433, + "grad_norm": 0.2850954830646515, + "learning_rate": 0.00018976986733168093, + "loss": 1.0198, + "step": 4000 + }, + { + "epoch": 0.23102215043839408, + "grad_norm": 0.3014662563800812, + "learning_rate": 0.00018972546160491528, + "loss": 1.0628, + "step": 4005 + }, + { + "epoch": 0.23131056760498384, + "grad_norm": 0.29958969354629517, + "learning_rate": 0.00018968096492978976, + "loss": 0.9891, + "step": 4010 + }, + { + "epoch": 0.2315989847715736, + "grad_norm": 0.29551297426223755, + "learning_rate": 0.0001896363773514075, + "loss": 0.9811, + "step": 4015 + }, + { + "epoch": 0.23188740193816337, + "grad_norm": 0.30971017479896545, + "learning_rate": 0.0001895916989149638, + "loss": 1.0459, + "step": 4020 + }, + { + "epoch": 0.23217581910475313, + "grad_norm": 0.3282906115055084, + "learning_rate": 0.000189546929665746, + "loss": 1.0698, + "step": 4025 + }, + { + "epoch": 0.23246423627134288, + "grad_norm": 0.3017507493495941, + "learning_rate": 0.00018950206964913355, + "loss": 0.9867, + "step": 4030 + }, + { + "epoch": 0.23275265343793264, + "grad_norm": 0.34195518493652344, + "learning_rate": 0.0001894571189105979, + "loss": 0.9247, + "step": 4035 + }, + { + "epoch": 0.2330410706045224, + "grad_norm": 0.33378762006759644, + "learning_rate": 0.00018941207749570237, + "loss": 1.0384, + "step": 4040 + }, + { + "epoch": 0.23332948777111215, + "grad_norm": 0.325948029756546, + "learning_rate": 0.00018936694545010232, + "loss": 0.9698, + "step": 4045 + }, + { + "epoch": 0.2336179049377019, + "grad_norm": 0.2848076820373535, + "learning_rate": 0.0001893217228195449, + "loss": 1.0036, + "step": 4050 + }, + { + "epoch": 0.23390632210429166, + "grad_norm": 0.30070775747299194, + "learning_rate": 0.0001892764096498691, + "loss": 1.0397, + "step": 4055 + }, + { + "epoch": 0.2341947392708814, + "grad_norm": 0.3177594244480133, + "learning_rate": 0.00018923100598700561, + "loss": 1.0136, + "step": 4060 + }, + { + "epoch": 0.23448315643747117, + "grad_norm": 0.31077563762664795, + "learning_rate": 0.00018918551187697703, + "loss": 0.9457, + "step": 4065 + }, + { + "epoch": 0.23477157360406092, + "grad_norm": 0.2947135865688324, + "learning_rate": 0.00018913992736589746, + "loss": 0.9988, + "step": 4070 + }, + { + "epoch": 0.23505999077065068, + "grad_norm": 0.26377373933792114, + "learning_rate": 0.00018909425249997267, + "loss": 0.9891, + "step": 4075 + }, + { + "epoch": 0.23534840793724043, + "grad_norm": 0.3427537977695465, + "learning_rate": 0.0001890484873255001, + "loss": 0.993, + "step": 4080 + }, + { + "epoch": 0.2356368251038302, + "grad_norm": 0.28606218099594116, + "learning_rate": 0.00018900263188886864, + "loss": 0.9609, + "step": 4085 + }, + { + "epoch": 0.23592524227041994, + "grad_norm": 0.31335821747779846, + "learning_rate": 0.00018895668623655873, + "loss": 0.9278, + "step": 4090 + }, + { + "epoch": 0.2362136594370097, + "grad_norm": 0.3148699104785919, + "learning_rate": 0.00018891065041514224, + "loss": 0.9486, + "step": 4095 + }, + { + "epoch": 0.23650207660359945, + "grad_norm": 0.30335333943367004, + "learning_rate": 0.0001888645244712824, + "loss": 0.9604, + "step": 4100 + }, + { + "epoch": 0.2367904937701892, + "grad_norm": 0.2990083396434784, + "learning_rate": 0.0001888183084517338, + "loss": 0.9277, + "step": 4105 + }, + { + "epoch": 0.23707891093677896, + "grad_norm": 0.3039418160915375, + "learning_rate": 0.00018877200240334236, + "loss": 1.0381, + "step": 4110 + }, + { + "epoch": 0.23736732810336872, + "grad_norm": 0.3109247386455536, + "learning_rate": 0.0001887256063730453, + "loss": 1.0214, + "step": 4115 + }, + { + "epoch": 0.23765574526995847, + "grad_norm": 0.29135051369667053, + "learning_rate": 0.00018867912040787096, + "loss": 1.0111, + "step": 4120 + }, + { + "epoch": 0.23794416243654823, + "grad_norm": 0.29950061440467834, + "learning_rate": 0.0001886325445549389, + "loss": 0.9879, + "step": 4125 + }, + { + "epoch": 0.23823257960313798, + "grad_norm": 0.3028976619243622, + "learning_rate": 0.00018858587886145975, + "loss": 0.9808, + "step": 4130 + }, + { + "epoch": 0.23852099676972774, + "grad_norm": 0.2960391342639923, + "learning_rate": 0.0001885391233747352, + "loss": 0.9033, + "step": 4135 + }, + { + "epoch": 0.2388094139363175, + "grad_norm": 0.28858163952827454, + "learning_rate": 0.00018849227814215805, + "loss": 0.8774, + "step": 4140 + }, + { + "epoch": 0.23909783110290725, + "grad_norm": 0.3187437653541565, + "learning_rate": 0.00018844534321121195, + "loss": 1.032, + "step": 4145 + }, + { + "epoch": 0.239386248269497, + "grad_norm": 0.30050045251846313, + "learning_rate": 0.00018839831862947152, + "loss": 0.9785, + "step": 4150 + }, + { + "epoch": 0.23967466543608676, + "grad_norm": 0.3172016739845276, + "learning_rate": 0.0001883512044446023, + "loss": 1.0049, + "step": 4155 + }, + { + "epoch": 0.23996308260267651, + "grad_norm": 0.2758901119232178, + "learning_rate": 0.00018830400070436057, + "loss": 0.8758, + "step": 4160 + }, + { + "epoch": 0.24025149976926627, + "grad_norm": 0.31265828013420105, + "learning_rate": 0.00018825670745659345, + "loss": 0.9875, + "step": 4165 + }, + { + "epoch": 0.24053991693585602, + "grad_norm": 0.2935623526573181, + "learning_rate": 0.00018820932474923873, + "loss": 0.9738, + "step": 4170 + }, + { + "epoch": 0.24082833410244578, + "grad_norm": 0.31961116194725037, + "learning_rate": 0.00018816185263032496, + "loss": 0.985, + "step": 4175 + }, + { + "epoch": 0.24111675126903553, + "grad_norm": 0.302990198135376, + "learning_rate": 0.00018811429114797123, + "loss": 0.9693, + "step": 4180 + }, + { + "epoch": 0.2414051684356253, + "grad_norm": 0.3246656358242035, + "learning_rate": 0.00018806664035038727, + "loss": 0.9715, + "step": 4185 + }, + { + "epoch": 0.24169358560221504, + "grad_norm": 0.30691856145858765, + "learning_rate": 0.00018801890028587333, + "loss": 0.9967, + "step": 4190 + }, + { + "epoch": 0.2419820027688048, + "grad_norm": 0.3090788424015045, + "learning_rate": 0.00018797107100282015, + "loss": 1.0014, + "step": 4195 + }, + { + "epoch": 0.24227041993539455, + "grad_norm": 0.28349974751472473, + "learning_rate": 0.0001879231525497089, + "loss": 0.9426, + "step": 4200 + }, + { + "epoch": 0.2425588371019843, + "grad_norm": 0.3226814270019531, + "learning_rate": 0.00018787514497511104, + "loss": 1.0058, + "step": 4205 + }, + { + "epoch": 0.24284725426857406, + "grad_norm": 0.3090320825576782, + "learning_rate": 0.0001878270483276886, + "loss": 0.9565, + "step": 4210 + }, + { + "epoch": 0.24313567143516382, + "grad_norm": 0.29639485478401184, + "learning_rate": 0.00018777886265619365, + "loss": 0.9994, + "step": 4215 + }, + { + "epoch": 0.24342408860175357, + "grad_norm": 0.30157527327537537, + "learning_rate": 0.00018773058800946858, + "loss": 0.9349, + "step": 4220 + }, + { + "epoch": 0.24371250576834333, + "grad_norm": 0.2847401797771454, + "learning_rate": 0.0001876822244364461, + "loss": 0.9882, + "step": 4225 + }, + { + "epoch": 0.24400092293493308, + "grad_norm": 0.2939082086086273, + "learning_rate": 0.00018763377198614887, + "loss": 0.9545, + "step": 4230 + }, + { + "epoch": 0.24428934010152284, + "grad_norm": 0.30300137400627136, + "learning_rate": 0.00018758523070768973, + "loss": 0.9069, + "step": 4235 + }, + { + "epoch": 0.2445777572681126, + "grad_norm": 0.2980591952800751, + "learning_rate": 0.00018753660065027152, + "loss": 0.9992, + "step": 4240 + }, + { + "epoch": 0.24486617443470235, + "grad_norm": 0.31828731298446655, + "learning_rate": 0.00018748788186318712, + "loss": 0.9711, + "step": 4245 + }, + { + "epoch": 0.2451545916012921, + "grad_norm": 0.31123876571655273, + "learning_rate": 0.00018743907439581933, + "loss": 0.9393, + "step": 4250 + }, + { + "epoch": 0.24544300876788186, + "grad_norm": 0.29812201857566833, + "learning_rate": 0.00018739017829764082, + "loss": 0.9653, + "step": 4255 + }, + { + "epoch": 0.24573142593447161, + "grad_norm": 0.33146384358406067, + "learning_rate": 0.0001873411936182141, + "loss": 0.9758, + "step": 4260 + }, + { + "epoch": 0.24601984310106137, + "grad_norm": 0.3051407039165497, + "learning_rate": 0.0001872921204071915, + "loss": 1.0172, + "step": 4265 + }, + { + "epoch": 0.24630826026765112, + "grad_norm": 0.30195561051368713, + "learning_rate": 0.000187242958714315, + "loss": 0.9868, + "step": 4270 + }, + { + "epoch": 0.24659667743424088, + "grad_norm": 0.2948630750179291, + "learning_rate": 0.00018719370858941644, + "loss": 0.9771, + "step": 4275 + }, + { + "epoch": 0.24688509460083063, + "grad_norm": 0.3198891282081604, + "learning_rate": 0.00018714437008241709, + "loss": 1.04, + "step": 4280 + }, + { + "epoch": 0.2471735117674204, + "grad_norm": 0.3208988606929779, + "learning_rate": 0.000187094943243328, + "loss": 0.9666, + "step": 4285 + }, + { + "epoch": 0.24746192893401014, + "grad_norm": 0.3209957182407379, + "learning_rate": 0.00018704542812224956, + "loss": 0.9374, + "step": 4290 + }, + { + "epoch": 0.2477503461005999, + "grad_norm": 0.3006252348423004, + "learning_rate": 0.00018699582476937185, + "loss": 0.9798, + "step": 4295 + }, + { + "epoch": 0.24803876326718965, + "grad_norm": 0.3490176796913147, + "learning_rate": 0.00018694613323497422, + "loss": 1.0087, + "step": 4300 + }, + { + "epoch": 0.2483271804337794, + "grad_norm": 0.3163358271121979, + "learning_rate": 0.0001868963535694255, + "loss": 1.043, + "step": 4305 + }, + { + "epoch": 0.24861559760036916, + "grad_norm": 0.298026442527771, + "learning_rate": 0.0001868464858231838, + "loss": 1.0404, + "step": 4310 + }, + { + "epoch": 0.24890401476695892, + "grad_norm": 0.3209499418735504, + "learning_rate": 0.00018679653004679655, + "loss": 0.9687, + "step": 4315 + }, + { + "epoch": 0.24919243193354867, + "grad_norm": 0.3158719539642334, + "learning_rate": 0.0001867464862909004, + "loss": 0.9548, + "step": 4320 + }, + { + "epoch": 0.24948084910013843, + "grad_norm": 0.28783926367759705, + "learning_rate": 0.00018669635460622107, + "loss": 0.9042, + "step": 4325 + }, + { + "epoch": 0.24976926626672818, + "grad_norm": 0.2980654835700989, + "learning_rate": 0.00018664613504357366, + "loss": 0.97, + "step": 4330 + }, + { + "epoch": 0.25005768343331797, + "grad_norm": 0.2950812876224518, + "learning_rate": 0.00018659582765386204, + "loss": 1.0261, + "step": 4335 + }, + { + "epoch": 0.2503461005999077, + "grad_norm": 0.2984694540500641, + "learning_rate": 0.0001865454324880794, + "loss": 0.9859, + "step": 4340 + }, + { + "epoch": 0.2506345177664975, + "grad_norm": 0.3119395971298218, + "learning_rate": 0.00018649494959730765, + "loss": 1.03, + "step": 4345 + }, + { + "epoch": 0.2509229349330872, + "grad_norm": 0.3380660116672516, + "learning_rate": 0.00018644437903271778, + "loss": 1.0373, + "step": 4350 + }, + { + "epoch": 0.251211352099677, + "grad_norm": 0.310693621635437, + "learning_rate": 0.0001863937208455696, + "loss": 0.977, + "step": 4355 + }, + { + "epoch": 0.2514997692662667, + "grad_norm": 0.3119440972805023, + "learning_rate": 0.00018634297508721167, + "loss": 0.9384, + "step": 4360 + }, + { + "epoch": 0.2517881864328565, + "grad_norm": 0.3072355389595032, + "learning_rate": 0.00018629214180908144, + "loss": 1.0126, + "step": 4365 + }, + { + "epoch": 0.2520766035994462, + "grad_norm": 0.3056802749633789, + "learning_rate": 0.00018624122106270506, + "loss": 0.9496, + "step": 4370 + }, + { + "epoch": 0.252365020766036, + "grad_norm": 0.34883102774620056, + "learning_rate": 0.00018619021289969717, + "loss": 0.9626, + "step": 4375 + }, + { + "epoch": 0.25265343793262574, + "grad_norm": 0.2876664698123932, + "learning_rate": 0.00018613911737176125, + "loss": 0.9452, + "step": 4380 + }, + { + "epoch": 0.2529418550992155, + "grad_norm": 0.3051524758338928, + "learning_rate": 0.00018608793453068914, + "loss": 0.996, + "step": 4385 + }, + { + "epoch": 0.25323027226580525, + "grad_norm": 0.2734985053539276, + "learning_rate": 0.0001860366644283613, + "loss": 0.9395, + "step": 4390 + }, + { + "epoch": 0.25351868943239503, + "grad_norm": 0.30163031816482544, + "learning_rate": 0.00018598530711674667, + "loss": 0.9608, + "step": 4395 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 0.2709837555885315, + "learning_rate": 0.00018593386264790243, + "loss": 0.9611, + "step": 4400 + }, + { + "epoch": 0.25409552376557454, + "grad_norm": 0.3166120946407318, + "learning_rate": 0.00018588233107397429, + "loss": 0.8999, + "step": 4405 + }, + { + "epoch": 0.25438394093216427, + "grad_norm": 0.2956826090812683, + "learning_rate": 0.00018583071244719607, + "loss": 0.9097, + "step": 4410 + }, + { + "epoch": 0.25467235809875405, + "grad_norm": 0.31426194310188293, + "learning_rate": 0.00018577900681989, + "loss": 0.941, + "step": 4415 + }, + { + "epoch": 0.2549607752653438, + "grad_norm": 0.2746027410030365, + "learning_rate": 0.0001857272142444664, + "loss": 0.9168, + "step": 4420 + }, + { + "epoch": 0.25524919243193356, + "grad_norm": 0.2936379015445709, + "learning_rate": 0.00018567533477342377, + "loss": 0.9536, + "step": 4425 + }, + { + "epoch": 0.2555376095985233, + "grad_norm": 0.31358134746551514, + "learning_rate": 0.0001856233684593486, + "loss": 0.9569, + "step": 4430 + }, + { + "epoch": 0.25582602676511307, + "grad_norm": 0.31144851446151733, + "learning_rate": 0.0001855713153549155, + "loss": 0.9447, + "step": 4435 + }, + { + "epoch": 0.2561144439317028, + "grad_norm": 0.31088197231292725, + "learning_rate": 0.00018551917551288706, + "loss": 0.9873, + "step": 4440 + }, + { + "epoch": 0.2564028610982926, + "grad_norm": 0.31137150526046753, + "learning_rate": 0.0001854669489861137, + "loss": 0.9769, + "step": 4445 + }, + { + "epoch": 0.2566912782648823, + "grad_norm": 0.3470550775527954, + "learning_rate": 0.0001854146358275338, + "loss": 0.9824, + "step": 4450 + }, + { + "epoch": 0.2569796954314721, + "grad_norm": 0.305550754070282, + "learning_rate": 0.00018536223609017348, + "loss": 1.0573, + "step": 4455 + }, + { + "epoch": 0.2572681125980618, + "grad_norm": 0.30111902952194214, + "learning_rate": 0.00018530974982714667, + "loss": 0.9919, + "step": 4460 + }, + { + "epoch": 0.2575565297646516, + "grad_norm": 0.29458123445510864, + "learning_rate": 0.00018525717709165498, + "loss": 1.0249, + "step": 4465 + }, + { + "epoch": 0.2578449469312413, + "grad_norm": 0.2974050045013428, + "learning_rate": 0.0001852045179369877, + "loss": 1.0155, + "step": 4470 + }, + { + "epoch": 0.2581333640978311, + "grad_norm": 0.27646365761756897, + "learning_rate": 0.00018515177241652163, + "loss": 0.9477, + "step": 4475 + }, + { + "epoch": 0.25842178126442084, + "grad_norm": 0.3065283000469208, + "learning_rate": 0.0001850989405837212, + "loss": 0.9789, + "step": 4480 + }, + { + "epoch": 0.2587101984310106, + "grad_norm": 0.31208351254463196, + "learning_rate": 0.00018504602249213838, + "loss": 1.0209, + "step": 4485 + }, + { + "epoch": 0.25899861559760035, + "grad_norm": 0.27680978178977966, + "learning_rate": 0.0001849930181954124, + "loss": 0.9937, + "step": 4490 + }, + { + "epoch": 0.25928703276419013, + "grad_norm": 0.35537493228912354, + "learning_rate": 0.00018493992774727005, + "loss": 1.019, + "step": 4495 + }, + { + "epoch": 0.25957544993077986, + "grad_norm": 0.2992296814918518, + "learning_rate": 0.00018488675120152532, + "loss": 0.9409, + "step": 4500 + }, + { + "epoch": 0.25986386709736964, + "grad_norm": 0.2907122075557709, + "learning_rate": 0.00018483348861207953, + "loss": 0.9925, + "step": 4505 + }, + { + "epoch": 0.26015228426395937, + "grad_norm": 0.3083319664001465, + "learning_rate": 0.00018478014003292116, + "loss": 0.9494, + "step": 4510 + }, + { + "epoch": 0.26044070143054915, + "grad_norm": 0.2940841615200043, + "learning_rate": 0.00018472670551812596, + "loss": 1.0234, + "step": 4515 + }, + { + "epoch": 0.2607291185971389, + "grad_norm": 0.3526857793331146, + "learning_rate": 0.0001846731851218567, + "loss": 1.0047, + "step": 4520 + }, + { + "epoch": 0.26101753576372866, + "grad_norm": 0.2867284119129181, + "learning_rate": 0.00018461957889836324, + "loss": 0.953, + "step": 4525 + }, + { + "epoch": 0.2613059529303184, + "grad_norm": 0.28662440180778503, + "learning_rate": 0.00018456588690198236, + "loss": 0.9734, + "step": 4530 + }, + { + "epoch": 0.26159437009690817, + "grad_norm": 0.2874925136566162, + "learning_rate": 0.0001845121091871379, + "loss": 1.012, + "step": 4535 + }, + { + "epoch": 0.2618827872634979, + "grad_norm": 0.30890873074531555, + "learning_rate": 0.0001844582458083405, + "loss": 0.9317, + "step": 4540 + }, + { + "epoch": 0.2621712044300877, + "grad_norm": 0.2991410791873932, + "learning_rate": 0.0001844042968201877, + "loss": 0.9488, + "step": 4545 + }, + { + "epoch": 0.26245962159667746, + "grad_norm": 0.29846030473709106, + "learning_rate": 0.0001843502622773637, + "loss": 0.9722, + "step": 4550 + }, + { + "epoch": 0.2627480387632672, + "grad_norm": 0.30086445808410645, + "learning_rate": 0.0001842961422346396, + "loss": 0.9901, + "step": 4555 + }, + { + "epoch": 0.26303645592985697, + "grad_norm": 0.3020675778388977, + "learning_rate": 0.00018424193674687297, + "loss": 1.0275, + "step": 4560 + }, + { + "epoch": 0.2633248730964467, + "grad_norm": 0.3111262023448944, + "learning_rate": 0.00018418764586900817, + "loss": 0.9977, + "step": 4565 + }, + { + "epoch": 0.2636132902630365, + "grad_norm": 0.3167891204357147, + "learning_rate": 0.00018413326965607593, + "loss": 1.0266, + "step": 4570 + }, + { + "epoch": 0.2639017074296262, + "grad_norm": 0.28536850214004517, + "learning_rate": 0.00018407880816319363, + "loss": 0.9475, + "step": 4575 + }, + { + "epoch": 0.264190124596216, + "grad_norm": 0.30811807513237, + "learning_rate": 0.00018402426144556504, + "loss": 0.9549, + "step": 4580 + }, + { + "epoch": 0.2644785417628057, + "grad_norm": 0.2881765365600586, + "learning_rate": 0.0001839696295584803, + "loss": 1.0276, + "step": 4585 + }, + { + "epoch": 0.2647669589293955, + "grad_norm": 0.3339601159095764, + "learning_rate": 0.0001839149125573159, + "loss": 0.9772, + "step": 4590 + }, + { + "epoch": 0.26505537609598523, + "grad_norm": 0.2897505760192871, + "learning_rate": 0.0001838601104975346, + "loss": 1.0897, + "step": 4595 + }, + { + "epoch": 0.265343793262575, + "grad_norm": 0.3119150400161743, + "learning_rate": 0.00018380522343468532, + "loss": 0.9842, + "step": 4600 + }, + { + "epoch": 0.265343793262575, + "step": 4600, + "total_flos": 3.2343958172802744e+18, + "train_loss": 0.0, + "train_runtime": 0.0325, + "train_samples_per_second": 4262129.534, + "train_steps_per_second": 133192.508 + } + ], + "logging_steps": 5, + "max_steps": 4334, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.2343958172802744e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}