diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6472 +1,222 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.265343793262575, + "epoch": 1.0, "eval_steps": 500, - "global_step": 4600, + "global_step": 134, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 5.7683433317951084e-05, - "grad_norm": 0.3952319025993347, - "learning_rate": 1.1534025374855825e-07, - "loss": 1.182, + "epoch": 0.007462686567164179, + "grad_norm": 1.777404546737671, + "learning_rate": 1.4285714285714285e-05, + "loss": 1.2363, "step": 1 }, { - "epoch": 0.0002884171665897554, - "grad_norm": 0.3334461748600006, - "learning_rate": 5.767012687427913e-07, - "loss": 1.0887, + "epoch": 0.03731343283582089, + "grad_norm": 1.1628299951553345, + "learning_rate": 7.142857142857143e-05, + "loss": 1.1427, "step": 5 }, { - "epoch": 0.0005768343331795108, - "grad_norm": 0.41704559326171875, - "learning_rate": 1.1534025374855826e-06, - "loss": 1.2132, + "epoch": 0.07462686567164178, + "grad_norm": 0.5306259989738464, + "learning_rate": 0.00014285714285714287, + "loss": 0.8518, "step": 10 }, { - "epoch": 0.0008652514997692663, - "grad_norm": 0.4982852637767792, - "learning_rate": 1.7301038062283738e-06, - "loss": 1.1888, + "epoch": 0.11194029850746269, + "grad_norm": 0.36661896109580994, + "learning_rate": 0.00019996573249755572, + "loss": 0.6968, "step": 15 }, { - "epoch": 0.0011536686663590216, - "grad_norm": 0.3702298104763031, - "learning_rate": 2.3068050749711653e-06, - "loss": 1.2105, + "epoch": 0.14925373134328357, + "grad_norm": 0.22499878704547882, + "learning_rate": 0.00019876883405951377, + "loss": 0.6287, "step": 20 }, { - "epoch": 0.001442085832948777, - "grad_norm": 0.3640645444393158, - "learning_rate": 2.8835063437139563e-06, - "loss": 1.1714, + "epoch": 0.1865671641791045, + "grad_norm": 0.23837368190288544, + "learning_rate": 0.0001958819734868193, + "loss": 0.5985, "step": 25 }, { - "epoch": 0.0017305029995385325, - "grad_norm": 0.31508558988571167, - "learning_rate": 3.4602076124567477e-06, - "loss": 1.0438, + "epoch": 0.22388059701492538, + "grad_norm": 0.20461460947990417, + "learning_rate": 0.0001913545457642601, + "loss": 0.5683, "step": 30 }, { - "epoch": 0.0020189201661282878, - "grad_norm": 0.3910152018070221, - "learning_rate": 4.036908881199539e-06, - "loss": 1.212, + "epoch": 0.26119402985074625, + "grad_norm": 0.1855892837047577, + "learning_rate": 0.00018526401643540922, + "loss": 0.555, "step": 35 }, { - "epoch": 0.0023073373327180432, - "grad_norm": 0.32711583375930786, - "learning_rate": 4.6136101499423305e-06, - "loss": 1.1552, + "epoch": 0.29850746268656714, + "grad_norm": 0.22351227700710297, + "learning_rate": 0.0001777145961456971, + "loss": 0.5436, "step": 40 }, { - "epoch": 0.0025957544993077987, - "grad_norm": 0.37455540895462036, - "learning_rate": 5.190311418685121e-06, - "loss": 1.1355, + "epoch": 0.3358208955223881, + "grad_norm": 0.2045900523662567, + "learning_rate": 0.0001688354575693754, + "loss": 0.531, "step": 45 }, { - "epoch": 0.002884171665897554, - "grad_norm": 0.32155269384384155, - "learning_rate": 5.7670126874279126e-06, - "loss": 1.1375, + "epoch": 0.373134328358209, + "grad_norm": 0.20202215015888214, + "learning_rate": 0.00015877852522924732, + "loss": 0.5251, "step": 50 }, { - "epoch": 0.0031725888324873096, - "grad_norm": 0.29815641045570374, - "learning_rate": 6.3437139561707036e-06, - "loss": 1.1193, + "epoch": 0.41044776119402987, + "grad_norm": 0.20512989163398743, + "learning_rate": 0.00014771587602596084, + "loss": 0.519, "step": 55 }, { - "epoch": 0.003461005999077065, - "grad_norm": 0.39492201805114746, - "learning_rate": 6.920415224913495e-06, - "loss": 1.1053, + "epoch": 0.44776119402985076, + "grad_norm": 0.21831591427326202, + "learning_rate": 0.00013583679495453, + "loss": 0.5138, "step": 60 }, { - "epoch": 0.0037494231656668205, - "grad_norm": 0.3298701345920563, - "learning_rate": 7.497116493656286e-06, - "loss": 1.107, + "epoch": 0.48507462686567165, + "grad_norm": 0.20354878902435303, + "learning_rate": 0.00012334453638559057, + "loss": 0.5104, "step": 65 }, { - "epoch": 0.0040378403322565756, - "grad_norm": 0.3114672005176544, - "learning_rate": 8.073817762399077e-06, - "loss": 1.0677, + "epoch": 0.5223880597014925, + "grad_norm": 0.1882062703371048, + "learning_rate": 0.00011045284632676536, + "loss": 0.5059, "step": 70 }, { - "epoch": 0.0043262574988463314, - "grad_norm": 0.3159383535385132, - "learning_rate": 8.650519031141868e-06, - "loss": 1.0959, + "epoch": 0.5597014925373134, + "grad_norm": 0.21688711643218994, + "learning_rate": 9.73823051692127e-05, + "loss": 0.4992, "step": 75 }, { - "epoch": 0.0046146746654360865, - "grad_norm": 0.2858622074127197, - "learning_rate": 9.227220299884661e-06, - "loss": 1.0435, + "epoch": 0.5970149253731343, + "grad_norm": 0.2021055668592453, + "learning_rate": 8.435655349597689e-05, + "loss": 0.4936, "step": 80 }, { - "epoch": 0.004903091832025842, - "grad_norm": 0.3337515890598297, - "learning_rate": 9.803921568627451e-06, - "loss": 0.9889, + "epoch": 0.6343283582089553, + "grad_norm": 0.18059054017066956, + "learning_rate": 7.159846552960774e-05, + "loss": 0.5016, "step": 85 }, { - "epoch": 0.005191508998615597, - "grad_norm": 0.3027825951576233, - "learning_rate": 1.0380622837370241e-05, - "loss": 1.1145, + "epoch": 0.6716417910447762, + "grad_norm": 0.19665920734405518, + "learning_rate": 5.9326335692419995e-05, + "loss": 0.4983, "step": 90 }, { - "epoch": 0.005479926165205353, - "grad_norm": 0.34131115674972534, - "learning_rate": 1.0957324106113035e-05, - "loss": 1.0596, + "epoch": 0.7089552238805971, + "grad_norm": 0.20969931781291962, + "learning_rate": 4.7750143528405126e-05, + "loss": 0.4911, "step": 95 }, { - "epoch": 0.005768343331795108, - "grad_norm": 0.3263566792011261, - "learning_rate": 1.1534025374855825e-05, - "loss": 0.9887, + "epoch": 0.746268656716418, + "grad_norm": 0.19205212593078613, + "learning_rate": 3.7067960895016275e-05, + "loss": 0.4964, "step": 100 }, { - "epoch": 0.006056760498384864, - "grad_norm": 0.325528085231781, - "learning_rate": 1.2110726643598615e-05, - "loss": 1.0143, + "epoch": 0.7835820895522388, + "grad_norm": 0.17200438678264618, + "learning_rate": 2.746256289877126e-05, + "loss": 0.4879, "step": 105 }, { - "epoch": 0.006345177664974619, - "grad_norm": 0.3773256242275238, - "learning_rate": 1.2687427912341407e-05, - "loss": 1.0, + "epoch": 0.8208955223880597, + "grad_norm": 0.17229676246643066, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.4915, "step": 110 }, { - "epoch": 0.006633594831564375, - "grad_norm": 0.2968287765979767, - "learning_rate": 1.3264129181084197e-05, - "loss": 0.9572, + "epoch": 0.8582089552238806, + "grad_norm": 0.17735055088996887, + "learning_rate": 1.2118288733803473e-05, + "loss": 0.4883, "step": 115 }, { - "epoch": 0.00692201199815413, - "grad_norm": 0.29874077439308167, - "learning_rate": 1.384083044982699e-05, - "loss": 1.0344, + "epoch": 0.8955223880597015, + "grad_norm": 0.17195551097393036, + "learning_rate": 6.6419573502798374e-06, + "loss": 0.4912, "step": 120 }, { - "epoch": 0.007210429164743885, - "grad_norm": 0.3251142203807831, - "learning_rate": 1.4417531718569783e-05, - "loss": 1.0183, + "epoch": 0.9328358208955224, + "grad_norm": 0.17827412486076355, + "learning_rate": 2.7630079602323442e-06, + "loss": 0.4861, "step": 125 }, { - "epoch": 0.007498846331333641, - "grad_norm": 0.29589974880218506, - "learning_rate": 1.4994232987312573e-05, - "loss": 1.047, + "epoch": 0.9701492537313433, + "grad_norm": 0.17053547501564026, + "learning_rate": 5.478104631726711e-07, + "loss": 0.4852, "step": 130 }, { - "epoch": 0.007787263497923396, - "grad_norm": 0.3242173194885254, - "learning_rate": 1.5570934256055363e-05, - "loss": 1.0461, - "step": 135 + "epoch": 1.0, + "eval_loss": 0.1658700853586197, + "eval_runtime": 9.0857, + "eval_samples_per_second": 9.685, + "eval_steps_per_second": 0.33, + "step": 134 }, { - "epoch": 0.008075680664513151, - "grad_norm": 0.31147414445877075, - "learning_rate": 1.6147635524798155e-05, - "loss": 1.047, - "step": 140 - }, - { - "epoch": 0.008364097831102908, - "grad_norm": 0.31779709458351135, - "learning_rate": 1.6724336793540947e-05, - "loss": 1.0784, - "step": 145 - }, - { - "epoch": 0.008652514997692663, - "grad_norm": 0.3391679525375366, - "learning_rate": 1.7301038062283735e-05, - "loss": 1.0576, - "step": 150 - }, - { - "epoch": 0.008940932164282418, - "grad_norm": 0.3228215277194977, - "learning_rate": 1.787773933102653e-05, - "loss": 1.0145, - "step": 155 - }, - { - "epoch": 0.009229349330872173, - "grad_norm": 0.30271971225738525, - "learning_rate": 1.8454440599769322e-05, - "loss": 0.9874, - "step": 160 - }, - { - "epoch": 0.00951776649746193, - "grad_norm": 0.30643004179000854, - "learning_rate": 1.903114186851211e-05, - "loss": 0.9733, - "step": 165 - }, - { - "epoch": 0.009806183664051685, - "grad_norm": 0.36777183413505554, - "learning_rate": 1.9607843137254903e-05, - "loss": 1.0242, - "step": 170 - }, - { - "epoch": 0.01009460083064144, - "grad_norm": 0.3419516086578369, - "learning_rate": 2.0184544405997694e-05, - "loss": 1.1211, - "step": 175 - }, - { - "epoch": 0.010383017997231195, - "grad_norm": 0.3591030538082123, - "learning_rate": 2.0761245674740483e-05, - "loss": 1.0323, - "step": 180 - }, - { - "epoch": 0.01067143516382095, - "grad_norm": 0.38365352153778076, - "learning_rate": 2.1337946943483278e-05, - "loss": 0.9613, - "step": 185 - }, - { - "epoch": 0.010959852330410707, - "grad_norm": 0.3436645269393921, - "learning_rate": 2.191464821222607e-05, - "loss": 1.0753, - "step": 190 - }, - { - "epoch": 0.011248269497000462, - "grad_norm": 0.341776967048645, - "learning_rate": 2.249134948096886e-05, - "loss": 1.064, - "step": 195 - }, - { - "epoch": 0.011536686663590217, - "grad_norm": 0.38297685980796814, - "learning_rate": 2.306805074971165e-05, - "loss": 1.0105, - "step": 200 - }, - { - "epoch": 0.011825103830179972, - "grad_norm": 0.3430030643939972, - "learning_rate": 2.3644752018454442e-05, - "loss": 1.0103, - "step": 205 - }, - { - "epoch": 0.012113520996769728, - "grad_norm": 0.3319534361362457, - "learning_rate": 2.422145328719723e-05, - "loss": 1.0671, - "step": 210 - }, - { - "epoch": 0.012401938163359483, - "grad_norm": 0.3615305423736572, - "learning_rate": 2.4798154555940022e-05, - "loss": 0.9236, - "step": 215 - }, - { - "epoch": 0.012690355329949238, - "grad_norm": 0.4457886517047882, - "learning_rate": 2.5374855824682814e-05, - "loss": 1.0461, - "step": 220 - }, - { - "epoch": 0.012978772496538993, - "grad_norm": 0.7715578675270081, - "learning_rate": 2.5951557093425606e-05, - "loss": 1.0131, - "step": 225 - }, - { - "epoch": 0.01326718966312875, - "grad_norm": 0.4368738830089569, - "learning_rate": 2.6528258362168395e-05, - "loss": 1.0255, - "step": 230 - }, - { - "epoch": 0.013555606829718505, - "grad_norm": 0.38978299498558044, - "learning_rate": 2.7104959630911193e-05, - "loss": 0.9773, - "step": 235 - }, - { - "epoch": 0.01384402399630826, - "grad_norm": 0.35930851101875305, - "learning_rate": 2.768166089965398e-05, - "loss": 1.0043, - "step": 240 - }, - { - "epoch": 0.014132441162898015, - "grad_norm": 0.37871646881103516, - "learning_rate": 2.8258362168396773e-05, - "loss": 1.0082, - "step": 245 - }, - { - "epoch": 0.01442085832948777, - "grad_norm": 0.3493201732635498, - "learning_rate": 2.8835063437139565e-05, - "loss": 0.9856, - "step": 250 - }, - { - "epoch": 0.014709275496077527, - "grad_norm": 0.364734947681427, - "learning_rate": 2.9411764705882354e-05, - "loss": 1.0379, - "step": 255 - }, - { - "epoch": 0.014997692662667282, - "grad_norm": 0.3644263446331024, - "learning_rate": 2.9988465974625146e-05, - "loss": 1.006, - "step": 260 - }, - { - "epoch": 0.015286109829257037, - "grad_norm": 0.3671714961528778, - "learning_rate": 3.0565167243367934e-05, - "loss": 0.9499, - "step": 265 - }, - { - "epoch": 0.015574526995846792, - "grad_norm": 0.384804904460907, - "learning_rate": 3.1141868512110726e-05, - "loss": 1.0438, - "step": 270 - }, - { - "epoch": 0.015862944162436547, - "grad_norm": 0.36940938234329224, - "learning_rate": 3.171856978085352e-05, - "loss": 0.9476, - "step": 275 - }, - { - "epoch": 0.016151361329026302, - "grad_norm": 0.38267725706100464, - "learning_rate": 3.229527104959631e-05, - "loss": 0.9689, - "step": 280 - }, - { - "epoch": 0.01643977849561606, - "grad_norm": 0.3497903347015381, - "learning_rate": 3.28719723183391e-05, - "loss": 0.9143, - "step": 285 - }, - { - "epoch": 0.016728195662205816, - "grad_norm": 0.3465529978275299, - "learning_rate": 3.344867358708189e-05, - "loss": 0.9616, - "step": 290 - }, - { - "epoch": 0.01701661282879557, - "grad_norm": 0.3548210859298706, - "learning_rate": 3.4025374855824685e-05, - "loss": 0.9695, - "step": 295 - }, - { - "epoch": 0.017305029995385326, - "grad_norm": 0.3769378662109375, - "learning_rate": 3.460207612456747e-05, - "loss": 0.963, - "step": 300 - }, - { - "epoch": 0.01759344716197508, - "grad_norm": 0.3663967549800873, - "learning_rate": 3.517877739331027e-05, - "loss": 1.0924, - "step": 305 - }, - { - "epoch": 0.017881864328564836, - "grad_norm": 0.38498544692993164, - "learning_rate": 3.575547866205306e-05, - "loss": 1.0481, - "step": 310 - }, - { - "epoch": 0.01817028149515459, - "grad_norm": 0.3465900123119354, - "learning_rate": 3.633217993079585e-05, - "loss": 1.0396, - "step": 315 - }, - { - "epoch": 0.018458698661744346, - "grad_norm": 0.3498382270336151, - "learning_rate": 3.6908881199538644e-05, - "loss": 1.0005, - "step": 320 - }, - { - "epoch": 0.0187471158283341, - "grad_norm": 0.3397336006164551, - "learning_rate": 3.748558246828143e-05, - "loss": 0.9682, - "step": 325 - }, - { - "epoch": 0.01903553299492386, - "grad_norm": 0.33760690689086914, - "learning_rate": 3.806228373702422e-05, - "loss": 0.9975, - "step": 330 - }, - { - "epoch": 0.019323950161513614, - "grad_norm": 0.32710301876068115, - "learning_rate": 3.863898500576701e-05, - "loss": 0.985, - "step": 335 - }, - { - "epoch": 0.01961236732810337, - "grad_norm": 0.40678462386131287, - "learning_rate": 3.9215686274509805e-05, - "loss": 0.9664, - "step": 340 - }, - { - "epoch": 0.019900784494693124, - "grad_norm": 0.38339948654174805, - "learning_rate": 3.97923875432526e-05, - "loss": 0.9962, - "step": 345 - }, - { - "epoch": 0.02018920166128288, - "grad_norm": 0.3516389727592468, - "learning_rate": 4.036908881199539e-05, - "loss": 0.9385, - "step": 350 - }, - { - "epoch": 0.020477618827872635, - "grad_norm": 0.3469911515712738, - "learning_rate": 4.094579008073818e-05, - "loss": 0.9795, - "step": 355 - }, - { - "epoch": 0.02076603599446239, - "grad_norm": 0.351566344499588, - "learning_rate": 4.1522491349480966e-05, - "loss": 1.0131, - "step": 360 - }, - { - "epoch": 0.021054453161052145, - "grad_norm": 0.3254294991493225, - "learning_rate": 4.209919261822376e-05, - "loss": 0.9784, - "step": 365 - }, - { - "epoch": 0.0213428703276419, - "grad_norm": 0.352115660905838, - "learning_rate": 4.2675893886966556e-05, - "loss": 1.0013, - "step": 370 - }, - { - "epoch": 0.021631287494231658, - "grad_norm": 0.35616523027420044, - "learning_rate": 4.325259515570935e-05, - "loss": 1.0209, - "step": 375 - }, - { - "epoch": 0.021919704660821413, - "grad_norm": 0.3402170240879059, - "learning_rate": 4.382929642445214e-05, - "loss": 0.976, - "step": 380 - }, - { - "epoch": 0.022208121827411168, - "grad_norm": 0.30762144923210144, - "learning_rate": 4.440599769319493e-05, - "loss": 0.8757, - "step": 385 - }, - { - "epoch": 0.022496538994000923, - "grad_norm": 0.33472269773483276, - "learning_rate": 4.498269896193772e-05, - "loss": 1.0687, - "step": 390 - }, - { - "epoch": 0.022784956160590678, - "grad_norm": 0.3568858802318573, - "learning_rate": 4.555940023068051e-05, - "loss": 1.0279, - "step": 395 - }, - { - "epoch": 0.023073373327180433, - "grad_norm": 0.3303862512111664, - "learning_rate": 4.61361014994233e-05, - "loss": 1.0061, - "step": 400 - }, - { - "epoch": 0.023361790493770188, - "grad_norm": 0.3586498498916626, - "learning_rate": 4.671280276816609e-05, - "loss": 1.0007, - "step": 405 - }, - { - "epoch": 0.023650207660359943, - "grad_norm": 0.34804537892341614, - "learning_rate": 4.7289504036908884e-05, - "loss": 0.9913, - "step": 410 - }, - { - "epoch": 0.0239386248269497, - "grad_norm": 0.33361154794692993, - "learning_rate": 4.7866205305651676e-05, - "loss": 0.9615, - "step": 415 - }, - { - "epoch": 0.024227041993539457, - "grad_norm": 0.30743229389190674, - "learning_rate": 4.844290657439446e-05, - "loss": 1.0062, - "step": 420 - }, - { - "epoch": 0.024515459160129212, - "grad_norm": 0.3414464294910431, - "learning_rate": 4.901960784313725e-05, - "loss": 1.0266, - "step": 425 - }, - { - "epoch": 0.024803876326718967, - "grad_norm": 0.311254620552063, - "learning_rate": 4.9596309111880045e-05, - "loss": 0.9525, - "step": 430 - }, - { - "epoch": 0.025092293493308722, - "grad_norm": 0.3211973011493683, - "learning_rate": 5.017301038062284e-05, - "loss": 1.0204, - "step": 435 - }, - { - "epoch": 0.025380710659898477, - "grad_norm": 0.32264503836631775, - "learning_rate": 5.074971164936563e-05, - "loss": 0.9187, - "step": 440 - }, - { - "epoch": 0.025669127826488232, - "grad_norm": 0.3149093985557556, - "learning_rate": 5.132641291810843e-05, - "loss": 1.0324, - "step": 445 - }, - { - "epoch": 0.025957544993077987, - "grad_norm": 0.31910112500190735, - "learning_rate": 5.190311418685121e-05, - "loss": 0.9924, - "step": 450 - }, - { - "epoch": 0.026245962159667742, - "grad_norm": 0.329057514667511, - "learning_rate": 5.2479815455594004e-05, - "loss": 1.0235, - "step": 455 - }, - { - "epoch": 0.0265343793262575, - "grad_norm": 0.32927969098091125, - "learning_rate": 5.305651672433679e-05, - "loss": 0.9986, - "step": 460 - }, - { - "epoch": 0.026822796492847256, - "grad_norm": 0.30113425850868225, - "learning_rate": 5.363321799307959e-05, - "loss": 0.9996, - "step": 465 - }, - { - "epoch": 0.02711121365943701, - "grad_norm": 0.31802427768707275, - "learning_rate": 5.4209919261822386e-05, - "loss": 0.903, - "step": 470 - }, - { - "epoch": 0.027399630826026766, - "grad_norm": 0.31492453813552856, - "learning_rate": 5.478662053056517e-05, - "loss": 0.9627, - "step": 475 - }, - { - "epoch": 0.02768804799261652, - "grad_norm": 0.32527875900268555, - "learning_rate": 5.536332179930796e-05, - "loss": 0.9842, - "step": 480 - }, - { - "epoch": 0.027976465159206276, - "grad_norm": 0.3000083267688751, - "learning_rate": 5.594002306805075e-05, - "loss": 0.9275, - "step": 485 - }, - { - "epoch": 0.02826488232579603, - "grad_norm": 0.30580878257751465, - "learning_rate": 5.651672433679355e-05, - "loss": 1.0111, - "step": 490 - }, - { - "epoch": 0.028553299492385786, - "grad_norm": 0.3029692769050598, - "learning_rate": 5.709342560553633e-05, - "loss": 0.9997, - "step": 495 - }, - { - "epoch": 0.02884171665897554, - "grad_norm": 0.29320913553237915, - "learning_rate": 5.767012687427913e-05, - "loss": 0.9728, - "step": 500 - }, - { - "epoch": 0.0291301338255653, - "grad_norm": 0.27277612686157227, - "learning_rate": 5.8246828143021916e-05, - "loss": 0.9481, - "step": 505 - }, - { - "epoch": 0.029418550992155054, - "grad_norm": 0.3065517544746399, - "learning_rate": 5.882352941176471e-05, - "loss": 1.0068, - "step": 510 - }, - { - "epoch": 0.02970696815874481, - "grad_norm": 0.30595871806144714, - "learning_rate": 5.940023068050749e-05, - "loss": 1.0394, - "step": 515 - }, - { - "epoch": 0.029995385325334564, - "grad_norm": 0.2905437648296356, - "learning_rate": 5.997693194925029e-05, - "loss": 0.8914, - "step": 520 - }, - { - "epoch": 0.03028380249192432, - "grad_norm": 0.30169710516929626, - "learning_rate": 6.0553633217993076e-05, - "loss": 1.0714, - "step": 525 - }, - { - "epoch": 0.030572219658514074, - "grad_norm": 0.30245259404182434, - "learning_rate": 6.113033448673587e-05, - "loss": 0.9748, - "step": 530 - }, - { - "epoch": 0.03086063682510383, - "grad_norm": 0.31071239709854126, - "learning_rate": 6.170703575547867e-05, - "loss": 1.0307, - "step": 535 - }, - { - "epoch": 0.031149053991693584, - "grad_norm": 0.301554799079895, - "learning_rate": 6.228373702422145e-05, - "loss": 0.9904, - "step": 540 - }, - { - "epoch": 0.03143747115828334, - "grad_norm": 0.29832157492637634, - "learning_rate": 6.286043829296425e-05, - "loss": 0.965, - "step": 545 - }, - { - "epoch": 0.031725888324873094, - "grad_norm": 0.2960033118724823, - "learning_rate": 6.343713956170704e-05, - "loss": 0.9661, - "step": 550 - }, - { - "epoch": 0.03201430549146285, - "grad_norm": 0.2793910503387451, - "learning_rate": 6.401384083044983e-05, - "loss": 0.9691, - "step": 555 - }, - { - "epoch": 0.032302722658052604, - "grad_norm": 0.2931232750415802, - "learning_rate": 6.459054209919262e-05, - "loss": 1.0152, - "step": 560 - }, - { - "epoch": 0.03259113982464236, - "grad_norm": 0.29276397824287415, - "learning_rate": 6.516724336793542e-05, - "loss": 0.9644, - "step": 565 - }, - { - "epoch": 0.03287955699123212, - "grad_norm": 0.2859160304069519, - "learning_rate": 6.57439446366782e-05, - "loss": 0.8926, - "step": 570 - }, - { - "epoch": 0.033167974157821876, - "grad_norm": 0.2981337308883667, - "learning_rate": 6.6320645905421e-05, - "loss": 0.9805, - "step": 575 - }, - { - "epoch": 0.03345639132441163, - "grad_norm": 0.28318145871162415, - "learning_rate": 6.689734717416379e-05, - "loss": 0.9828, - "step": 580 - }, - { - "epoch": 0.033744808491001387, - "grad_norm": 0.2922738194465637, - "learning_rate": 6.747404844290659e-05, - "loss": 0.9495, - "step": 585 - }, - { - "epoch": 0.03403322565759114, - "grad_norm": 0.3307567536830902, - "learning_rate": 6.805074971164937e-05, - "loss": 0.975, - "step": 590 - }, - { - "epoch": 0.0343216428241809, - "grad_norm": 0.2792339622974396, - "learning_rate": 6.862745098039216e-05, - "loss": 1.0021, - "step": 595 - }, - { - "epoch": 0.03461005999077065, - "grad_norm": 0.26365357637405396, - "learning_rate": 6.920415224913494e-05, - "loss": 1.0316, - "step": 600 - }, - { - "epoch": 0.03489847715736041, - "grad_norm": 0.285918265581131, - "learning_rate": 6.978085351787774e-05, - "loss": 1.0025, - "step": 605 - }, - { - "epoch": 0.03518689432395016, - "grad_norm": 0.290382444858551, - "learning_rate": 7.035755478662054e-05, - "loss": 1.0198, - "step": 610 - }, - { - "epoch": 0.03547531149053992, - "grad_norm": 0.2909998595714569, - "learning_rate": 7.093425605536332e-05, - "loss": 1.0522, - "step": 615 - }, - { - "epoch": 0.03576372865712967, - "grad_norm": 0.2691628038883209, - "learning_rate": 7.151095732410612e-05, - "loss": 1.0285, - "step": 620 - }, - { - "epoch": 0.03605214582371943, - "grad_norm": 0.2793739140033722, - "learning_rate": 7.20876585928489e-05, - "loss": 0.9431, - "step": 625 - }, - { - "epoch": 0.03634056299030918, - "grad_norm": 0.28252139687538147, - "learning_rate": 7.26643598615917e-05, - "loss": 0.954, - "step": 630 - }, - { - "epoch": 0.03662898015689894, - "grad_norm": 0.2551520764827728, - "learning_rate": 7.324106113033449e-05, - "loss": 0.9477, - "step": 635 - }, - { - "epoch": 0.03691739732348869, - "grad_norm": 0.2769528925418854, - "learning_rate": 7.381776239907729e-05, - "loss": 1.0228, - "step": 640 - }, - { - "epoch": 0.03720581449007845, - "grad_norm": 0.26769739389419556, - "learning_rate": 7.439446366782007e-05, - "loss": 0.9844, - "step": 645 - }, - { - "epoch": 0.0374942316566682, - "grad_norm": 0.2822119891643524, - "learning_rate": 7.497116493656286e-05, - "loss": 1.0532, - "step": 650 - }, - { - "epoch": 0.03778264882325796, - "grad_norm": 0.2787601053714752, - "learning_rate": 7.554786620530564e-05, - "loss": 1.0154, - "step": 655 - }, - { - "epoch": 0.03807106598984772, - "grad_norm": 0.27694109082221985, - "learning_rate": 7.612456747404844e-05, - "loss": 0.9775, - "step": 660 - }, - { - "epoch": 0.038359483156437474, - "grad_norm": 0.4112897217273712, - "learning_rate": 7.670126874279123e-05, - "loss": 1.0071, - "step": 665 - }, - { - "epoch": 0.03864790032302723, - "grad_norm": 0.26005199551582336, - "learning_rate": 7.727797001153403e-05, - "loss": 0.9632, - "step": 670 - }, - { - "epoch": 0.038936317489616984, - "grad_norm": 0.25056615471839905, - "learning_rate": 7.785467128027682e-05, - "loss": 0.9773, - "step": 675 - }, - { - "epoch": 0.03922473465620674, - "grad_norm": 0.27164942026138306, - "learning_rate": 7.843137254901961e-05, - "loss": 0.9927, - "step": 680 - }, - { - "epoch": 0.039513151822796494, - "grad_norm": 0.26238757371902466, - "learning_rate": 7.900807381776241e-05, - "loss": 0.9612, - "step": 685 - }, - { - "epoch": 0.03980156898938625, - "grad_norm": 0.28629186749458313, - "learning_rate": 7.95847750865052e-05, - "loss": 0.9579, - "step": 690 - }, - { - "epoch": 0.040089986155976004, - "grad_norm": 0.2650497555732727, - "learning_rate": 8.016147635524799e-05, - "loss": 0.9667, - "step": 695 - }, - { - "epoch": 0.04037840332256576, - "grad_norm": 0.26934972405433655, - "learning_rate": 8.073817762399078e-05, - "loss": 0.9257, - "step": 700 - }, - { - "epoch": 0.040666820489155514, - "grad_norm": 0.27391955256462097, - "learning_rate": 8.131487889273358e-05, - "loss": 1.0725, - "step": 705 - }, - { - "epoch": 0.04095523765574527, - "grad_norm": 0.2905539274215698, - "learning_rate": 8.189158016147636e-05, - "loss": 0.9979, - "step": 710 - }, - { - "epoch": 0.041243654822335024, - "grad_norm": 0.26050031185150146, - "learning_rate": 8.246828143021915e-05, - "loss": 0.9901, - "step": 715 - }, - { - "epoch": 0.04153207198892478, - "grad_norm": 0.4822568893432617, - "learning_rate": 8.304498269896193e-05, - "loss": 0.9753, - "step": 720 - }, - { - "epoch": 0.041820489155514534, - "grad_norm": 0.27065780758857727, - "learning_rate": 8.362168396770473e-05, - "loss": 0.961, - "step": 725 - }, - { - "epoch": 0.04210890632210429, - "grad_norm": 0.27039390802383423, - "learning_rate": 8.419838523644751e-05, - "loss": 1.0218, - "step": 730 - }, - { - "epoch": 0.042397323488694044, - "grad_norm": 0.267991304397583, - "learning_rate": 8.477508650519031e-05, - "loss": 0.8937, - "step": 735 - }, - { - "epoch": 0.0426857406552838, - "grad_norm": 0.2698671519756317, - "learning_rate": 8.535178777393311e-05, - "loss": 1.0203, - "step": 740 - }, - { - "epoch": 0.04297415782187356, - "grad_norm": 0.25605538487434387, - "learning_rate": 8.59284890426759e-05, - "loss": 1.0398, - "step": 745 - }, - { - "epoch": 0.043262574988463316, - "grad_norm": 0.26644793152809143, - "learning_rate": 8.65051903114187e-05, - "loss": 1.0212, - "step": 750 - }, - { - "epoch": 0.04355099215505307, - "grad_norm": 0.2879778742790222, - "learning_rate": 8.708189158016148e-05, - "loss": 0.9854, - "step": 755 - }, - { - "epoch": 0.043839409321642826, - "grad_norm": 0.26750192046165466, - "learning_rate": 8.765859284890428e-05, - "loss": 1.0168, - "step": 760 - }, - { - "epoch": 0.04412782648823258, - "grad_norm": 0.2743099331855774, - "learning_rate": 8.823529411764706e-05, - "loss": 0.9447, - "step": 765 - }, - { - "epoch": 0.044416243654822336, - "grad_norm": 0.27284887433052063, - "learning_rate": 8.881199538638986e-05, - "loss": 1.016, - "step": 770 - }, - { - "epoch": 0.04470466082141209, - "grad_norm": 0.26251500844955444, - "learning_rate": 8.938869665513265e-05, - "loss": 0.9275, - "step": 775 - }, - { - "epoch": 0.044993077988001846, - "grad_norm": 0.26898619532585144, - "learning_rate": 8.996539792387543e-05, - "loss": 0.9258, - "step": 780 - }, - { - "epoch": 0.0452814951545916, - "grad_norm": 0.2636859118938446, - "learning_rate": 9.054209919261822e-05, - "loss": 1.1368, - "step": 785 - }, - { - "epoch": 0.045569912321181356, - "grad_norm": 0.25750333070755005, - "learning_rate": 9.111880046136102e-05, - "loss": 0.9829, - "step": 790 - }, - { - "epoch": 0.04585832948777111, - "grad_norm": 0.26251962780952454, - "learning_rate": 9.16955017301038e-05, - "loss": 1.0722, - "step": 795 - }, - { - "epoch": 0.046146746654360866, - "grad_norm": 0.24186044931411743, - "learning_rate": 9.22722029988466e-05, - "loss": 0.9681, - "step": 800 - }, - { - "epoch": 0.04643516382095062, - "grad_norm": 0.2631891965866089, - "learning_rate": 9.28489042675894e-05, - "loss": 1.0082, - "step": 805 - }, - { - "epoch": 0.046723580987540377, - "grad_norm": 0.25769105553627014, - "learning_rate": 9.342560553633218e-05, - "loss": 0.9419, - "step": 810 - }, - { - "epoch": 0.04701199815413013, - "grad_norm": 0.26983222365379333, - "learning_rate": 9.400230680507498e-05, - "loss": 0.9698, - "step": 815 - }, - { - "epoch": 0.04730041532071989, - "grad_norm": 0.268951952457428, - "learning_rate": 9.457900807381777e-05, - "loss": 1.0199, - "step": 820 - }, - { - "epoch": 0.04758883248730964, - "grad_norm": 0.2618368864059448, - "learning_rate": 9.515570934256057e-05, - "loss": 1.0474, - "step": 825 - }, - { - "epoch": 0.0478772496538994, - "grad_norm": 0.2535788118839264, - "learning_rate": 9.573241061130335e-05, - "loss": 1.051, - "step": 830 - }, - { - "epoch": 0.04816566682048916, - "grad_norm": 0.24797338247299194, - "learning_rate": 9.630911188004614e-05, - "loss": 0.9787, - "step": 835 - }, - { - "epoch": 0.048454083987078914, - "grad_norm": 0.2542094886302948, - "learning_rate": 9.688581314878892e-05, - "loss": 1.0301, - "step": 840 - }, - { - "epoch": 0.04874250115366867, - "grad_norm": 0.34137168526649475, - "learning_rate": 9.746251441753172e-05, - "loss": 0.8916, - "step": 845 - }, - { - "epoch": 0.049030918320258424, - "grad_norm": 0.25905948877334595, - "learning_rate": 9.80392156862745e-05, - "loss": 1.0086, - "step": 850 - }, - { - "epoch": 0.04931933548684818, - "grad_norm": 0.24208292365074158, - "learning_rate": 9.86159169550173e-05, - "loss": 0.962, - "step": 855 - }, - { - "epoch": 0.049607752653437934, - "grad_norm": 0.2500937879085541, - "learning_rate": 9.919261822376009e-05, - "loss": 0.983, - "step": 860 - }, - { - "epoch": 0.04989616982002769, - "grad_norm": 0.2481968104839325, - "learning_rate": 9.976931949250289e-05, - "loss": 0.9798, - "step": 865 - }, - { - "epoch": 0.050184586986617444, - "grad_norm": 0.25975415110588074, - "learning_rate": 0.00010034602076124569, - "loss": 0.9621, - "step": 870 - }, - { - "epoch": 0.0504730041532072, - "grad_norm": 0.25389575958251953, - "learning_rate": 0.00010092272202998847, - "loss": 0.9959, - "step": 875 - }, - { - "epoch": 0.050761421319796954, - "grad_norm": 0.26200932264328003, - "learning_rate": 0.00010149942329873126, - "loss": 0.9432, - "step": 880 - }, - { - "epoch": 0.05104983848638671, - "grad_norm": 0.25433865189552307, - "learning_rate": 0.00010207612456747407, - "loss": 1.0272, - "step": 885 - }, - { - "epoch": 0.051338255652976464, - "grad_norm": 0.29402443766593933, - "learning_rate": 0.00010265282583621685, - "loss": 1.018, - "step": 890 - }, - { - "epoch": 0.05162667281956622, - "grad_norm": 0.2625313699245453, - "learning_rate": 0.00010322952710495964, - "loss": 1.0326, - "step": 895 - }, - { - "epoch": 0.051915089986155974, - "grad_norm": 0.2682657241821289, - "learning_rate": 0.00010380622837370242, - "loss": 1.0215, - "step": 900 - }, - { - "epoch": 0.05220350715274573, - "grad_norm": 0.27114447951316833, - "learning_rate": 0.00010438292964244522, - "loss": 0.9736, - "step": 905 - }, - { - "epoch": 0.052491924319335484, - "grad_norm": 0.2469518631696701, - "learning_rate": 0.00010495963091118801, - "loss": 0.93, - "step": 910 - }, - { - "epoch": 0.05278034148592524, - "grad_norm": 0.262253999710083, - "learning_rate": 0.00010553633217993079, - "loss": 0.9477, - "step": 915 - }, - { - "epoch": 0.053068758652515, - "grad_norm": 0.25354915857315063, - "learning_rate": 0.00010611303344867358, - "loss": 0.9926, - "step": 920 - }, - { - "epoch": 0.053357175819104756, - "grad_norm": 0.24856913089752197, - "learning_rate": 0.00010668973471741639, - "loss": 0.9726, - "step": 925 - }, - { - "epoch": 0.05364559298569451, - "grad_norm": 0.24939557909965515, - "learning_rate": 0.00010726643598615918, - "loss": 0.9575, - "step": 930 - }, - { - "epoch": 0.053934010152284266, - "grad_norm": 0.2722608745098114, - "learning_rate": 0.00010784313725490196, - "loss": 1.0017, - "step": 935 - }, - { - "epoch": 0.05422242731887402, - "grad_norm": 0.25203198194503784, - "learning_rate": 0.00010841983852364477, - "loss": 0.9141, - "step": 940 - }, - { - "epoch": 0.054510844485463776, - "grad_norm": 0.2586802840232849, - "learning_rate": 0.00010899653979238756, - "loss": 1.0066, - "step": 945 - }, - { - "epoch": 0.05479926165205353, - "grad_norm": 0.24033570289611816, - "learning_rate": 0.00010957324106113034, - "loss": 1.0113, - "step": 950 - }, - { - "epoch": 0.055087678818643286, - "grad_norm": 0.2373732328414917, - "learning_rate": 0.00011014994232987313, - "loss": 1.0172, - "step": 955 - }, - { - "epoch": 0.05537609598523304, - "grad_norm": 0.25045233964920044, - "learning_rate": 0.00011072664359861593, - "loss": 0.9548, - "step": 960 - }, - { - "epoch": 0.055664513151822796, - "grad_norm": 0.25307127833366394, - "learning_rate": 0.00011130334486735871, - "loss": 0.8803, - "step": 965 - }, - { - "epoch": 0.05595293031841255, - "grad_norm": 0.2580971121788025, - "learning_rate": 0.0001118800461361015, - "loss": 1.0257, - "step": 970 - }, - { - "epoch": 0.056241347485002306, - "grad_norm": 0.3492274284362793, - "learning_rate": 0.00011245674740484428, - "loss": 0.9915, - "step": 975 - }, - { - "epoch": 0.05652976465159206, - "grad_norm": 0.3969261944293976, - "learning_rate": 0.0001130334486735871, - "loss": 0.9871, - "step": 980 - }, - { - "epoch": 0.056818181818181816, - "grad_norm": 0.2512189447879791, - "learning_rate": 0.00011361014994232988, - "loss": 0.9999, - "step": 985 - }, - { - "epoch": 0.05710659898477157, - "grad_norm": 0.24583379924297333, - "learning_rate": 0.00011418685121107266, - "loss": 1.019, - "step": 990 - }, - { - "epoch": 0.057395016151361326, - "grad_norm": 0.23418952524662018, - "learning_rate": 0.00011476355247981545, - "loss": 0.9976, - "step": 995 - }, - { - "epoch": 0.05768343331795108, - "grad_norm": 0.24816179275512695, - "learning_rate": 0.00011534025374855826, - "loss": 0.9787, - "step": 1000 - }, - { - "epoch": 0.05797185048454084, - "grad_norm": 0.238878071308136, - "learning_rate": 0.00011591695501730105, - "loss": 0.9831, - "step": 1005 - }, - { - "epoch": 0.0582602676511306, - "grad_norm": 0.240176260471344, - "learning_rate": 0.00011649365628604383, - "loss": 0.9604, - "step": 1010 - }, - { - "epoch": 0.05854868481772035, - "grad_norm": 0.24366143345832825, - "learning_rate": 0.00011707035755478663, - "loss": 1.0633, - "step": 1015 - }, - { - "epoch": 0.05883710198431011, - "grad_norm": 0.24254244565963745, - "learning_rate": 0.00011764705882352942, - "loss": 1.0299, - "step": 1020 - }, - { - "epoch": 0.05912551915089986, - "grad_norm": 0.2483944445848465, - "learning_rate": 0.0001182237600922722, - "loss": 1.0325, - "step": 1025 - }, - { - "epoch": 0.05941393631748962, - "grad_norm": 0.23639345169067383, - "learning_rate": 0.00011880046136101499, - "loss": 0.9192, - "step": 1030 - }, - { - "epoch": 0.059702353484079373, - "grad_norm": 0.26320794224739075, - "learning_rate": 0.0001193771626297578, - "loss": 0.973, - "step": 1035 - }, - { - "epoch": 0.05999077065066913, - "grad_norm": 0.26271867752075195, - "learning_rate": 0.00011995386389850058, - "loss": 1.0339, - "step": 1040 - }, - { - "epoch": 0.060279187817258884, - "grad_norm": 0.2515929043292999, - "learning_rate": 0.00012053056516724337, - "loss": 0.9777, - "step": 1045 - }, - { - "epoch": 0.06056760498384864, - "grad_norm": 0.24450047314167023, - "learning_rate": 0.00012110726643598615, - "loss": 0.9781, - "step": 1050 - }, - { - "epoch": 0.060856022150438394, - "grad_norm": 0.247002974152565, - "learning_rate": 0.00012168396770472896, - "loss": 0.9742, - "step": 1055 - }, - { - "epoch": 0.06114443931702815, - "grad_norm": 0.22039633989334106, - "learning_rate": 0.00012226066897347174, - "loss": 0.9602, - "step": 1060 - }, - { - "epoch": 0.061432856483617904, - "grad_norm": 0.25299662351608276, - "learning_rate": 0.00012283737024221453, - "loss": 0.9429, - "step": 1065 - }, - { - "epoch": 0.06172127365020766, - "grad_norm": 0.24021919071674347, - "learning_rate": 0.00012341407151095733, - "loss": 1.0543, - "step": 1070 - }, - { - "epoch": 0.062009690816797414, - "grad_norm": 0.2851802408695221, - "learning_rate": 0.00012399077277970013, - "loss": 1.0169, - "step": 1075 - }, - { - "epoch": 0.06229810798338717, - "grad_norm": 0.2532206177711487, - "learning_rate": 0.0001245674740484429, - "loss": 0.9388, - "step": 1080 - }, - { - "epoch": 0.06258652514997692, - "grad_norm": 0.2355235517024994, - "learning_rate": 0.0001251441753171857, - "loss": 0.9283, - "step": 1085 - }, - { - "epoch": 0.06287494231656668, - "grad_norm": 0.2673757076263428, - "learning_rate": 0.0001257208765859285, - "loss": 1.0022, - "step": 1090 - }, - { - "epoch": 0.06316335948315643, - "grad_norm": 0.22847038507461548, - "learning_rate": 0.0001262975778546713, - "loss": 0.9481, - "step": 1095 - }, - { - "epoch": 0.06345177664974619, - "grad_norm": 0.25772714614868164, - "learning_rate": 0.00012687427912341407, - "loss": 0.9909, - "step": 1100 - }, - { - "epoch": 0.06374019381633594, - "grad_norm": 0.238713800907135, - "learning_rate": 0.00012745098039215687, - "loss": 0.9379, - "step": 1105 - }, - { - "epoch": 0.0640286109829257, - "grad_norm": 0.24460141360759735, - "learning_rate": 0.00012802768166089967, - "loss": 0.9398, - "step": 1110 - }, - { - "epoch": 0.06431702814951545, - "grad_norm": 0.23570501804351807, - "learning_rate": 0.00012860438292964244, - "loss": 0.9292, - "step": 1115 - }, - { - "epoch": 0.06460544531610521, - "grad_norm": 0.26408931612968445, - "learning_rate": 0.00012918108419838524, - "loss": 1.026, - "step": 1120 - }, - { - "epoch": 0.06489386248269496, - "grad_norm": 0.2372530698776245, - "learning_rate": 0.00012975778546712804, - "loss": 0.9906, - "step": 1125 - }, - { - "epoch": 0.06518227964928472, - "grad_norm": 0.2314678579568863, - "learning_rate": 0.00013033448673587084, - "loss": 0.9447, - "step": 1130 - }, - { - "epoch": 0.06547069681587447, - "grad_norm": 0.25254136323928833, - "learning_rate": 0.0001309111880046136, - "loss": 1.0364, - "step": 1135 - }, - { - "epoch": 0.06575911398246424, - "grad_norm": 0.23922473192214966, - "learning_rate": 0.0001314878892733564, - "loss": 1.0091, - "step": 1140 - }, - { - "epoch": 0.066047531149054, - "grad_norm": 0.24500273168087006, - "learning_rate": 0.0001320645905420992, - "loss": 0.9951, - "step": 1145 - }, - { - "epoch": 0.06633594831564375, - "grad_norm": 0.23815661668777466, - "learning_rate": 0.000132641291810842, - "loss": 1.0065, - "step": 1150 - }, - { - "epoch": 0.06662436548223351, - "grad_norm": 0.26173415780067444, - "learning_rate": 0.00013321799307958477, - "loss": 1.0159, - "step": 1155 - }, - { - "epoch": 0.06691278264882326, - "grad_norm": 0.22709496319293976, - "learning_rate": 0.00013379469434832757, - "loss": 0.9121, - "step": 1160 - }, - { - "epoch": 0.06720119981541302, - "grad_norm": 0.2595439553260803, - "learning_rate": 0.00013437139561707037, - "loss": 1.0136, - "step": 1165 - }, - { - "epoch": 0.06748961698200277, - "grad_norm": 0.23945558071136475, - "learning_rate": 0.00013494809688581317, - "loss": 0.9508, - "step": 1170 - }, - { - "epoch": 0.06777803414859253, - "grad_norm": 0.2526959478855133, - "learning_rate": 0.00013552479815455594, - "loss": 0.9304, - "step": 1175 - }, - { - "epoch": 0.06806645131518228, - "grad_norm": 0.2385508418083191, - "learning_rate": 0.00013610149942329874, - "loss": 1.012, - "step": 1180 - }, - { - "epoch": 0.06835486848177204, - "grad_norm": 0.25558724999427795, - "learning_rate": 0.00013667820069204154, - "loss": 1.0289, - "step": 1185 - }, - { - "epoch": 0.0686432856483618, - "grad_norm": 0.26076334714889526, - "learning_rate": 0.0001372549019607843, - "loss": 0.9564, - "step": 1190 - }, - { - "epoch": 0.06893170281495155, - "grad_norm": 0.24157829582691193, - "learning_rate": 0.0001378316032295271, - "loss": 1.0265, - "step": 1195 - }, - { - "epoch": 0.0692201199815413, - "grad_norm": 0.2505204379558563, - "learning_rate": 0.00013840830449826988, - "loss": 0.965, - "step": 1200 - }, - { - "epoch": 0.06950853714813106, - "grad_norm": 0.2583898603916168, - "learning_rate": 0.0001389850057670127, - "loss": 1.0161, - "step": 1205 - }, - { - "epoch": 0.06979695431472081, - "grad_norm": 0.24660265445709229, - "learning_rate": 0.00013956170703575548, - "loss": 1.0086, - "step": 1210 - }, - { - "epoch": 0.07008537148131057, - "grad_norm": 0.2303483486175537, - "learning_rate": 0.00014013840830449828, - "loss": 1.0004, - "step": 1215 - }, - { - "epoch": 0.07037378864790032, - "grad_norm": 0.25441575050354004, - "learning_rate": 0.00014071510957324108, - "loss": 1.0218, - "step": 1220 - }, - { - "epoch": 0.07066220581449008, - "grad_norm": 0.2441866099834442, - "learning_rate": 0.00014129181084198387, - "loss": 0.9947, - "step": 1225 - }, - { - "epoch": 0.07095062298107983, - "grad_norm": 0.2431473582983017, - "learning_rate": 0.00014186851211072665, - "loss": 0.977, - "step": 1230 - }, - { - "epoch": 0.07123904014766959, - "grad_norm": 0.22348998486995697, - "learning_rate": 0.00014244521337946944, - "loss": 0.9626, - "step": 1235 - }, - { - "epoch": 0.07152745731425934, - "grad_norm": 0.25038719177246094, - "learning_rate": 0.00014302191464821224, - "loss": 1.0234, - "step": 1240 - }, - { - "epoch": 0.0718158744808491, - "grad_norm": 0.24543331563472748, - "learning_rate": 0.00014359861591695501, - "loss": 0.9782, - "step": 1245 - }, - { - "epoch": 0.07210429164743885, - "grad_norm": 0.2646369934082031, - "learning_rate": 0.0001441753171856978, - "loss": 1.0049, - "step": 1250 - }, - { - "epoch": 0.07239270881402861, - "grad_norm": 0.24707183241844177, - "learning_rate": 0.00014475201845444058, - "loss": 1.0426, - "step": 1255 - }, - { - "epoch": 0.07268112598061836, - "grad_norm": 0.24609191715717316, - "learning_rate": 0.0001453287197231834, - "loss": 0.9978, - "step": 1260 - }, - { - "epoch": 0.07296954314720812, - "grad_norm": 0.2498229593038559, - "learning_rate": 0.00014590542099192618, - "loss": 1.0299, - "step": 1265 - }, - { - "epoch": 0.07325796031379787, - "grad_norm": 0.24294817447662354, - "learning_rate": 0.00014648212226066898, - "loss": 0.9387, - "step": 1270 - }, - { - "epoch": 0.07354637748038763, - "grad_norm": 0.22789110243320465, - "learning_rate": 0.00014705882352941178, - "loss": 0.9859, - "step": 1275 - }, - { - "epoch": 0.07383479464697738, - "grad_norm": 0.2392035871744156, - "learning_rate": 0.00014763552479815458, - "loss": 0.9821, - "step": 1280 - }, - { - "epoch": 0.07412321181356714, - "grad_norm": 0.24138358235359192, - "learning_rate": 0.00014821222606689735, - "loss": 0.9644, - "step": 1285 - }, - { - "epoch": 0.0744116289801569, - "grad_norm": 0.2574746012687683, - "learning_rate": 0.00014878892733564015, - "loss": 0.9894, - "step": 1290 - }, - { - "epoch": 0.07470004614674665, - "grad_norm": 0.2577558755874634, - "learning_rate": 0.00014936562860438295, - "loss": 1.0049, - "step": 1295 - }, - { - "epoch": 0.0749884633133364, - "grad_norm": 0.2638446092605591, - "learning_rate": 0.00014994232987312572, - "loss": 0.9866, - "step": 1300 - }, - { - "epoch": 0.07527688047992616, - "grad_norm": 0.2279583364725113, - "learning_rate": 0.00015051903114186852, - "loss": 0.9697, - "step": 1305 - }, - { - "epoch": 0.07556529764651591, - "grad_norm": 0.25132206082344055, - "learning_rate": 0.0001510957324106113, - "loss": 0.9654, - "step": 1310 - }, - { - "epoch": 0.07585371481310568, - "grad_norm": 0.24250829219818115, - "learning_rate": 0.00015167243367935411, - "loss": 0.9594, - "step": 1315 - }, - { - "epoch": 0.07614213197969544, - "grad_norm": 0.24679099023342133, - "learning_rate": 0.00015224913494809689, - "loss": 0.9514, - "step": 1320 - }, - { - "epoch": 0.07643054914628519, - "grad_norm": 0.26517555117607117, - "learning_rate": 0.00015282583621683968, - "loss": 0.9575, - "step": 1325 - }, - { - "epoch": 0.07671896631287495, - "grad_norm": 0.23794426023960114, - "learning_rate": 0.00015340253748558246, - "loss": 0.9982, - "step": 1330 - }, - { - "epoch": 0.0770073834794647, - "grad_norm": 0.2488831728696823, - "learning_rate": 0.00015397923875432528, - "loss": 0.9454, - "step": 1335 - }, - { - "epoch": 0.07729580064605446, - "grad_norm": 0.26782914996147156, - "learning_rate": 0.00015455594002306805, - "loss": 1.0235, - "step": 1340 - }, - { - "epoch": 0.07758421781264421, - "grad_norm": 0.25021234154701233, - "learning_rate": 0.00015513264129181085, - "loss": 0.9243, - "step": 1345 - }, - { - "epoch": 0.07787263497923397, - "grad_norm": 0.2522822618484497, - "learning_rate": 0.00015570934256055365, - "loss": 1.0428, - "step": 1350 - }, - { - "epoch": 0.07816105214582372, - "grad_norm": 0.27001574635505676, - "learning_rate": 0.00015628604382929645, - "loss": 0.9755, - "step": 1355 - }, - { - "epoch": 0.07844946931241348, - "grad_norm": 0.24071645736694336, - "learning_rate": 0.00015686274509803922, - "loss": 1.013, - "step": 1360 - }, - { - "epoch": 0.07873788647900323, - "grad_norm": 0.24303098022937775, - "learning_rate": 0.00015743944636678202, - "loss": 0.9862, - "step": 1365 - }, - { - "epoch": 0.07902630364559299, - "grad_norm": 0.2542005479335785, - "learning_rate": 0.00015801614763552482, - "loss": 0.9709, - "step": 1370 - }, - { - "epoch": 0.07931472081218274, - "grad_norm": 0.2585870325565338, - "learning_rate": 0.0001585928489042676, - "loss": 1.0085, - "step": 1375 - }, - { - "epoch": 0.0796031379787725, - "grad_norm": 0.2629243731498718, - "learning_rate": 0.0001591695501730104, - "loss": 0.985, - "step": 1380 - }, - { - "epoch": 0.07989155514536225, - "grad_norm": 0.24008338153362274, - "learning_rate": 0.00015974625144175316, - "loss": 0.9839, - "step": 1385 - }, - { - "epoch": 0.08017997231195201, - "grad_norm": 0.2442033439874649, - "learning_rate": 0.00016032295271049598, - "loss": 0.8798, - "step": 1390 - }, - { - "epoch": 0.08046838947854176, - "grad_norm": 0.250362366437912, - "learning_rate": 0.00016089965397923876, - "loss": 0.9301, - "step": 1395 - }, - { - "epoch": 0.08075680664513152, - "grad_norm": 0.2477293759584427, - "learning_rate": 0.00016147635524798155, - "loss": 0.9561, - "step": 1400 - }, - { - "epoch": 0.08104522381172127, - "grad_norm": 0.23329582810401917, - "learning_rate": 0.00016205305651672435, - "loss": 0.9505, - "step": 1405 - }, - { - "epoch": 0.08133364097831103, - "grad_norm": 0.24549901485443115, - "learning_rate": 0.00016262975778546715, - "loss": 1.0284, - "step": 1410 - }, - { - "epoch": 0.08162205814490078, - "grad_norm": 0.24419653415679932, - "learning_rate": 0.00016320645905420992, - "loss": 0.9114, - "step": 1415 - }, - { - "epoch": 0.08191047531149054, - "grad_norm": 0.24551044404506683, - "learning_rate": 0.00016378316032295272, - "loss": 0.9574, - "step": 1420 - }, - { - "epoch": 0.0821988924780803, - "grad_norm": 0.29641515016555786, - "learning_rate": 0.00016435986159169552, - "loss": 0.9821, - "step": 1425 - }, - { - "epoch": 0.08248730964467005, - "grad_norm": 0.24953129887580872, - "learning_rate": 0.0001649365628604383, - "loss": 0.9966, - "step": 1430 - }, - { - "epoch": 0.0827757268112598, - "grad_norm": 0.25181591510772705, - "learning_rate": 0.0001655132641291811, - "loss": 1.023, - "step": 1435 - }, - { - "epoch": 0.08306414397784956, - "grad_norm": 0.2478877305984497, - "learning_rate": 0.00016608996539792386, - "loss": 0.9762, - "step": 1440 - }, - { - "epoch": 0.08335256114443931, - "grad_norm": 0.24414442479610443, - "learning_rate": 0.0001666666666666667, - "loss": 0.9339, - "step": 1445 - }, - { - "epoch": 0.08364097831102907, - "grad_norm": 0.24295495450496674, - "learning_rate": 0.00016724336793540946, - "loss": 1.0144, - "step": 1450 - }, - { - "epoch": 0.08392939547761882, - "grad_norm": 0.25291165709495544, - "learning_rate": 0.00016782006920415226, - "loss": 0.916, - "step": 1455 - }, - { - "epoch": 0.08421781264420858, - "grad_norm": 0.23744194209575653, - "learning_rate": 0.00016839677047289503, - "loss": 0.952, - "step": 1460 - }, - { - "epoch": 0.08450622981079833, - "grad_norm": 0.24316394329071045, - "learning_rate": 0.00016897347174163786, - "loss": 0.9725, - "step": 1465 - }, - { - "epoch": 0.08479464697738809, - "grad_norm": 0.23748493194580078, - "learning_rate": 0.00016955017301038063, - "loss": 0.9831, - "step": 1470 - }, - { - "epoch": 0.08508306414397784, - "grad_norm": 0.25356602668762207, - "learning_rate": 0.00017012687427912343, - "loss": 0.9632, - "step": 1475 - }, - { - "epoch": 0.0853714813105676, - "grad_norm": 0.24660415947437286, - "learning_rate": 0.00017070357554786622, - "loss": 0.9319, - "step": 1480 - }, - { - "epoch": 0.08565989847715735, - "grad_norm": 0.25426214933395386, - "learning_rate": 0.000171280276816609, - "loss": 1.0245, - "step": 1485 - }, - { - "epoch": 0.08594831564374712, - "grad_norm": 0.23765899240970612, - "learning_rate": 0.0001718569780853518, - "loss": 0.9202, - "step": 1490 - }, - { - "epoch": 0.08623673281033688, - "grad_norm": 0.24204228818416595, - "learning_rate": 0.00017243367935409457, - "loss": 0.9974, - "step": 1495 - }, - { - "epoch": 0.08652514997692663, - "grad_norm": 0.23034018278121948, - "learning_rate": 0.0001730103806228374, - "loss": 0.9251, - "step": 1500 - }, - { - "epoch": 0.08681356714351639, - "grad_norm": 0.24768561124801636, - "learning_rate": 0.00017358708189158016, - "loss": 0.957, - "step": 1505 - }, - { - "epoch": 0.08710198431010614, - "grad_norm": 0.24252378940582275, - "learning_rate": 0.00017416378316032296, - "loss": 0.9347, - "step": 1510 - }, - { - "epoch": 0.0873904014766959, - "grad_norm": 0.24422116577625275, - "learning_rate": 0.00017474048442906573, - "loss": 0.956, - "step": 1515 - }, - { - "epoch": 0.08767881864328565, - "grad_norm": 0.25470009446144104, - "learning_rate": 0.00017531718569780856, - "loss": 0.9355, - "step": 1520 - }, - { - "epoch": 0.08796723580987541, - "grad_norm": 0.240427628159523, - "learning_rate": 0.00017589388696655133, - "loss": 1.0345, - "step": 1525 - }, - { - "epoch": 0.08825565297646516, - "grad_norm": 0.2679055631160736, - "learning_rate": 0.00017647058823529413, - "loss": 1.0215, - "step": 1530 - }, - { - "epoch": 0.08854407014305492, - "grad_norm": 0.2706778943538666, - "learning_rate": 0.00017704728950403693, - "loss": 0.9951, - "step": 1535 - }, - { - "epoch": 0.08883248730964467, - "grad_norm": 0.24882011115550995, - "learning_rate": 0.00017762399077277973, - "loss": 1.0267, - "step": 1540 - }, - { - "epoch": 0.08912090447623443, - "grad_norm": 0.24369126558303833, - "learning_rate": 0.0001782006920415225, - "loss": 1.046, - "step": 1545 - }, - { - "epoch": 0.08940932164282418, - "grad_norm": 0.27035751938819885, - "learning_rate": 0.0001787773933102653, - "loss": 1.0522, - "step": 1550 - }, - { - "epoch": 0.08969773880941394, - "grad_norm": 0.25707873702049255, - "learning_rate": 0.0001793540945790081, - "loss": 0.9507, - "step": 1555 - }, - { - "epoch": 0.08998615597600369, - "grad_norm": 0.26456013321876526, - "learning_rate": 0.00017993079584775087, - "loss": 0.9941, - "step": 1560 - }, - { - "epoch": 0.09027457314259345, - "grad_norm": 0.26937803626060486, - "learning_rate": 0.00018050749711649367, - "loss": 1.0267, - "step": 1565 - }, - { - "epoch": 0.0905629903091832, - "grad_norm": 0.2615615725517273, - "learning_rate": 0.00018108419838523644, - "loss": 0.984, - "step": 1570 - }, - { - "epoch": 0.09085140747577296, - "grad_norm": 0.23720060288906097, - "learning_rate": 0.00018166089965397926, - "loss": 0.9401, - "step": 1575 - }, - { - "epoch": 0.09113982464236271, - "grad_norm": 0.24640457332134247, - "learning_rate": 0.00018223760092272203, - "loss": 1.086, - "step": 1580 - }, - { - "epoch": 0.09142824180895247, - "grad_norm": 0.2521013915538788, - "learning_rate": 0.00018281430219146483, - "loss": 0.9619, - "step": 1585 - }, - { - "epoch": 0.09171665897554222, - "grad_norm": 0.23948408663272858, - "learning_rate": 0.0001833910034602076, - "loss": 0.9835, - "step": 1590 - }, - { - "epoch": 0.09200507614213198, - "grad_norm": 0.25325456261634827, - "learning_rate": 0.00018396770472895043, - "loss": 1.0552, - "step": 1595 - }, - { - "epoch": 0.09229349330872173, - "grad_norm": 0.24731087684631348, - "learning_rate": 0.0001845444059976932, - "loss": 0.9253, - "step": 1600 - }, - { - "epoch": 0.09258191047531149, - "grad_norm": 0.26164206862449646, - "learning_rate": 0.000185121107266436, - "loss": 0.9396, - "step": 1605 - }, - { - "epoch": 0.09287032764190124, - "grad_norm": 0.25318196415901184, - "learning_rate": 0.0001856978085351788, - "loss": 0.9431, - "step": 1610 - }, - { - "epoch": 0.093158744808491, - "grad_norm": 0.2592536211013794, - "learning_rate": 0.00018627450980392157, - "loss": 0.9955, - "step": 1615 - }, - { - "epoch": 0.09344716197508075, - "grad_norm": 0.2497592270374298, - "learning_rate": 0.00018685121107266437, - "loss": 0.9844, - "step": 1620 - }, - { - "epoch": 0.09373557914167051, - "grad_norm": 0.2648375630378723, - "learning_rate": 0.00018742791234140714, - "loss": 0.9655, - "step": 1625 - }, - { - "epoch": 0.09402399630826026, - "grad_norm": 0.25172188878059387, - "learning_rate": 0.00018800461361014997, - "loss": 1.0322, - "step": 1630 - }, - { - "epoch": 0.09431241347485002, - "grad_norm": 0.24844340980052948, - "learning_rate": 0.00018858131487889274, - "loss": 0.9636, - "step": 1635 - }, - { - "epoch": 0.09460083064143977, - "grad_norm": 0.25023674964904785, - "learning_rate": 0.00018915801614763554, - "loss": 0.9601, - "step": 1640 - }, - { - "epoch": 0.09488924780802953, - "grad_norm": 0.2417484074831009, - "learning_rate": 0.0001897347174163783, - "loss": 0.9748, - "step": 1645 - }, - { - "epoch": 0.09517766497461928, - "grad_norm": 0.2597021162509918, - "learning_rate": 0.00019031141868512113, - "loss": 0.9672, - "step": 1650 - }, - { - "epoch": 0.09546608214120904, - "grad_norm": 0.25209182500839233, - "learning_rate": 0.0001908881199538639, - "loss": 0.9766, - "step": 1655 - }, - { - "epoch": 0.0957544993077988, - "grad_norm": 0.2704354226589203, - "learning_rate": 0.0001914648212226067, - "loss": 0.9658, - "step": 1660 - }, - { - "epoch": 0.09604291647438856, - "grad_norm": 0.2553963363170624, - "learning_rate": 0.00019204152249134948, - "loss": 0.972, - "step": 1665 - }, - { - "epoch": 0.09633133364097832, - "grad_norm": 0.25183454155921936, - "learning_rate": 0.00019261822376009227, - "loss": 0.9312, - "step": 1670 - }, - { - "epoch": 0.09661975080756807, - "grad_norm": 0.27272742986679077, - "learning_rate": 0.00019319492502883507, - "loss": 1.0585, - "step": 1675 - }, - { - "epoch": 0.09690816797415783, - "grad_norm": 0.25347381830215454, - "learning_rate": 0.00019377162629757784, - "loss": 1.0013, - "step": 1680 - }, - { - "epoch": 0.09719658514074758, - "grad_norm": 0.26412150263786316, - "learning_rate": 0.00019434832756632067, - "loss": 0.9175, - "step": 1685 - }, - { - "epoch": 0.09748500230733734, - "grad_norm": 0.2841266393661499, - "learning_rate": 0.00019492502883506344, - "loss": 0.8907, - "step": 1690 - }, - { - "epoch": 0.09777341947392709, - "grad_norm": 0.2843879163265228, - "learning_rate": 0.00019550173010380624, - "loss": 0.9952, - "step": 1695 - }, - { - "epoch": 0.09806183664051685, - "grad_norm": 0.24573901295661926, - "learning_rate": 0.000196078431372549, - "loss": 1.0093, - "step": 1700 - }, - { - "epoch": 0.0983502538071066, - "grad_norm": 0.25996410846710205, - "learning_rate": 0.00019665513264129184, - "loss": 1.0403, - "step": 1705 - }, - { - "epoch": 0.09863867097369636, - "grad_norm": 0.26386144757270813, - "learning_rate": 0.0001972318339100346, - "loss": 1.0211, - "step": 1710 - }, - { - "epoch": 0.09892708814028611, - "grad_norm": 0.26584669947624207, - "learning_rate": 0.0001978085351787774, - "loss": 0.9985, - "step": 1715 - }, - { - "epoch": 0.09921550530687587, - "grad_norm": 0.25835517048835754, - "learning_rate": 0.00019838523644752018, - "loss": 0.9615, - "step": 1720 - }, - { - "epoch": 0.09950392247346562, - "grad_norm": 0.2537446618080139, - "learning_rate": 0.000198961937716263, - "loss": 0.9851, - "step": 1725 - }, - { - "epoch": 0.09979233964005538, - "grad_norm": 0.2637675702571869, - "learning_rate": 0.00019953863898500578, - "loss": 0.9991, - "step": 1730 - }, - { - "epoch": 0.10008075680664513, - "grad_norm": 0.2486466020345688, - "learning_rate": 0.00019999999797274117, - "loss": 0.928, - "step": 1735 - }, - { - "epoch": 0.10036917397323489, - "grad_norm": 0.31705260276794434, - "learning_rate": 0.0001999999270186907, - "loss": 0.9909, - "step": 1740 - }, - { - "epoch": 0.10065759113982464, - "grad_norm": 0.2822314500808716, - "learning_rate": 0.0001999997547017808, - "loss": 0.9688, - "step": 1745 - }, - { - "epoch": 0.1009460083064144, - "grad_norm": 0.2564781606197357, - "learning_rate": 0.0001999994810221862, - "loss": 0.9515, - "step": 1750 - }, - { - "epoch": 0.10123442547300415, - "grad_norm": 0.2958817183971405, - "learning_rate": 0.00019999910598018426, - "loss": 0.9859, - "step": 1755 - }, - { - "epoch": 0.10152284263959391, - "grad_norm": 0.25060567259788513, - "learning_rate": 0.00019999862957615513, - "loss": 1.0043, - "step": 1760 - }, - { - "epoch": 0.10181125980618366, - "grad_norm": 0.2674092650413513, - "learning_rate": 0.00019999805181058176, - "loss": 0.9626, - "step": 1765 - }, - { - "epoch": 0.10209967697277342, - "grad_norm": 0.2575248181819916, - "learning_rate": 0.00019999737268404973, - "loss": 1.0265, - "step": 1770 - }, - { - "epoch": 0.10238809413936317, - "grad_norm": 0.2554805278778076, - "learning_rate": 0.00019999659219724749, - "loss": 0.9661, - "step": 1775 - }, - { - "epoch": 0.10267651130595293, - "grad_norm": 0.26680126786231995, - "learning_rate": 0.00019999571035096608, - "loss": 1.0231, - "step": 1780 - }, - { - "epoch": 0.10296492847254268, - "grad_norm": 0.25776219367980957, - "learning_rate": 0.00019999472714609943, - "loss": 0.9058, - "step": 1785 - }, - { - "epoch": 0.10325334563913244, - "grad_norm": 0.2542843818664551, - "learning_rate": 0.00019999364258364413, - "loss": 0.9773, - "step": 1790 - }, - { - "epoch": 0.10354176280572219, - "grad_norm": 0.2621992826461792, - "learning_rate": 0.0001999924566646995, - "loss": 0.9559, - "step": 1795 - }, - { - "epoch": 0.10383017997231195, - "grad_norm": 0.2683923840522766, - "learning_rate": 0.00019999116939046764, - "loss": 1.0355, - "step": 1800 - }, - { - "epoch": 0.1041185971389017, - "grad_norm": 0.24701032042503357, - "learning_rate": 0.0001999897807622534, - "loss": 1.0906, - "step": 1805 - }, - { - "epoch": 0.10440701430549146, - "grad_norm": 0.25396963953971863, - "learning_rate": 0.0001999882907814643, - "loss": 1.0226, - "step": 1810 - }, - { - "epoch": 0.10469543147208121, - "grad_norm": 0.28205832839012146, - "learning_rate": 0.00019998669944961062, - "loss": 0.9224, - "step": 1815 - }, - { - "epoch": 0.10498384863867097, - "grad_norm": 0.26078683137893677, - "learning_rate": 0.0001999850067683054, - "loss": 0.9427, - "step": 1820 - }, - { - "epoch": 0.10527226580526072, - "grad_norm": 0.25481727719306946, - "learning_rate": 0.00019998321273926437, - "loss": 1.0042, - "step": 1825 - }, - { - "epoch": 0.10556068297185048, - "grad_norm": 0.25570574402809143, - "learning_rate": 0.00019998131736430604, - "loss": 0.9722, - "step": 1830 - }, - { - "epoch": 0.10584910013844025, - "grad_norm": 0.2734397351741791, - "learning_rate": 0.00019997932064535158, - "loss": 1.001, - "step": 1835 - }, - { - "epoch": 0.10613751730503, - "grad_norm": 0.27242162823677063, - "learning_rate": 0.00019997722258442499, - "loss": 0.9647, - "step": 1840 - }, - { - "epoch": 0.10642593447161976, - "grad_norm": 0.2732183635234833, - "learning_rate": 0.00019997502318365286, - "loss": 0.9697, - "step": 1845 - }, - { - "epoch": 0.10671435163820951, - "grad_norm": 0.26898330450057983, - "learning_rate": 0.00019997272244526456, - "loss": 0.9284, - "step": 1850 - }, - { - "epoch": 0.10700276880479927, - "grad_norm": 0.2656812071800232, - "learning_rate": 0.00019997032037159224, - "loss": 1.0368, - "step": 1855 - }, - { - "epoch": 0.10729118597138902, - "grad_norm": 0.2728678584098816, - "learning_rate": 0.00019996781696507069, - "loss": 1.0147, - "step": 1860 - }, - { - "epoch": 0.10757960313797878, - "grad_norm": 0.2543455958366394, - "learning_rate": 0.00019996521222823743, - "loss": 0.954, - "step": 1865 - }, - { - "epoch": 0.10786802030456853, - "grad_norm": 0.27658751606941223, - "learning_rate": 0.00019996250616373268, - "loss": 0.9796, - "step": 1870 - }, - { - "epoch": 0.10815643747115829, - "grad_norm": 0.27136722207069397, - "learning_rate": 0.00019995969877429945, - "loss": 0.9125, - "step": 1875 - }, - { - "epoch": 0.10844485463774804, - "grad_norm": 0.2712014317512512, - "learning_rate": 0.0001999567900627833, - "loss": 1.0053, - "step": 1880 - }, - { - "epoch": 0.1087332718043378, - "grad_norm": 0.2740635573863983, - "learning_rate": 0.0001999537800321327, - "loss": 0.9951, - "step": 1885 - }, - { - "epoch": 0.10902168897092755, - "grad_norm": 0.26667481660842896, - "learning_rate": 0.0001999506686853986, - "loss": 1.0062, - "step": 1890 - }, - { - "epoch": 0.10931010613751731, - "grad_norm": 0.2604423463344574, - "learning_rate": 0.0001999474560257348, - "loss": 0.9852, - "step": 1895 - }, - { - "epoch": 0.10959852330410706, - "grad_norm": 0.27640554308891296, - "learning_rate": 0.00019994414205639775, - "loss": 0.959, - "step": 1900 - }, - { - "epoch": 0.10988694047069682, - "grad_norm": 0.25489839911460876, - "learning_rate": 0.00019994072678074655, - "loss": 0.9957, - "step": 1905 - }, - { - "epoch": 0.11017535763728657, - "grad_norm": 0.2796529233455658, - "learning_rate": 0.00019993721020224308, - "loss": 0.9418, - "step": 1910 - }, - { - "epoch": 0.11046377480387633, - "grad_norm": 0.2622373402118683, - "learning_rate": 0.00019993359232445176, - "loss": 0.9573, - "step": 1915 - }, - { - "epoch": 0.11075219197046608, - "grad_norm": 0.2514156997203827, - "learning_rate": 0.0001999298731510399, - "loss": 0.9373, - "step": 1920 - }, - { - "epoch": 0.11104060913705584, - "grad_norm": 0.2672327160835266, - "learning_rate": 0.00019992605268577727, - "loss": 0.9097, - "step": 1925 - }, - { - "epoch": 0.11132902630364559, - "grad_norm": 0.26772674918174744, - "learning_rate": 0.00019992213093253643, - "loss": 1.0108, - "step": 1930 - }, - { - "epoch": 0.11161744347023535, - "grad_norm": 0.2462950050830841, - "learning_rate": 0.00019991810789529257, - "loss": 1.0006, - "step": 1935 - }, - { - "epoch": 0.1119058606368251, - "grad_norm": 0.26759883761405945, - "learning_rate": 0.0001999139835781236, - "loss": 0.9758, - "step": 1940 - }, - { - "epoch": 0.11219427780341486, - "grad_norm": 0.2841535806655884, - "learning_rate": 0.00019990975798521, - "loss": 1.0408, - "step": 1945 - }, - { - "epoch": 0.11248269497000461, - "grad_norm": 0.2822214365005493, - "learning_rate": 0.00019990543112083503, - "loss": 0.9317, - "step": 1950 - }, - { - "epoch": 0.11277111213659437, - "grad_norm": 0.2670351564884186, - "learning_rate": 0.00019990100298938442, - "loss": 0.9536, - "step": 1955 - }, - { - "epoch": 0.11305952930318412, - "grad_norm": 0.27470991015434265, - "learning_rate": 0.00019989647359534672, - "loss": 1.0404, - "step": 1960 - }, - { - "epoch": 0.11334794646977388, - "grad_norm": 0.2892574071884155, - "learning_rate": 0.00019989184294331308, - "loss": 0.9912, - "step": 1965 - }, - { - "epoch": 0.11363636363636363, - "grad_norm": 0.28786224126815796, - "learning_rate": 0.0001998871110379772, - "loss": 1.048, - "step": 1970 - }, - { - "epoch": 0.11392478080295339, - "grad_norm": 0.2730783522129059, - "learning_rate": 0.0001998822778841355, - "loss": 1.0148, - "step": 1975 - }, - { - "epoch": 0.11421319796954314, - "grad_norm": 0.25908493995666504, - "learning_rate": 0.00019987734348668706, - "loss": 0.9237, - "step": 1980 - }, - { - "epoch": 0.1145016151361329, - "grad_norm": 0.2924931049346924, - "learning_rate": 0.00019987230785063344, - "loss": 1.0084, - "step": 1985 - }, - { - "epoch": 0.11479003230272265, - "grad_norm": 0.2685001790523529, - "learning_rate": 0.00019986717098107896, - "loss": 0.977, - "step": 1990 - }, - { - "epoch": 0.11507844946931241, - "grad_norm": 0.26407670974731445, - "learning_rate": 0.0001998619328832305, - "loss": 1.0132, - "step": 1995 - }, - { - "epoch": 0.11536686663590216, - "grad_norm": 0.2581160366535187, - "learning_rate": 0.00019985659356239758, - "loss": 1.0553, - "step": 2000 - }, - { - "epoch": 0.11565528380249192, - "grad_norm": 0.2579261064529419, - "learning_rate": 0.0001998511530239922, - "loss": 0.992, - "step": 2005 - }, - { - "epoch": 0.11594370096908169, - "grad_norm": 0.27874529361724854, - "learning_rate": 0.00019984561127352914, - "loss": 1.0208, - "step": 2010 - }, - { - "epoch": 0.11623211813567144, - "grad_norm": 0.2448752522468567, - "learning_rate": 0.00019983996831662566, - "loss": 1.0272, - "step": 2015 - }, - { - "epoch": 0.1165205353022612, - "grad_norm": 0.2515913248062134, - "learning_rate": 0.00019983422415900158, - "loss": 1.0251, - "step": 2020 - }, - { - "epoch": 0.11680895246885095, - "grad_norm": 0.2612157464027405, - "learning_rate": 0.0001998283788064794, - "loss": 0.9298, - "step": 2025 - }, - { - "epoch": 0.1170973696354407, - "grad_norm": 0.2781950533390045, - "learning_rate": 0.00019982243226498411, - "loss": 1.0191, - "step": 2030 - }, - { - "epoch": 0.11738578680203046, - "grad_norm": 0.27393776178359985, - "learning_rate": 0.00019981638454054333, - "loss": 0.8712, - "step": 2035 - }, - { - "epoch": 0.11767420396862022, - "grad_norm": 0.271932452917099, - "learning_rate": 0.00019981023563928716, - "loss": 0.9644, - "step": 2040 - }, - { - "epoch": 0.11796262113520997, - "grad_norm": 0.2659457325935364, - "learning_rate": 0.00019980398556744837, - "loss": 0.9295, - "step": 2045 - }, - { - "epoch": 0.11825103830179973, - "grad_norm": 0.2813827395439148, - "learning_rate": 0.00019979763433136216, - "loss": 0.975, - "step": 2050 - }, - { - "epoch": 0.11853945546838948, - "grad_norm": 0.24046528339385986, - "learning_rate": 0.00019979118193746637, - "loss": 0.9836, - "step": 2055 - }, - { - "epoch": 0.11882787263497924, - "grad_norm": 0.27069780230522156, - "learning_rate": 0.00019978462839230133, - "loss": 1.0503, - "step": 2060 - }, - { - "epoch": 0.11911628980156899, - "grad_norm": 0.2609676718711853, - "learning_rate": 0.00019977797370250986, - "loss": 0.959, - "step": 2065 - }, - { - "epoch": 0.11940470696815875, - "grad_norm": 0.2760465145111084, - "learning_rate": 0.0001997712178748374, - "loss": 1.0014, - "step": 2070 - }, - { - "epoch": 0.1196931241347485, - "grad_norm": 0.2539708614349365, - "learning_rate": 0.00019976436091613184, - "loss": 1.0215, - "step": 2075 - }, - { - "epoch": 0.11998154130133826, - "grad_norm": 0.27062153816223145, - "learning_rate": 0.0001997574028333436, - "loss": 0.964, - "step": 2080 - }, - { - "epoch": 0.12026995846792801, - "grad_norm": 0.26900675892829895, - "learning_rate": 0.00019975034363352556, - "loss": 0.935, - "step": 2085 - }, - { - "epoch": 0.12055837563451777, - "grad_norm": 0.27462172508239746, - "learning_rate": 0.0001997431833238332, - "loss": 0.974, - "step": 2090 - }, - { - "epoch": 0.12084679280110752, - "grad_norm": 0.3665010333061218, - "learning_rate": 0.00019973592191152437, - "loss": 1.0159, - "step": 2095 - }, - { - "epoch": 0.12113520996769728, - "grad_norm": 0.28900420665740967, - "learning_rate": 0.00019972855940395947, - "loss": 1.0202, - "step": 2100 - }, - { - "epoch": 0.12142362713428703, - "grad_norm": 0.2706412374973297, - "learning_rate": 0.00019972109580860132, - "loss": 0.9766, - "step": 2105 - }, - { - "epoch": 0.12171204430087679, - "grad_norm": 0.28748854994773865, - "learning_rate": 0.00019971353113301527, - "loss": 1.095, - "step": 2110 - }, - { - "epoch": 0.12200046146746654, - "grad_norm": 0.2745112180709839, - "learning_rate": 0.0001997058653848691, - "loss": 0.9995, - "step": 2115 - }, - { - "epoch": 0.1222888786340563, - "grad_norm": 0.27372869849205017, - "learning_rate": 0.00019969809857193306, - "loss": 0.9582, - "step": 2120 - }, - { - "epoch": 0.12257729580064605, - "grad_norm": 0.2714395821094513, - "learning_rate": 0.00019969023070207973, - "loss": 0.9423, - "step": 2125 - }, - { - "epoch": 0.12286571296723581, - "grad_norm": 0.26695722341537476, - "learning_rate": 0.0001996822617832843, - "loss": 0.9192, - "step": 2130 - }, - { - "epoch": 0.12315413013382556, - "grad_norm": 0.2779480814933777, - "learning_rate": 0.00019967419182362429, - "loss": 0.9577, - "step": 2135 - }, - { - "epoch": 0.12344254730041532, - "grad_norm": 0.279851496219635, - "learning_rate": 0.0001996660208312796, - "loss": 0.9946, - "step": 2140 - }, - { - "epoch": 0.12373096446700507, - "grad_norm": 0.2676329016685486, - "learning_rate": 0.00019965774881453263, - "loss": 1.0293, - "step": 2145 - }, - { - "epoch": 0.12401938163359483, - "grad_norm": 0.2577393054962158, - "learning_rate": 0.00019964937578176816, - "loss": 0.9845, - "step": 2150 - }, - { - "epoch": 0.12430779880018458, - "grad_norm": 0.2870205342769623, - "learning_rate": 0.00019964090174147327, - "loss": 0.9747, - "step": 2155 - }, - { - "epoch": 0.12459621596677434, - "grad_norm": 0.2597945034503937, - "learning_rate": 0.00019963232670223752, - "loss": 0.9896, - "step": 2160 - }, - { - "epoch": 0.12488463313336409, - "grad_norm": 0.3189765512943268, - "learning_rate": 0.00019962365067275286, - "loss": 0.9538, - "step": 2165 - }, - { - "epoch": 0.12517305029995385, - "grad_norm": 0.27205929160118103, - "learning_rate": 0.00019961487366181355, - "loss": 0.9626, - "step": 2170 - }, - { - "epoch": 0.1254614674665436, - "grad_norm": 0.26647019386291504, - "learning_rate": 0.0001996059956783162, - "loss": 1.0142, - "step": 2175 - }, - { - "epoch": 0.12574988463313336, - "grad_norm": 0.2724989652633667, - "learning_rate": 0.00019959701673125983, - "loss": 1.0228, - "step": 2180 - }, - { - "epoch": 0.1260383017997231, - "grad_norm": 0.27627307176589966, - "learning_rate": 0.00019958793682974574, - "loss": 0.9744, - "step": 2185 - }, - { - "epoch": 0.12632671896631287, - "grad_norm": 0.2836136221885681, - "learning_rate": 0.00019957875598297759, - "loss": 1.0011, - "step": 2190 - }, - { - "epoch": 0.12661513613290262, - "grad_norm": 0.26454490423202515, - "learning_rate": 0.00019956947420026136, - "loss": 1.0463, - "step": 2195 - }, - { - "epoch": 0.12690355329949238, - "grad_norm": 0.29074445366859436, - "learning_rate": 0.00019956009149100533, - "loss": 0.9643, - "step": 2200 - }, - { - "epoch": 0.12719197046608213, - "grad_norm": 0.2764613926410675, - "learning_rate": 0.00019955060786472012, - "loss": 0.9245, - "step": 2205 - }, - { - "epoch": 0.1274803876326719, - "grad_norm": 0.2702649235725403, - "learning_rate": 0.00019954102333101856, - "loss": 0.9734, - "step": 2210 - }, - { - "epoch": 0.12776880479926164, - "grad_norm": 0.28136304020881653, - "learning_rate": 0.00019953133789961584, - "loss": 0.9782, - "step": 2215 - }, - { - "epoch": 0.1280572219658514, - "grad_norm": 0.29559558629989624, - "learning_rate": 0.0001995215515803294, - "loss": 0.9708, - "step": 2220 - }, - { - "epoch": 0.12834563913244115, - "grad_norm": 0.2811656892299652, - "learning_rate": 0.00019951166438307894, - "loss": 0.9839, - "step": 2225 - }, - { - "epoch": 0.1286340562990309, - "grad_norm": 0.27432867884635925, - "learning_rate": 0.00019950167631788642, - "loss": 0.9697, - "step": 2230 - }, - { - "epoch": 0.12892247346562066, - "grad_norm": 0.28106796741485596, - "learning_rate": 0.000199491587394876, - "loss": 0.9526, - "step": 2235 - }, - { - "epoch": 0.12921089063221042, - "grad_norm": 0.2755594253540039, - "learning_rate": 0.00019948139762427416, - "loss": 0.9943, - "step": 2240 - }, - { - "epoch": 0.12949930779880017, - "grad_norm": 0.27341076731681824, - "learning_rate": 0.00019947110701640952, - "loss": 0.9661, - "step": 2245 - }, - { - "epoch": 0.12978772496538993, - "grad_norm": 0.2582038938999176, - "learning_rate": 0.000199460715581713, - "loss": 0.9083, - "step": 2250 - }, - { - "epoch": 0.13007614213197968, - "grad_norm": 0.2739073932170868, - "learning_rate": 0.00019945022333071752, - "loss": 1.0518, - "step": 2255 - }, - { - "epoch": 0.13036455929856944, - "grad_norm": 0.2646303176879883, - "learning_rate": 0.0001994396302740585, - "loss": 0.9709, - "step": 2260 - }, - { - "epoch": 0.1306529764651592, - "grad_norm": 0.2723826766014099, - "learning_rate": 0.00019942893642247326, - "loss": 0.9845, - "step": 2265 - }, - { - "epoch": 0.13094139363174895, - "grad_norm": 0.27351605892181396, - "learning_rate": 0.00019941814178680144, - "loss": 1.0138, - "step": 2270 - }, - { - "epoch": 0.13122981079833873, - "grad_norm": 0.2802083492279053, - "learning_rate": 0.00019940724637798477, - "loss": 0.9364, - "step": 2275 - }, - { - "epoch": 0.13151822796492849, - "grad_norm": 0.27607461810112, - "learning_rate": 0.00019939625020706724, - "loss": 0.9931, - "step": 2280 - }, - { - "epoch": 0.13180664513151824, - "grad_norm": 0.270385205745697, - "learning_rate": 0.0001993851532851948, - "loss": 0.9763, - "step": 2285 - }, - { - "epoch": 0.132095062298108, - "grad_norm": 0.2873282730579376, - "learning_rate": 0.00019937395562361564, - "loss": 1.0417, - "step": 2290 - }, - { - "epoch": 0.13238347946469775, - "grad_norm": 0.2726912796497345, - "learning_rate": 0.0001993626572336801, - "loss": 0.9555, - "step": 2295 - }, - { - "epoch": 0.1326718966312875, - "grad_norm": 0.2793363332748413, - "learning_rate": 0.00019935125812684047, - "loss": 0.9883, - "step": 2300 - }, - { - "epoch": 0.13296031379787726, - "grad_norm": 0.2792257070541382, - "learning_rate": 0.0001993397583146513, - "loss": 1.0003, - "step": 2305 - }, - { - "epoch": 0.13324873096446702, - "grad_norm": 0.27051353454589844, - "learning_rate": 0.00019932815780876904, - "loss": 0.9726, - "step": 2310 - }, - { - "epoch": 0.13353714813105677, - "grad_norm": 0.28619712591171265, - "learning_rate": 0.00019931645662095237, - "loss": 0.9621, - "step": 2315 - }, - { - "epoch": 0.13382556529764653, - "grad_norm": 0.27812543511390686, - "learning_rate": 0.00019930465476306197, - "loss": 0.9909, - "step": 2320 - }, - { - "epoch": 0.13411398246423628, - "grad_norm": 0.27520883083343506, - "learning_rate": 0.0001992927522470605, - "loss": 1.0185, - "step": 2325 - }, - { - "epoch": 0.13440239963082604, - "grad_norm": 0.27513301372528076, - "learning_rate": 0.00019928074908501272, - "loss": 0.9595, - "step": 2330 - }, - { - "epoch": 0.1346908167974158, - "grad_norm": 0.29639777541160583, - "learning_rate": 0.0001992686452890854, - "loss": 0.9819, - "step": 2335 - }, - { - "epoch": 0.13497923396400555, - "grad_norm": 0.2893521189689636, - "learning_rate": 0.00019925644087154734, - "loss": 0.9894, - "step": 2340 - }, - { - "epoch": 0.1352676511305953, - "grad_norm": 0.267421156167984, - "learning_rate": 0.0001992441358447692, - "loss": 0.9882, - "step": 2345 - }, - { - "epoch": 0.13555606829718506, - "grad_norm": 0.2774795591831207, - "learning_rate": 0.00019923173022122378, - "loss": 0.9404, - "step": 2350 - }, - { - "epoch": 0.1358444854637748, - "grad_norm": 0.30167555809020996, - "learning_rate": 0.00019921922401348576, - "loss": 0.9631, - "step": 2355 - }, - { - "epoch": 0.13613290263036457, - "grad_norm": 0.2823658287525177, - "learning_rate": 0.00019920661723423183, - "loss": 0.9271, - "step": 2360 - }, - { - "epoch": 0.13642131979695432, - "grad_norm": 0.2752264142036438, - "learning_rate": 0.00019919390989624054, - "loss": 0.981, - "step": 2365 - }, - { - "epoch": 0.13670973696354408, - "grad_norm": 0.284186989068985, - "learning_rate": 0.00019918110201239247, - "loss": 1.0279, - "step": 2370 - }, - { - "epoch": 0.13699815413013383, - "grad_norm": 0.2601034343242645, - "learning_rate": 0.00019916819359567001, - "loss": 1.0219, - "step": 2375 - }, - { - "epoch": 0.1372865712967236, - "grad_norm": 0.3391975164413452, - "learning_rate": 0.00019915518465915758, - "loss": 0.9432, - "step": 2380 - }, - { - "epoch": 0.13757498846331334, - "grad_norm": 0.3057229816913605, - "learning_rate": 0.0001991420752160414, - "loss": 1.0415, - "step": 2385 - }, - { - "epoch": 0.1378634056299031, - "grad_norm": 0.2857256829738617, - "learning_rate": 0.00019912886527960954, - "loss": 0.9896, - "step": 2390 - }, - { - "epoch": 0.13815182279649285, - "grad_norm": 0.4211989641189575, - "learning_rate": 0.00019911555486325203, - "loss": 1.0471, - "step": 2395 - }, - { - "epoch": 0.1384402399630826, - "grad_norm": 0.26847025752067566, - "learning_rate": 0.0001991021439804607, - "loss": 1.0071, - "step": 2400 - }, - { - "epoch": 0.13872865712967236, - "grad_norm": 0.27097341418266296, - "learning_rate": 0.00019908863264482917, - "loss": 0.9493, - "step": 2405 - }, - { - "epoch": 0.13901707429626212, - "grad_norm": 0.2873136103153229, - "learning_rate": 0.00019907502087005297, - "loss": 1.0064, - "step": 2410 - }, - { - "epoch": 0.13930549146285187, - "grad_norm": 0.2804831564426422, - "learning_rate": 0.00019906130866992935, - "loss": 0.9483, - "step": 2415 - }, - { - "epoch": 0.13959390862944163, - "grad_norm": 0.27144983410835266, - "learning_rate": 0.00019904749605835742, - "loss": 0.9541, - "step": 2420 - }, - { - "epoch": 0.13988232579603138, - "grad_norm": 0.2791461944580078, - "learning_rate": 0.00019903358304933805, - "loss": 1.0228, - "step": 2425 - }, - { - "epoch": 0.14017074296262114, - "grad_norm": 0.2839184105396271, - "learning_rate": 0.00019901956965697387, - "loss": 0.9853, - "step": 2430 - }, - { - "epoch": 0.1404591601292109, - "grad_norm": 0.2938236594200134, - "learning_rate": 0.0001990054558954693, - "loss": 1.0175, - "step": 2435 - }, - { - "epoch": 0.14074757729580065, - "grad_norm": 0.26195093989372253, - "learning_rate": 0.00019899124177913041, - "loss": 0.9927, - "step": 2440 - }, - { - "epoch": 0.1410359944623904, - "grad_norm": 0.282997727394104, - "learning_rate": 0.0001989769273223651, - "loss": 0.9148, - "step": 2445 - }, - { - "epoch": 0.14132441162898016, - "grad_norm": 0.2869815230369568, - "learning_rate": 0.00019896251253968288, - "loss": 0.9978, - "step": 2450 - }, - { - "epoch": 0.1416128287955699, - "grad_norm": 0.30306002497673035, - "learning_rate": 0.000198947997445695, - "loss": 0.9793, - "step": 2455 - }, - { - "epoch": 0.14190124596215967, - "grad_norm": 0.2726587951183319, - "learning_rate": 0.0001989333820551144, - "loss": 0.8918, - "step": 2460 - }, - { - "epoch": 0.14218966312874942, - "grad_norm": 0.3028129041194916, - "learning_rate": 0.00019891866638275564, - "loss": 1.0184, - "step": 2465 - }, - { - "epoch": 0.14247808029533918, - "grad_norm": 0.27245384454727173, - "learning_rate": 0.00019890385044353501, - "loss": 0.9187, - "step": 2470 - }, - { - "epoch": 0.14276649746192893, - "grad_norm": 0.26684272289276123, - "learning_rate": 0.00019888893425247032, - "loss": 0.94, - "step": 2475 - }, - { - "epoch": 0.1430549146285187, - "grad_norm": 0.26761725544929504, - "learning_rate": 0.00019887391782468113, - "loss": 0.9606, - "step": 2480 - }, - { - "epoch": 0.14334333179510844, - "grad_norm": 0.2789659798145294, - "learning_rate": 0.00019885880117538846, - "loss": 0.9361, - "step": 2485 - }, - { - "epoch": 0.1436317489616982, - "grad_norm": 0.2568376362323761, - "learning_rate": 0.000198843584319915, - "loss": 1.0155, - "step": 2490 - }, - { - "epoch": 0.14392016612828795, - "grad_norm": 0.29699787497520447, - "learning_rate": 0.00019882826727368508, - "loss": 1.0136, - "step": 2495 - }, - { - "epoch": 0.1442085832948777, - "grad_norm": 0.3011142313480377, - "learning_rate": 0.0001988128500522244, - "loss": 0.9967, - "step": 2500 - }, - { - "epoch": 0.14449700046146746, - "grad_norm": 0.27386248111724854, - "learning_rate": 0.00019879733267116035, - "loss": 1.0263, - "step": 2505 - }, - { - "epoch": 0.14478541762805722, - "grad_norm": 0.31453463435173035, - "learning_rate": 0.00019878171514622187, - "loss": 0.9307, - "step": 2510 - }, - { - "epoch": 0.14507383479464697, - "grad_norm": 0.2672314941883087, - "learning_rate": 0.0001987659974932392, - "loss": 0.9441, - "step": 2515 - }, - { - "epoch": 0.14536225196123673, - "grad_norm": 0.2847091257572174, - "learning_rate": 0.00019875017972814435, - "loss": 0.9868, - "step": 2520 - }, - { - "epoch": 0.14565066912782648, - "grad_norm": 0.28868651390075684, - "learning_rate": 0.0001987342618669706, - "loss": 0.9296, - "step": 2525 - }, - { - "epoch": 0.14593908629441624, - "grad_norm": 0.29168251156806946, - "learning_rate": 0.00019871824392585276, - "loss": 0.9317, - "step": 2530 - }, - { - "epoch": 0.146227503461006, - "grad_norm": 0.2743743062019348, - "learning_rate": 0.00019870212592102711, - "loss": 1.0277, - "step": 2535 - }, - { - "epoch": 0.14651592062759575, - "grad_norm": 0.2812393605709076, - "learning_rate": 0.00019868590786883134, - "loss": 1.0553, - "step": 2540 - }, - { - "epoch": 0.1468043377941855, - "grad_norm": 0.2678181231021881, - "learning_rate": 0.00019866958978570452, - "loss": 0.8821, - "step": 2545 - }, - { - "epoch": 0.14709275496077526, - "grad_norm": 0.3037974238395691, - "learning_rate": 0.00019865317168818713, - "loss": 0.9625, - "step": 2550 - }, - { - "epoch": 0.147381172127365, - "grad_norm": 0.2820071578025818, - "learning_rate": 0.00019863665359292108, - "loss": 1.0259, - "step": 2555 - }, - { - "epoch": 0.14766958929395477, - "grad_norm": 0.2591807544231415, - "learning_rate": 0.0001986200355166495, - "loss": 0.9521, - "step": 2560 - }, - { - "epoch": 0.14795800646054452, - "grad_norm": 0.26036834716796875, - "learning_rate": 0.0001986033174762171, - "loss": 0.94, - "step": 2565 - }, - { - "epoch": 0.14824642362713428, - "grad_norm": 0.27297431230545044, - "learning_rate": 0.0001985864994885697, - "loss": 0.9859, - "step": 2570 - }, - { - "epoch": 0.14853484079372403, - "grad_norm": 0.27806761860847473, - "learning_rate": 0.00019856958157075445, - "loss": 1.0, - "step": 2575 - }, - { - "epoch": 0.1488232579603138, - "grad_norm": 0.2749041020870209, - "learning_rate": 0.00019855256373991993, - "loss": 0.9111, - "step": 2580 - }, - { - "epoch": 0.14911167512690354, - "grad_norm": 0.28046393394470215, - "learning_rate": 0.0001985354460133159, - "loss": 0.9089, - "step": 2585 - }, - { - "epoch": 0.1494000922934933, - "grad_norm": 0.2683013379573822, - "learning_rate": 0.00019851822840829338, - "loss": 0.9122, - "step": 2590 - }, - { - "epoch": 0.14968850946008305, - "grad_norm": 0.28444692492485046, - "learning_rate": 0.0001985009109423046, - "loss": 0.9987, - "step": 2595 - }, - { - "epoch": 0.1499769266266728, - "grad_norm": 0.28526070713996887, - "learning_rate": 0.0001984834936329031, - "loss": 1.0177, - "step": 2600 - }, - { - "epoch": 0.15026534379326256, - "grad_norm": 0.2751544415950775, - "learning_rate": 0.00019846597649774358, - "loss": 1.0602, - "step": 2605 - }, - { - "epoch": 0.15055376095985232, - "grad_norm": 0.29558390378952026, - "learning_rate": 0.00019844835955458193, - "loss": 1.0015, - "step": 2610 - }, - { - "epoch": 0.15084217812644207, - "grad_norm": 0.27498286962509155, - "learning_rate": 0.00019843064282127511, - "loss": 0.9561, - "step": 2615 - }, - { - "epoch": 0.15113059529303183, - "grad_norm": 0.292961061000824, - "learning_rate": 0.00019841282631578145, - "loss": 0.9914, - "step": 2620 - }, - { - "epoch": 0.1514190124596216, - "grad_norm": 0.3029356896877289, - "learning_rate": 0.0001983949100561602, - "loss": 0.9801, - "step": 2625 - }, - { - "epoch": 0.15170742962621137, - "grad_norm": 0.2864689230918884, - "learning_rate": 0.00019837689406057183, - "loss": 0.9578, - "step": 2630 - }, - { - "epoch": 0.15199584679280112, - "grad_norm": 0.2750813961029053, - "learning_rate": 0.00019835877834727787, - "loss": 0.9483, - "step": 2635 - }, - { - "epoch": 0.15228426395939088, - "grad_norm": 0.27926185727119446, - "learning_rate": 0.00019834056293464093, - "loss": 1.0165, - "step": 2640 - }, - { - "epoch": 0.15257268112598063, - "grad_norm": 0.27533864974975586, - "learning_rate": 0.00019832224784112473, - "loss": 1.0241, - "step": 2645 - }, - { - "epoch": 0.15286109829257039, - "grad_norm": 0.276993989944458, - "learning_rate": 0.00019830383308529393, - "loss": 1.0444, - "step": 2650 - }, - { - "epoch": 0.15314951545916014, - "grad_norm": 0.2960858643054962, - "learning_rate": 0.0001982853186858143, - "loss": 0.9928, - "step": 2655 - }, - { - "epoch": 0.1534379326257499, - "grad_norm": 0.29162392020225525, - "learning_rate": 0.00019826670466145262, - "loss": 0.8887, - "step": 2660 - }, - { - "epoch": 0.15372634979233965, - "grad_norm": 0.2606879472732544, - "learning_rate": 0.0001982479910310765, - "loss": 0.9832, - "step": 2665 - }, - { - "epoch": 0.1540147669589294, - "grad_norm": 0.29048001766204834, - "learning_rate": 0.00019822917781365474, - "loss": 1.01, - "step": 2670 - }, - { - "epoch": 0.15430318412551916, - "grad_norm": 0.2942920923233032, - "learning_rate": 0.00019821026502825687, - "loss": 1.0289, - "step": 2675 - }, - { - "epoch": 0.15459160129210892, - "grad_norm": 0.2862975597381592, - "learning_rate": 0.00019819125269405352, - "loss": 0.9961, - "step": 2680 - }, - { - "epoch": 0.15488001845869867, - "grad_norm": 0.2896837890148163, - "learning_rate": 0.00019817214083031614, - "loss": 1.0002, - "step": 2685 - }, - { - "epoch": 0.15516843562528843, - "grad_norm": 0.26825401186943054, - "learning_rate": 0.00019815292945641705, - "loss": 0.9874, - "step": 2690 - }, - { - "epoch": 0.15545685279187818, - "grad_norm": 0.2813914120197296, - "learning_rate": 0.00019813361859182945, - "loss": 0.9919, - "step": 2695 - }, - { - "epoch": 0.15574526995846794, - "grad_norm": 0.284069687128067, - "learning_rate": 0.0001981142082561274, - "loss": 0.8997, - "step": 2700 - }, - { - "epoch": 0.1560336871250577, - "grad_norm": 0.2858209013938904, - "learning_rate": 0.00019809469846898586, - "loss": 0.9546, - "step": 2705 - }, - { - "epoch": 0.15632210429164745, - "grad_norm": 0.2836093604564667, - "learning_rate": 0.0001980750892501804, - "loss": 0.9254, - "step": 2710 - }, - { - "epoch": 0.1566105214582372, - "grad_norm": 0.32628414034843445, - "learning_rate": 0.00019805538061958765, - "loss": 0.94, - "step": 2715 - }, - { - "epoch": 0.15689893862482696, - "grad_norm": 0.2873879373073578, - "learning_rate": 0.0001980355725971847, - "loss": 0.9598, - "step": 2720 - }, - { - "epoch": 0.1571873557914167, - "grad_norm": 0.27270689606666565, - "learning_rate": 0.00019801566520304963, - "loss": 0.9622, - "step": 2725 - }, - { - "epoch": 0.15747577295800647, - "grad_norm": 0.25972458720207214, - "learning_rate": 0.0001979956584573612, - "loss": 0.9895, - "step": 2730 - }, - { - "epoch": 0.15776419012459622, - "grad_norm": 0.2917114198207855, - "learning_rate": 0.00019797555238039872, - "loss": 0.9528, - "step": 2735 - }, - { - "epoch": 0.15805260729118598, - "grad_norm": 0.26294592022895813, - "learning_rate": 0.00019795534699254238, - "loss": 0.9309, - "step": 2740 - }, - { - "epoch": 0.15834102445777573, - "grad_norm": 0.28122779726982117, - "learning_rate": 0.0001979350423142729, - "loss": 0.9853, - "step": 2745 - }, - { - "epoch": 0.15862944162436549, - "grad_norm": 0.29183605313301086, - "learning_rate": 0.00019791463836617176, - "loss": 0.9382, - "step": 2750 - }, - { - "epoch": 0.15891785879095524, - "grad_norm": 0.28074556589126587, - "learning_rate": 0.00019789413516892098, - "loss": 1.01, - "step": 2755 - }, - { - "epoch": 0.159206275957545, - "grad_norm": 0.2814944088459015, - "learning_rate": 0.00019787353274330313, - "loss": 1.0161, - "step": 2760 - }, - { - "epoch": 0.15949469312413475, - "grad_norm": 0.2898254990577698, - "learning_rate": 0.00019785283111020156, - "loss": 1.0388, - "step": 2765 - }, - { - "epoch": 0.1597831102907245, - "grad_norm": 0.2777402400970459, - "learning_rate": 0.00019783203029059997, - "loss": 0.9589, - "step": 2770 - }, - { - "epoch": 0.16007152745731426, - "grad_norm": 0.2646116316318512, - "learning_rate": 0.00019781113030558267, - "loss": 0.9569, - "step": 2775 - }, - { - "epoch": 0.16035994462390402, - "grad_norm": 0.3243483304977417, - "learning_rate": 0.00019779013117633454, - "loss": 0.9622, - "step": 2780 - }, - { - "epoch": 0.16064836179049377, - "grad_norm": 0.2765612304210663, - "learning_rate": 0.0001977690329241409, - "loss": 1.0068, - "step": 2785 - }, - { - "epoch": 0.16093677895708353, - "grad_norm": 0.30408522486686707, - "learning_rate": 0.00019774783557038755, - "loss": 0.969, - "step": 2790 - }, - { - "epoch": 0.16122519612367328, - "grad_norm": 0.26990190148353577, - "learning_rate": 0.00019772653913656076, - "loss": 1.025, - "step": 2795 - }, - { - "epoch": 0.16151361329026304, - "grad_norm": 0.31291985511779785, - "learning_rate": 0.00019770514364424725, - "loss": 1.0174, - "step": 2800 - }, - { - "epoch": 0.1618020304568528, - "grad_norm": 0.31198903918266296, - "learning_rate": 0.00019768364911513405, - "loss": 0.9603, - "step": 2805 - }, - { - "epoch": 0.16209044762344255, - "grad_norm": 0.28119274973869324, - "learning_rate": 0.00019766205557100868, - "loss": 0.9689, - "step": 2810 - }, - { - "epoch": 0.1623788647900323, - "grad_norm": 0.27684643864631653, - "learning_rate": 0.000197640363033759, - "loss": 0.9272, - "step": 2815 - }, - { - "epoch": 0.16266728195662206, - "grad_norm": 0.2740548253059387, - "learning_rate": 0.0001976185715253732, - "loss": 1.0165, - "step": 2820 - }, - { - "epoch": 0.1629556991232118, - "grad_norm": 0.3126582205295563, - "learning_rate": 0.00019759668106793975, - "loss": 0.9915, - "step": 2825 - }, - { - "epoch": 0.16324411628980157, - "grad_norm": 0.27744656801223755, - "learning_rate": 0.0001975746916836475, - "loss": 0.9971, - "step": 2830 - }, - { - "epoch": 0.16353253345639132, - "grad_norm": 0.280280202627182, - "learning_rate": 0.00019755260339478556, - "loss": 0.9637, - "step": 2835 - }, - { - "epoch": 0.16382095062298108, - "grad_norm": 0.2840816378593445, - "learning_rate": 0.0001975304162237432, - "loss": 0.9603, - "step": 2840 - }, - { - "epoch": 0.16410936778957083, - "grad_norm": 0.2826577126979828, - "learning_rate": 0.00019750813019301004, - "loss": 1.0331, - "step": 2845 - }, - { - "epoch": 0.1643977849561606, - "grad_norm": 0.2963692545890808, - "learning_rate": 0.00019748574532517586, - "loss": 0.999, - "step": 2850 - }, - { - "epoch": 0.16468620212275034, - "grad_norm": 0.2895634174346924, - "learning_rate": 0.00019746326164293056, - "loss": 0.9637, - "step": 2855 - }, - { - "epoch": 0.1649746192893401, - "grad_norm": 0.287422776222229, - "learning_rate": 0.0001974406791690643, - "loss": 0.9696, - "step": 2860 - }, - { - "epoch": 0.16526303645592985, - "grad_norm": 0.31378328800201416, - "learning_rate": 0.00019741799792646734, - "loss": 1.0066, - "step": 2865 - }, - { - "epoch": 0.1655514536225196, - "grad_norm": 0.28587618470191956, - "learning_rate": 0.00019739521793813006, - "loss": 0.9224, - "step": 2870 - }, - { - "epoch": 0.16583987078910936, - "grad_norm": 0.28385454416275024, - "learning_rate": 0.0001973723392271429, - "loss": 0.9961, - "step": 2875 - }, - { - "epoch": 0.16612828795569912, - "grad_norm": 0.27586954832077026, - "learning_rate": 0.00019734936181669638, - "loss": 1.065, - "step": 2880 - }, - { - "epoch": 0.16641670512228887, - "grad_norm": 0.30055347084999084, - "learning_rate": 0.00019732628573008114, - "loss": 1.0089, - "step": 2885 - }, - { - "epoch": 0.16670512228887863, - "grad_norm": 0.30119630694389343, - "learning_rate": 0.00019730311099068771, - "loss": 1.017, - "step": 2890 - }, - { - "epoch": 0.16699353945546838, - "grad_norm": 0.29206573963165283, - "learning_rate": 0.00019727983762200677, - "loss": 0.9635, - "step": 2895 - }, - { - "epoch": 0.16728195662205814, - "grad_norm": 0.2570163905620575, - "learning_rate": 0.00019725646564762878, - "loss": 0.9791, - "step": 2900 - }, - { - "epoch": 0.1675703737886479, - "grad_norm": 0.3360570967197418, - "learning_rate": 0.00019723299509124433, - "loss": 0.9498, - "step": 2905 - }, - { - "epoch": 0.16785879095523765, - "grad_norm": 0.29323843121528625, - "learning_rate": 0.00019720942597664385, - "loss": 0.986, - "step": 2910 - }, - { - "epoch": 0.1681472081218274, - "grad_norm": 0.30418166518211365, - "learning_rate": 0.00019718575832771768, - "loss": 0.9756, - "step": 2915 - }, - { - "epoch": 0.16843562528841716, - "grad_norm": 0.31183257699012756, - "learning_rate": 0.00019716199216845604, - "loss": 0.9997, - "step": 2920 - }, - { - "epoch": 0.1687240424550069, - "grad_norm": 0.26834046840667725, - "learning_rate": 0.000197138127522949, - "loss": 0.9315, - "step": 2925 - }, - { - "epoch": 0.16901245962159667, - "grad_norm": 0.27434879541397095, - "learning_rate": 0.00019711416441538652, - "loss": 1.0105, - "step": 2930 - }, - { - "epoch": 0.16930087678818642, - "grad_norm": 0.28828758001327515, - "learning_rate": 0.00019709010287005825, - "loss": 1.0128, - "step": 2935 - }, - { - "epoch": 0.16958929395477618, - "grad_norm": 0.2850480079650879, - "learning_rate": 0.00019706594291135366, - "loss": 0.9618, - "step": 2940 - }, - { - "epoch": 0.16987771112136593, - "grad_norm": 0.2937301993370056, - "learning_rate": 0.00019704168456376205, - "loss": 1.0175, - "step": 2945 - }, - { - "epoch": 0.1701661282879557, - "grad_norm": 0.28153088688850403, - "learning_rate": 0.0001970173278518724, - "loss": 0.9541, - "step": 2950 - }, - { - "epoch": 0.17045454545454544, - "grad_norm": 0.2839425802230835, - "learning_rate": 0.00019699287280037332, - "loss": 1.0139, - "step": 2955 - }, - { - "epoch": 0.1707429626211352, - "grad_norm": 0.28864094614982605, - "learning_rate": 0.00019696831943405324, - "loss": 1.0833, - "step": 2960 - }, - { - "epoch": 0.17103137978772495, - "grad_norm": 0.2697494626045227, - "learning_rate": 0.0001969436677778001, - "loss": 0.9827, - "step": 2965 - }, - { - "epoch": 0.1713197969543147, - "grad_norm": 0.2844550907611847, - "learning_rate": 0.0001969189178566016, - "loss": 1.005, - "step": 2970 - }, - { - "epoch": 0.1716082141209045, - "grad_norm": 0.30949264764785767, - "learning_rate": 0.000196894069695545, - "loss": 0.9696, - "step": 2975 - }, - { - "epoch": 0.17189663128749424, - "grad_norm": 0.2768407464027405, - "learning_rate": 0.00019686912331981702, - "loss": 0.9931, - "step": 2980 - }, - { - "epoch": 0.172185048454084, - "grad_norm": 0.28683245182037354, - "learning_rate": 0.00019684407875470415, - "loss": 1.0018, - "step": 2985 - }, - { - "epoch": 0.17247346562067375, - "grad_norm": 0.3155616223812103, - "learning_rate": 0.00019681893602559224, - "loss": 0.9813, - "step": 2990 - }, - { - "epoch": 0.1727618827872635, - "grad_norm": 0.3154447376728058, - "learning_rate": 0.0001967936951579667, - "loss": 0.9915, - "step": 2995 - }, - { - "epoch": 0.17305029995385326, - "grad_norm": 0.277576744556427, - "learning_rate": 0.00019676835617741249, - "loss": 0.9668, - "step": 3000 - }, - { - "epoch": 0.17333871712044302, - "grad_norm": 0.28618210554122925, - "learning_rate": 0.0001967429191096138, - "loss": 0.9745, - "step": 3005 - }, - { - "epoch": 0.17362713428703277, - "grad_norm": 0.27911707758903503, - "learning_rate": 0.0001967173839803545, - "loss": 0.9732, - "step": 3010 - }, - { - "epoch": 0.17391555145362253, - "grad_norm": 0.28373172879219055, - "learning_rate": 0.00019669175081551773, - "loss": 0.9797, - "step": 3015 - }, - { - "epoch": 0.17420396862021229, - "grad_norm": 0.29749229550361633, - "learning_rate": 0.00019666601964108598, - "loss": 0.94, - "step": 3020 - }, - { - "epoch": 0.17449238578680204, - "grad_norm": 0.31651487946510315, - "learning_rate": 0.00019664019048314116, - "loss": 0.9829, - "step": 3025 - }, - { - "epoch": 0.1747808029533918, - "grad_norm": 0.2834007740020752, - "learning_rate": 0.00019661426336786445, - "loss": 0.9336, - "step": 3030 - }, - { - "epoch": 0.17506922011998155, - "grad_norm": 0.2876712381839752, - "learning_rate": 0.00019658823832153632, - "loss": 0.9174, - "step": 3035 - }, - { - "epoch": 0.1753576372865713, - "grad_norm": 0.3259499669075012, - "learning_rate": 0.00019656211537053654, - "loss": 1.0362, - "step": 3040 - }, - { - "epoch": 0.17564605445316106, - "grad_norm": 0.26136502623558044, - "learning_rate": 0.00019653589454134406, - "loss": 0.9399, - "step": 3045 - }, - { - "epoch": 0.17593447161975082, - "grad_norm": 0.28630778193473816, - "learning_rate": 0.00019650957586053716, - "loss": 0.9861, - "step": 3050 - }, - { - "epoch": 0.17622288878634057, - "grad_norm": 0.2615172266960144, - "learning_rate": 0.00019648315935479315, - "loss": 1.0378, - "step": 3055 - }, - { - "epoch": 0.17651130595293033, - "grad_norm": 0.28133901953697205, - "learning_rate": 0.00019645664505088864, - "loss": 0.9746, - "step": 3060 - }, - { - "epoch": 0.17679972311952008, - "grad_norm": 0.3203901946544647, - "learning_rate": 0.00019643003297569923, - "loss": 0.9894, - "step": 3065 - }, - { - "epoch": 0.17708814028610984, - "grad_norm": 0.2845044434070587, - "learning_rate": 0.00019640332315619977, - "loss": 1.0024, - "step": 3070 - }, - { - "epoch": 0.1773765574526996, - "grad_norm": 0.28776776790618896, - "learning_rate": 0.0001963765156194641, - "loss": 1.0035, - "step": 3075 - }, - { - "epoch": 0.17766497461928935, - "grad_norm": 0.2923831343650818, - "learning_rate": 0.00019634961039266506, - "loss": 1.0253, - "step": 3080 - }, - { - "epoch": 0.1779533917858791, - "grad_norm": 0.29954782128334045, - "learning_rate": 0.00019632260750307467, - "loss": 0.9984, - "step": 3085 - }, - { - "epoch": 0.17824180895246886, - "grad_norm": 0.30335840582847595, - "learning_rate": 0.0001962955069780638, - "loss": 0.9339, - "step": 3090 - }, - { - "epoch": 0.1785302261190586, - "grad_norm": 0.28872916102409363, - "learning_rate": 0.00019626830884510236, - "loss": 1.0417, - "step": 3095 - }, - { - "epoch": 0.17881864328564837, - "grad_norm": 0.3210926949977875, - "learning_rate": 0.00019624101313175918, - "loss": 1.0293, - "step": 3100 - }, - { - "epoch": 0.17910706045223812, - "grad_norm": 0.29229721426963806, - "learning_rate": 0.00019621361986570194, - "loss": 0.9386, - "step": 3105 - }, - { - "epoch": 0.17939547761882788, - "grad_norm": 0.3137836754322052, - "learning_rate": 0.00019618612907469732, - "loss": 0.9874, - "step": 3110 - }, - { - "epoch": 0.17968389478541763, - "grad_norm": 0.27663466334342957, - "learning_rate": 0.00019615854078661077, - "loss": 0.9902, - "step": 3115 - }, - { - "epoch": 0.17997231195200739, - "grad_norm": 0.30164676904678345, - "learning_rate": 0.00019613085502940658, - "loss": 1.1187, - "step": 3120 - }, - { - "epoch": 0.18026072911859714, - "grad_norm": 0.2817506790161133, - "learning_rate": 0.00019610307183114787, - "loss": 0.9643, - "step": 3125 - }, - { - "epoch": 0.1805491462851869, - "grad_norm": 0.28451189398765564, - "learning_rate": 0.00019607519121999647, - "loss": 0.9553, - "step": 3130 - }, - { - "epoch": 0.18083756345177665, - "grad_norm": 0.3148361146450043, - "learning_rate": 0.00019604721322421303, - "loss": 0.9596, - "step": 3135 - }, - { - "epoch": 0.1811259806183664, - "grad_norm": 0.3131537437438965, - "learning_rate": 0.00019601913787215683, - "loss": 0.9841, - "step": 3140 - }, - { - "epoch": 0.18141439778495616, - "grad_norm": 0.301500141620636, - "learning_rate": 0.00019599096519228585, - "loss": 0.9387, - "step": 3145 - }, - { - "epoch": 0.18170281495154592, - "grad_norm": 0.2999275028705597, - "learning_rate": 0.0001959626952131568, - "loss": 0.8649, - "step": 3150 - }, - { - "epoch": 0.18199123211813567, - "grad_norm": 0.3055667281150818, - "learning_rate": 0.00019593432796342496, - "loss": 1.0364, - "step": 3155 - }, - { - "epoch": 0.18227964928472543, - "grad_norm": 0.30451443791389465, - "learning_rate": 0.00019590586347184417, - "loss": 1.0552, - "step": 3160 - }, - { - "epoch": 0.18256806645131518, - "grad_norm": 0.3046397566795349, - "learning_rate": 0.00019587730176726686, - "loss": 0.9897, - "step": 3165 - }, - { - "epoch": 0.18285648361790494, - "grad_norm": 0.3132875859737396, - "learning_rate": 0.00019584864287864408, - "loss": 0.953, - "step": 3170 - }, - { - "epoch": 0.1831449007844947, - "grad_norm": 0.2684531807899475, - "learning_rate": 0.00019581988683502525, - "loss": 1.0479, - "step": 3175 - }, - { - "epoch": 0.18343331795108445, - "grad_norm": 0.3220478594303131, - "learning_rate": 0.0001957910336655584, - "loss": 0.9818, - "step": 3180 - }, - { - "epoch": 0.1837217351176742, - "grad_norm": 0.29744499921798706, - "learning_rate": 0.00019576208339948988, - "loss": 0.985, - "step": 3185 - }, - { - "epoch": 0.18401015228426396, - "grad_norm": 0.26757848262786865, - "learning_rate": 0.00019573303606616459, - "loss": 0.9966, - "step": 3190 - }, - { - "epoch": 0.1842985694508537, - "grad_norm": 0.2966987192630768, - "learning_rate": 0.00019570389169502569, - "loss": 0.9853, - "step": 3195 - }, - { - "epoch": 0.18458698661744347, - "grad_norm": 0.2907325327396393, - "learning_rate": 0.00019567465031561487, - "loss": 1.0468, - "step": 3200 - }, - { - "epoch": 0.18487540378403322, - "grad_norm": 0.2841055989265442, - "learning_rate": 0.00019564531195757193, - "loss": 0.9837, - "step": 3205 - }, - { - "epoch": 0.18516382095062298, - "grad_norm": 0.2998584806919098, - "learning_rate": 0.0001956158766506352, - "loss": 1.0282, - "step": 3210 - }, - { - "epoch": 0.18545223811721273, - "grad_norm": 0.3043042719364166, - "learning_rate": 0.00019558634442464113, - "loss": 0.911, - "step": 3215 - }, - { - "epoch": 0.18574065528380249, - "grad_norm": 0.30067190527915955, - "learning_rate": 0.00019555671530952445, - "loss": 0.9701, - "step": 3220 - }, - { - "epoch": 0.18602907245039224, - "grad_norm": 0.297343373298645, - "learning_rate": 0.00019552698933531808, - "loss": 0.9935, - "step": 3225 - }, - { - "epoch": 0.186317489616982, - "grad_norm": 0.2842741310596466, - "learning_rate": 0.00019549716653215318, - "loss": 0.999, - "step": 3230 - }, - { - "epoch": 0.18660590678357175, - "grad_norm": 0.27844905853271484, - "learning_rate": 0.00019546724693025896, - "loss": 0.9668, - "step": 3235 - }, - { - "epoch": 0.1868943239501615, - "grad_norm": 0.29974377155303955, - "learning_rate": 0.00019543723055996282, - "loss": 0.9864, - "step": 3240 - }, - { - "epoch": 0.18718274111675126, - "grad_norm": 0.2982295751571655, - "learning_rate": 0.0001954071174516903, - "loss": 0.9902, - "step": 3245 - }, - { - "epoch": 0.18747115828334102, - "grad_norm": 0.3086935579776764, - "learning_rate": 0.00019537690763596487, - "loss": 0.9954, - "step": 3250 - }, - { - "epoch": 0.18775957544993077, - "grad_norm": 0.28824785351753235, - "learning_rate": 0.0001953466011434081, - "loss": 0.9979, - "step": 3255 - }, - { - "epoch": 0.18804799261652053, - "grad_norm": 0.2743071913719177, - "learning_rate": 0.00019531619800473952, - "loss": 0.9299, - "step": 3260 - }, - { - "epoch": 0.18833640978311028, - "grad_norm": 0.2896062433719635, - "learning_rate": 0.00019528569825077668, - "loss": 0.9861, - "step": 3265 - }, - { - "epoch": 0.18862482694970004, - "grad_norm": 0.29393669962882996, - "learning_rate": 0.00019525510191243498, - "loss": 1.0792, - "step": 3270 - }, - { - "epoch": 0.1889132441162898, - "grad_norm": 0.3489181399345398, - "learning_rate": 0.00019522440902072782, - "loss": 1.0056, - "step": 3275 - }, - { - "epoch": 0.18920166128287955, - "grad_norm": 0.31945231556892395, - "learning_rate": 0.0001951936196067664, - "loss": 1.0386, - "step": 3280 - }, - { - "epoch": 0.1894900784494693, - "grad_norm": 0.30114686489105225, - "learning_rate": 0.00019516273370175972, - "loss": 0.9667, - "step": 3285 - }, - { - "epoch": 0.18977849561605906, - "grad_norm": 0.3653857409954071, - "learning_rate": 0.00019513175133701474, - "loss": 0.9465, - "step": 3290 - }, - { - "epoch": 0.1900669127826488, - "grad_norm": 0.2919418513774872, - "learning_rate": 0.000195100672543936, - "loss": 0.9252, - "step": 3295 - }, - { - "epoch": 0.19035532994923857, - "grad_norm": 0.29241377115249634, - "learning_rate": 0.00019506949735402588, - "loss": 0.929, - "step": 3300 - }, - { - "epoch": 0.19064374711582832, - "grad_norm": 0.30068260431289673, - "learning_rate": 0.00019503822579888453, - "loss": 1.0254, - "step": 3305 - }, - { - "epoch": 0.19093216428241808, - "grad_norm": 0.2954903542995453, - "learning_rate": 0.00019500685791020968, - "loss": 0.9485, - "step": 3310 - }, - { - "epoch": 0.19122058144900783, - "grad_norm": 0.2899206876754761, - "learning_rate": 0.00019497539371979674, - "loss": 1.036, - "step": 3315 - }, - { - "epoch": 0.1915089986155976, - "grad_norm": 0.3165214955806732, - "learning_rate": 0.00019494383325953875, - "loss": 0.9616, - "step": 3320 - }, - { - "epoch": 0.19179741578218737, - "grad_norm": 0.3250178396701813, - "learning_rate": 0.0001949121765614263, - "loss": 0.9648, - "step": 3325 - }, - { - "epoch": 0.19208583294877712, - "grad_norm": 0.2635006904602051, - "learning_rate": 0.00019488042365754758, - "loss": 0.9789, - "step": 3330 - }, - { - "epoch": 0.19237425011536688, - "grad_norm": 0.2964721620082855, - "learning_rate": 0.0001948485745800882, - "loss": 0.9432, - "step": 3335 - }, - { - "epoch": 0.19266266728195663, - "grad_norm": 0.2993474006652832, - "learning_rate": 0.0001948166293613314, - "loss": 0.9556, - "step": 3340 - }, - { - "epoch": 0.1929510844485464, - "grad_norm": 0.28304216265678406, - "learning_rate": 0.00019478458803365772, - "loss": 0.9445, - "step": 3345 - }, - { - "epoch": 0.19323950161513614, - "grad_norm": 0.2697024941444397, - "learning_rate": 0.00019475245062954523, - "loss": 1.0552, - "step": 3350 - }, - { - "epoch": 0.1935279187817259, - "grad_norm": 0.2875863015651703, - "learning_rate": 0.00019472021718156937, - "loss": 0.9319, - "step": 3355 - }, - { - "epoch": 0.19381633594831565, - "grad_norm": 0.3006811738014221, - "learning_rate": 0.00019468788772240286, - "loss": 1.0049, - "step": 3360 - }, - { - "epoch": 0.1941047531149054, - "grad_norm": 0.30004388093948364, - "learning_rate": 0.0001946554622848158, - "loss": 1.0181, - "step": 3365 - }, - { - "epoch": 0.19439317028149516, - "grad_norm": 0.3029836118221283, - "learning_rate": 0.00019462294090167554, - "loss": 1.045, - "step": 3370 - }, - { - "epoch": 0.19468158744808492, - "grad_norm": 0.2854270339012146, - "learning_rate": 0.00019459032360594677, - "loss": 0.9876, - "step": 3375 - }, - { - "epoch": 0.19497000461467467, - "grad_norm": 0.3001527786254883, - "learning_rate": 0.0001945576104306913, - "loss": 0.9083, - "step": 3380 - }, - { - "epoch": 0.19525842178126443, - "grad_norm": 0.2907600700855255, - "learning_rate": 0.00019452480140906819, - "loss": 0.9734, - "step": 3385 - }, - { - "epoch": 0.19554683894785418, - "grad_norm": 0.2804548442363739, - "learning_rate": 0.00019449189657433358, - "loss": 1.0032, - "step": 3390 - }, - { - "epoch": 0.19583525611444394, - "grad_norm": 0.29847756028175354, - "learning_rate": 0.0001944588959598408, - "loss": 0.9485, - "step": 3395 - }, - { - "epoch": 0.1961236732810337, - "grad_norm": 0.28965532779693604, - "learning_rate": 0.00019442579959904024, - "loss": 0.9713, - "step": 3400 - }, - { - "epoch": 0.19641209044762345, - "grad_norm": 0.295213520526886, - "learning_rate": 0.00019439260752547935, - "loss": 0.9486, - "step": 3405 - }, - { - "epoch": 0.1967005076142132, - "grad_norm": 0.2934512794017792, - "learning_rate": 0.0001943593197728026, - "loss": 1.0448, - "step": 3410 - }, - { - "epoch": 0.19698892478080296, - "grad_norm": 0.29289090633392334, - "learning_rate": 0.00019432593637475138, - "loss": 0.9959, - "step": 3415 - }, - { - "epoch": 0.19727734194739271, - "grad_norm": 0.2757977545261383, - "learning_rate": 0.00019429245736516415, - "loss": 0.9612, - "step": 3420 - }, - { - "epoch": 0.19756575911398247, - "grad_norm": 0.28514814376831055, - "learning_rate": 0.00019425888277797615, - "loss": 1.0246, - "step": 3425 - }, - { - "epoch": 0.19785417628057222, - "grad_norm": 0.32380256056785583, - "learning_rate": 0.00019422521264721962, - "loss": 0.9404, - "step": 3430 - }, - { - "epoch": 0.19814259344716198, - "grad_norm": 0.28507691621780396, - "learning_rate": 0.0001941914470070236, - "loss": 0.8902, - "step": 3435 - }, - { - "epoch": 0.19843101061375173, - "grad_norm": 0.3757873773574829, - "learning_rate": 0.00019415758589161385, - "loss": 1.0038, - "step": 3440 - }, - { - "epoch": 0.1987194277803415, - "grad_norm": 0.3061589300632477, - "learning_rate": 0.00019412362933531307, - "loss": 0.8961, - "step": 3445 - }, - { - "epoch": 0.19900784494693124, - "grad_norm": 0.29617950320243835, - "learning_rate": 0.0001940895773725406, - "loss": 0.9573, - "step": 3450 - }, - { - "epoch": 0.199296262113521, - "grad_norm": 0.27990731596946716, - "learning_rate": 0.00019405543003781251, - "loss": 1.044, - "step": 3455 - }, - { - "epoch": 0.19958467928011075, - "grad_norm": 0.29822319746017456, - "learning_rate": 0.00019402118736574155, - "loss": 0.9799, - "step": 3460 - }, - { - "epoch": 0.1998730964467005, - "grad_norm": 0.3118431866168976, - "learning_rate": 0.00019398684939103707, - "loss": 1.0417, - "step": 3465 - }, - { - "epoch": 0.20016151361329027, - "grad_norm": 0.3202954828739166, - "learning_rate": 0.00019395241614850504, - "loss": 0.9731, - "step": 3470 - }, - { - "epoch": 0.20044993077988002, - "grad_norm": 0.3098292052745819, - "learning_rate": 0.00019391788767304804, - "loss": 0.985, - "step": 3475 - }, - { - "epoch": 0.20073834794646978, - "grad_norm": 0.2931598722934723, - "learning_rate": 0.00019388326399966515, - "loss": 1.0129, - "step": 3480 - }, - { - "epoch": 0.20102676511305953, - "grad_norm": 0.2935352027416229, - "learning_rate": 0.0001938485451634519, - "loss": 0.9402, - "step": 3485 - }, - { - "epoch": 0.20131518227964929, - "grad_norm": 0.3236974775791168, - "learning_rate": 0.00019381373119960033, - "loss": 1.0507, - "step": 3490 - }, - { - "epoch": 0.20160359944623904, - "grad_norm": 0.3834960162639618, - "learning_rate": 0.00019377882214339893, - "loss": 0.9554, - "step": 3495 - }, - { - "epoch": 0.2018920166128288, - "grad_norm": 0.2892552316188812, - "learning_rate": 0.00019374381803023252, - "loss": 1.0119, - "step": 3500 - }, - { - "epoch": 0.20218043377941855, - "grad_norm": 0.29538676142692566, - "learning_rate": 0.0001937087188955823, - "loss": 0.9977, - "step": 3505 - }, - { - "epoch": 0.2024688509460083, - "grad_norm": 0.2964411973953247, - "learning_rate": 0.00019367352477502576, - "loss": 0.9636, - "step": 3510 - }, - { - "epoch": 0.20275726811259806, - "grad_norm": 0.3167349696159363, - "learning_rate": 0.00019363823570423675, - "loss": 0.9345, - "step": 3515 - }, - { - "epoch": 0.20304568527918782, - "grad_norm": 0.3199044466018677, - "learning_rate": 0.0001936028517189852, - "loss": 0.913, - "step": 3520 - }, - { - "epoch": 0.20333410244577757, - "grad_norm": 0.27600806951522827, - "learning_rate": 0.00019356737285513748, - "loss": 0.959, - "step": 3525 - }, - { - "epoch": 0.20362251961236733, - "grad_norm": 0.31621217727661133, - "learning_rate": 0.00019353179914865596, - "loss": 1.0437, - "step": 3530 - }, - { - "epoch": 0.20391093677895708, - "grad_norm": 0.30049943923950195, - "learning_rate": 0.00019349613063559916, - "loss": 0.9675, - "step": 3535 - }, - { - "epoch": 0.20419935394554684, - "grad_norm": 0.3039463460445404, - "learning_rate": 0.00019346036735212177, - "loss": 1.0542, - "step": 3540 - }, - { - "epoch": 0.2044877711121366, - "grad_norm": 0.3049977123737335, - "learning_rate": 0.00019342450933447448, - "loss": 0.8974, - "step": 3545 - }, - { - "epoch": 0.20477618827872635, - "grad_norm": 0.2853706181049347, - "learning_rate": 0.00019338855661900405, - "loss": 0.9711, - "step": 3550 - }, - { - "epoch": 0.2050646054453161, - "grad_norm": 0.2970394492149353, - "learning_rate": 0.00019335250924215318, - "loss": 0.9516, - "step": 3555 - }, - { - "epoch": 0.20535302261190586, - "grad_norm": 0.3310398459434509, - "learning_rate": 0.00019331636724046058, - "loss": 0.9293, - "step": 3560 - }, - { - "epoch": 0.2056414397784956, - "grad_norm": 0.2932792901992798, - "learning_rate": 0.0001932801306505608, - "loss": 1.0088, - "step": 3565 - }, - { - "epoch": 0.20592985694508537, - "grad_norm": 0.3343851566314697, - "learning_rate": 0.00019324379950918437, - "loss": 1.0363, - "step": 3570 - }, - { - "epoch": 0.20621827411167512, - "grad_norm": 0.30094677209854126, - "learning_rate": 0.00019320737385315756, - "loss": 1.0072, - "step": 3575 - }, - { - "epoch": 0.20650669127826488, - "grad_norm": 0.28837206959724426, - "learning_rate": 0.00019317085371940246, - "loss": 0.9139, - "step": 3580 - }, - { - "epoch": 0.20679510844485463, - "grad_norm": 0.29000407457351685, - "learning_rate": 0.00019313423914493703, - "loss": 0.9431, - "step": 3585 - }, - { - "epoch": 0.20708352561144439, - "grad_norm": 0.28823748230934143, - "learning_rate": 0.00019309753016687477, - "loss": 0.9281, - "step": 3590 - }, - { - "epoch": 0.20737194277803414, - "grad_norm": 0.30797070264816284, - "learning_rate": 0.00019306072682242505, - "loss": 0.9611, - "step": 3595 - }, - { - "epoch": 0.2076603599446239, - "grad_norm": 0.2971121370792389, - "learning_rate": 0.00019302382914889284, - "loss": 1.0199, - "step": 3600 - }, - { - "epoch": 0.20794877711121365, - "grad_norm": 0.2938947081565857, - "learning_rate": 0.00019298683718367864, - "loss": 0.9275, - "step": 3605 - }, - { - "epoch": 0.2082371942778034, - "grad_norm": 0.3001919686794281, - "learning_rate": 0.00019294975096427862, - "loss": 0.9963, - "step": 3610 - }, - { - "epoch": 0.20852561144439316, - "grad_norm": 0.3122607469558716, - "learning_rate": 0.00019291257052828447, - "loss": 1.0458, - "step": 3615 - }, - { - "epoch": 0.20881402861098292, - "grad_norm": 0.2895052433013916, - "learning_rate": 0.00019287529591338333, - "loss": 0.9592, - "step": 3620 - }, - { - "epoch": 0.20910244577757267, - "grad_norm": 0.2828371822834015, - "learning_rate": 0.0001928379271573579, - "loss": 0.9518, - "step": 3625 - }, - { - "epoch": 0.20939086294416243, - "grad_norm": 0.30132856965065, - "learning_rate": 0.0001928004642980862, - "loss": 0.9374, - "step": 3630 - }, - { - "epoch": 0.20967928011075218, - "grad_norm": 0.4656534194946289, - "learning_rate": 0.0001927629073735417, - "loss": 0.9824, - "step": 3635 - }, - { - "epoch": 0.20996769727734194, - "grad_norm": 0.2774214744567871, - "learning_rate": 0.00019272525642179323, - "loss": 0.9528, - "step": 3640 - }, - { - "epoch": 0.2102561144439317, - "grad_norm": 0.2919476330280304, - "learning_rate": 0.00019268751148100486, - "loss": 0.9404, - "step": 3645 - }, - { - "epoch": 0.21054453161052145, - "grad_norm": 0.3007878065109253, - "learning_rate": 0.00019264967258943595, - "loss": 0.96, - "step": 3650 - }, - { - "epoch": 0.2108329487771112, - "grad_norm": 0.30731719732284546, - "learning_rate": 0.0001926117397854412, - "loss": 0.9321, - "step": 3655 - }, - { - "epoch": 0.21112136594370096, - "grad_norm": 0.32939255237579346, - "learning_rate": 0.0001925737131074703, - "loss": 1.0182, - "step": 3660 - }, - { - "epoch": 0.2114097831102907, - "grad_norm": 0.29776227474212646, - "learning_rate": 0.0001925355925940683, - "loss": 1.0224, - "step": 3665 - }, - { - "epoch": 0.2116982002768805, - "grad_norm": 0.3057902753353119, - "learning_rate": 0.00019249737828387522, - "loss": 0.9812, - "step": 3670 - }, - { - "epoch": 0.21198661744347025, - "grad_norm": 0.3011026382446289, - "learning_rate": 0.0001924590702156262, - "loss": 0.9753, - "step": 3675 - }, - { - "epoch": 0.21227503461006, - "grad_norm": 0.2978782653808594, - "learning_rate": 0.00019242066842815146, - "loss": 1.0129, - "step": 3680 - }, - { - "epoch": 0.21256345177664976, - "grad_norm": 0.2966994047164917, - "learning_rate": 0.00019238217296037614, - "loss": 1.0068, - "step": 3685 - }, - { - "epoch": 0.21285186894323951, - "grad_norm": 0.2818816602230072, - "learning_rate": 0.00019234358385132038, - "loss": 1.0062, - "step": 3690 - }, - { - "epoch": 0.21314028610982927, - "grad_norm": 0.280269980430603, - "learning_rate": 0.00019230490114009928, - "loss": 0.9392, - "step": 3695 - }, - { - "epoch": 0.21342870327641902, - "grad_norm": 0.29371026158332825, - "learning_rate": 0.00019226612486592271, - "loss": 0.8971, - "step": 3700 - }, - { - "epoch": 0.21371712044300878, - "grad_norm": 0.3066560924053192, - "learning_rate": 0.00019222725506809547, - "loss": 0.9893, - "step": 3705 - }, - { - "epoch": 0.21400553760959853, - "grad_norm": 0.31458479166030884, - "learning_rate": 0.00019218829178601713, - "loss": 1.0389, - "step": 3710 - }, - { - "epoch": 0.2142939547761883, - "grad_norm": 0.3057044446468353, - "learning_rate": 0.00019214923505918202, - "loss": 1.0005, - "step": 3715 - }, - { - "epoch": 0.21458237194277804, - "grad_norm": 0.27441418170928955, - "learning_rate": 0.00019211008492717914, - "loss": 0.9777, - "step": 3720 - }, - { - "epoch": 0.2148707891093678, - "grad_norm": 0.2985784113407135, - "learning_rate": 0.00019207084142969225, - "loss": 1.0475, - "step": 3725 - }, - { - "epoch": 0.21515920627595755, - "grad_norm": 0.305512934923172, - "learning_rate": 0.0001920315046064997, - "loss": 0.9554, - "step": 3730 - }, - { - "epoch": 0.2154476234425473, - "grad_norm": 0.3009251356124878, - "learning_rate": 0.0001919920744974745, - "loss": 0.9912, - "step": 3735 - }, - { - "epoch": 0.21573604060913706, - "grad_norm": 0.29489755630493164, - "learning_rate": 0.00019195255114258408, - "loss": 0.9554, - "step": 3740 - }, - { - "epoch": 0.21602445777572682, - "grad_norm": 0.3059771955013275, - "learning_rate": 0.0001919129345818905, - "loss": 0.9819, - "step": 3745 - }, - { - "epoch": 0.21631287494231657, - "grad_norm": 0.3015615940093994, - "learning_rate": 0.00019187322485555031, - "loss": 0.9948, - "step": 3750 - }, - { - "epoch": 0.21660129210890633, - "grad_norm": 0.3108586072921753, - "learning_rate": 0.0001918334220038144, - "loss": 0.9818, - "step": 3755 - }, - { - "epoch": 0.21688970927549608, - "grad_norm": 0.30573326349258423, - "learning_rate": 0.00019179352606702813, - "loss": 0.9519, - "step": 3760 - }, - { - "epoch": 0.21717812644208584, - "grad_norm": 0.2957397997379303, - "learning_rate": 0.00019175353708563117, - "loss": 1.0094, - "step": 3765 - }, - { - "epoch": 0.2174665436086756, - "grad_norm": 0.2969014644622803, - "learning_rate": 0.00019171345510015758, - "loss": 1.0162, - "step": 3770 - }, - { - "epoch": 0.21775496077526535, - "grad_norm": 0.33074361085891724, - "learning_rate": 0.00019167328015123558, - "loss": 0.9382, - "step": 3775 - }, - { - "epoch": 0.2180433779418551, - "grad_norm": 0.2909998297691345, - "learning_rate": 0.0001916330122795877, - "loss": 0.9768, - "step": 3780 - }, - { - "epoch": 0.21833179510844486, - "grad_norm": 0.28647512197494507, - "learning_rate": 0.00019159265152603064, - "loss": 0.9658, - "step": 3785 - }, - { - "epoch": 0.21862021227503461, - "grad_norm": 0.3733946979045868, - "learning_rate": 0.00019155219793147522, - "loss": 1.037, - "step": 3790 - }, - { - "epoch": 0.21890862944162437, - "grad_norm": 0.2883405089378357, - "learning_rate": 0.00019151165153692644, - "loss": 0.9551, - "step": 3795 - }, - { - "epoch": 0.21919704660821412, - "grad_norm": 0.33625394105911255, - "learning_rate": 0.00019147101238348326, - "loss": 0.995, - "step": 3800 - }, - { - "epoch": 0.21948546377480388, - "grad_norm": 0.4042999744415283, - "learning_rate": 0.00019143028051233873, - "loss": 0.9512, - "step": 3805 - }, - { - "epoch": 0.21977388094139363, - "grad_norm": 0.277295857667923, - "learning_rate": 0.00019138945596477994, - "loss": 0.9281, - "step": 3810 - }, - { - "epoch": 0.2200622981079834, - "grad_norm": 0.3070628046989441, - "learning_rate": 0.0001913485387821877, - "loss": 0.938, - "step": 3815 - }, - { - "epoch": 0.22035071527457314, - "grad_norm": 0.2898661494255066, - "learning_rate": 0.00019130752900603702, - "loss": 1.0103, - "step": 3820 - }, - { - "epoch": 0.2206391324411629, - "grad_norm": 0.2981604039669037, - "learning_rate": 0.00019126642667789654, - "loss": 0.9787, - "step": 3825 - }, - { - "epoch": 0.22092754960775265, - "grad_norm": 0.2816370129585266, - "learning_rate": 0.00019122523183942879, - "loss": 1.039, - "step": 3830 - }, - { - "epoch": 0.2212159667743424, - "grad_norm": 0.306822806596756, - "learning_rate": 0.00019118394453239006, - "loss": 1.0161, - "step": 3835 - }, - { - "epoch": 0.22150438394093216, - "grad_norm": 0.29982468485832214, - "learning_rate": 0.00019114256479863038, - "loss": 0.959, - "step": 3840 - }, - { - "epoch": 0.22179280110752192, - "grad_norm": 0.2966124713420868, - "learning_rate": 0.00019110109268009347, - "loss": 0.9996, - "step": 3845 - }, - { - "epoch": 0.22208121827411167, - "grad_norm": 0.3192947208881378, - "learning_rate": 0.00019105952821881668, - "loss": 1.0132, - "step": 3850 - }, - { - "epoch": 0.22236963544070143, - "grad_norm": 0.2927592694759369, - "learning_rate": 0.00019101787145693098, - "loss": 0.9738, - "step": 3855 - }, - { - "epoch": 0.22265805260729118, - "grad_norm": 0.2782720923423767, - "learning_rate": 0.00019097612243666086, - "loss": 0.9538, - "step": 3860 - }, - { - "epoch": 0.22294646977388094, - "grad_norm": 0.32348090410232544, - "learning_rate": 0.0001909342812003244, - "loss": 0.9593, - "step": 3865 - }, - { - "epoch": 0.2232348869404707, - "grad_norm": 0.32968342304229736, - "learning_rate": 0.00019089234779033306, - "loss": 0.9899, - "step": 3870 - }, - { - "epoch": 0.22352330410706045, - "grad_norm": 0.29580381512641907, - "learning_rate": 0.00019085032224919177, - "loss": 0.9515, - "step": 3875 - }, - { - "epoch": 0.2238117212736502, - "grad_norm": 0.27999478578567505, - "learning_rate": 0.00019080820461949886, - "loss": 0.9596, - "step": 3880 - }, - { - "epoch": 0.22410013844023996, - "grad_norm": 0.31083959341049194, - "learning_rate": 0.00019076599494394602, - "loss": 1.0069, - "step": 3885 - }, - { - "epoch": 0.22438855560682971, - "grad_norm": 0.2649812400341034, - "learning_rate": 0.00019072369326531824, - "loss": 0.9238, - "step": 3890 - }, - { - "epoch": 0.22467697277341947, - "grad_norm": 0.2908613383769989, - "learning_rate": 0.00019068129962649365, - "loss": 0.9745, - "step": 3895 - }, - { - "epoch": 0.22496538994000922, - "grad_norm": 0.2983262538909912, - "learning_rate": 0.00019063881407044373, - "loss": 0.9155, - "step": 3900 - }, - { - "epoch": 0.22525380710659898, - "grad_norm": 0.3074907660484314, - "learning_rate": 0.00019059623664023311, - "loss": 1.0384, - "step": 3905 - }, - { - "epoch": 0.22554222427318874, - "grad_norm": 0.3024677336215973, - "learning_rate": 0.00019055356737901952, - "loss": 1.0626, - "step": 3910 - }, - { - "epoch": 0.2258306414397785, - "grad_norm": 0.324719101190567, - "learning_rate": 0.00019051080633005372, - "loss": 0.9757, - "step": 3915 - }, - { - "epoch": 0.22611905860636825, - "grad_norm": 0.31149742007255554, - "learning_rate": 0.00019046795353667965, - "loss": 1.0294, - "step": 3920 - }, - { - "epoch": 0.226407475772958, - "grad_norm": 0.3361373543739319, - "learning_rate": 0.00019042500904233408, - "loss": 0.949, - "step": 3925 - }, - { - "epoch": 0.22669589293954776, - "grad_norm": 0.3346847593784332, - "learning_rate": 0.00019038197289054684, - "loss": 0.9531, - "step": 3930 - }, - { - "epoch": 0.2269843101061375, - "grad_norm": 0.3011166453361511, - "learning_rate": 0.00019033884512494064, - "loss": 0.9515, - "step": 3935 - }, - { - "epoch": 0.22727272727272727, - "grad_norm": 0.350754052400589, - "learning_rate": 0.00019029562578923106, - "loss": 0.9878, - "step": 3940 - }, - { - "epoch": 0.22756114443931702, - "grad_norm": 0.3115714192390442, - "learning_rate": 0.00019025231492722643, - "loss": 0.9914, - "step": 3945 - }, - { - "epoch": 0.22784956160590678, - "grad_norm": 0.29641732573509216, - "learning_rate": 0.000190208912582828, - "loss": 0.9508, - "step": 3950 - }, - { - "epoch": 0.22813797877249653, - "grad_norm": 0.3013533353805542, - "learning_rate": 0.0001901654188000296, - "loss": 0.9551, - "step": 3955 - }, - { - "epoch": 0.22842639593908629, - "grad_norm": 0.3072235584259033, - "learning_rate": 0.0001901218336229178, - "loss": 1.0324, - "step": 3960 - }, - { - "epoch": 0.22871481310567604, - "grad_norm": 0.2967047691345215, - "learning_rate": 0.00019007815709567183, - "loss": 0.9767, - "step": 3965 - }, - { - "epoch": 0.2290032302722658, - "grad_norm": 0.3344308137893677, - "learning_rate": 0.0001900343892625635, - "loss": 1.053, - "step": 3970 - }, - { - "epoch": 0.22929164743885555, - "grad_norm": 0.279471218585968, - "learning_rate": 0.00018999053016795719, - "loss": 0.9597, - "step": 3975 - }, - { - "epoch": 0.2295800646054453, - "grad_norm": 0.3151692748069763, - "learning_rate": 0.00018994657985630972, - "loss": 0.981, - "step": 3980 - }, - { - "epoch": 0.22986848177203506, - "grad_norm": 0.29757049679756165, - "learning_rate": 0.00018990253837217042, - "loss": 0.9948, - "step": 3985 - }, - { - "epoch": 0.23015689893862482, - "grad_norm": 0.29068654775619507, - "learning_rate": 0.00018985840576018107, - "loss": 0.9492, - "step": 3990 - }, - { - "epoch": 0.23044531610521457, - "grad_norm": 0.29149913787841797, - "learning_rate": 0.00018981418206507575, - "loss": 0.9603, - "step": 3995 - }, - { - "epoch": 0.23073373327180433, - "grad_norm": 0.2850954830646515, - "learning_rate": 0.00018976986733168093, - "loss": 1.0198, - "step": 4000 - }, - { - "epoch": 0.23102215043839408, - "grad_norm": 0.3014662563800812, - "learning_rate": 0.00018972546160491528, - "loss": 1.0628, - "step": 4005 - }, - { - "epoch": 0.23131056760498384, - "grad_norm": 0.29958969354629517, - "learning_rate": 0.00018968096492978976, - "loss": 0.9891, - "step": 4010 - }, - { - "epoch": 0.2315989847715736, - "grad_norm": 0.29551297426223755, - "learning_rate": 0.0001896363773514075, - "loss": 0.9811, - "step": 4015 - }, - { - "epoch": 0.23188740193816337, - "grad_norm": 0.30971017479896545, - "learning_rate": 0.0001895916989149638, - "loss": 1.0459, - "step": 4020 - }, - { - "epoch": 0.23217581910475313, - "grad_norm": 0.3282906115055084, - "learning_rate": 0.000189546929665746, - "loss": 1.0698, - "step": 4025 - }, - { - "epoch": 0.23246423627134288, - "grad_norm": 0.3017507493495941, - "learning_rate": 0.00018950206964913355, - "loss": 0.9867, - "step": 4030 - }, - { - "epoch": 0.23275265343793264, - "grad_norm": 0.34195518493652344, - "learning_rate": 0.0001894571189105979, - "loss": 0.9247, - "step": 4035 - }, - { - "epoch": 0.2330410706045224, - "grad_norm": 0.33378762006759644, - "learning_rate": 0.00018941207749570237, - "loss": 1.0384, - "step": 4040 - }, - { - "epoch": 0.23332948777111215, - "grad_norm": 0.325948029756546, - "learning_rate": 0.00018936694545010232, - "loss": 0.9698, - "step": 4045 - }, - { - "epoch": 0.2336179049377019, - "grad_norm": 0.2848076820373535, - "learning_rate": 0.0001893217228195449, - "loss": 1.0036, - "step": 4050 - }, - { - "epoch": 0.23390632210429166, - "grad_norm": 0.30070775747299194, - "learning_rate": 0.0001892764096498691, - "loss": 1.0397, - "step": 4055 - }, - { - "epoch": 0.2341947392708814, - "grad_norm": 0.3177594244480133, - "learning_rate": 0.00018923100598700561, - "loss": 1.0136, - "step": 4060 - }, - { - "epoch": 0.23448315643747117, - "grad_norm": 0.31077563762664795, - "learning_rate": 0.00018918551187697703, - "loss": 0.9457, - "step": 4065 - }, - { - "epoch": 0.23477157360406092, - "grad_norm": 0.2947135865688324, - "learning_rate": 0.00018913992736589746, - "loss": 0.9988, - "step": 4070 - }, - { - "epoch": 0.23505999077065068, - "grad_norm": 0.26377373933792114, - "learning_rate": 0.00018909425249997267, - "loss": 0.9891, - "step": 4075 - }, - { - "epoch": 0.23534840793724043, - "grad_norm": 0.3427537977695465, - "learning_rate": 0.0001890484873255001, - "loss": 0.993, - "step": 4080 - }, - { - "epoch": 0.2356368251038302, - "grad_norm": 0.28606218099594116, - "learning_rate": 0.00018900263188886864, - "loss": 0.9609, - "step": 4085 - }, - { - "epoch": 0.23592524227041994, - "grad_norm": 0.31335821747779846, - "learning_rate": 0.00018895668623655873, - "loss": 0.9278, - "step": 4090 - }, - { - "epoch": 0.2362136594370097, - "grad_norm": 0.3148699104785919, - "learning_rate": 0.00018891065041514224, - "loss": 0.9486, - "step": 4095 - }, - { - "epoch": 0.23650207660359945, - "grad_norm": 0.30335333943367004, - "learning_rate": 0.0001888645244712824, - "loss": 0.9604, - "step": 4100 - }, - { - "epoch": 0.2367904937701892, - "grad_norm": 0.2990083396434784, - "learning_rate": 0.0001888183084517338, - "loss": 0.9277, - "step": 4105 - }, - { - "epoch": 0.23707891093677896, - "grad_norm": 0.3039418160915375, - "learning_rate": 0.00018877200240334236, - "loss": 1.0381, - "step": 4110 - }, - { - "epoch": 0.23736732810336872, - "grad_norm": 0.3109247386455536, - "learning_rate": 0.0001887256063730453, - "loss": 1.0214, - "step": 4115 - }, - { - "epoch": 0.23765574526995847, - "grad_norm": 0.29135051369667053, - "learning_rate": 0.00018867912040787096, - "loss": 1.0111, - "step": 4120 - }, - { - "epoch": 0.23794416243654823, - "grad_norm": 0.29950061440467834, - "learning_rate": 0.0001886325445549389, - "loss": 0.9879, - "step": 4125 - }, - { - "epoch": 0.23823257960313798, - "grad_norm": 0.3028976619243622, - "learning_rate": 0.00018858587886145975, - "loss": 0.9808, - "step": 4130 - }, - { - "epoch": 0.23852099676972774, - "grad_norm": 0.2960391342639923, - "learning_rate": 0.0001885391233747352, - "loss": 0.9033, - "step": 4135 - }, - { - "epoch": 0.2388094139363175, - "grad_norm": 0.28858163952827454, - "learning_rate": 0.00018849227814215805, - "loss": 0.8774, - "step": 4140 - }, - { - "epoch": 0.23909783110290725, - "grad_norm": 0.3187437653541565, - "learning_rate": 0.00018844534321121195, - "loss": 1.032, - "step": 4145 - }, - { - "epoch": 0.239386248269497, - "grad_norm": 0.30050045251846313, - "learning_rate": 0.00018839831862947152, - "loss": 0.9785, - "step": 4150 - }, - { - "epoch": 0.23967466543608676, - "grad_norm": 0.3172016739845276, - "learning_rate": 0.0001883512044446023, - "loss": 1.0049, - "step": 4155 - }, - { - "epoch": 0.23996308260267651, - "grad_norm": 0.2758901119232178, - "learning_rate": 0.00018830400070436057, - "loss": 0.8758, - "step": 4160 - }, - { - "epoch": 0.24025149976926627, - "grad_norm": 0.31265828013420105, - "learning_rate": 0.00018825670745659345, - "loss": 0.9875, - "step": 4165 - }, - { - "epoch": 0.24053991693585602, - "grad_norm": 0.2935623526573181, - "learning_rate": 0.00018820932474923873, - "loss": 0.9738, - "step": 4170 - }, - { - "epoch": 0.24082833410244578, - "grad_norm": 0.31961116194725037, - "learning_rate": 0.00018816185263032496, - "loss": 0.985, - "step": 4175 - }, - { - "epoch": 0.24111675126903553, - "grad_norm": 0.302990198135376, - "learning_rate": 0.00018811429114797123, - "loss": 0.9693, - "step": 4180 - }, - { - "epoch": 0.2414051684356253, - "grad_norm": 0.3246656358242035, - "learning_rate": 0.00018806664035038727, - "loss": 0.9715, - "step": 4185 - }, - { - "epoch": 0.24169358560221504, - "grad_norm": 0.30691856145858765, - "learning_rate": 0.00018801890028587333, - "loss": 0.9967, - "step": 4190 - }, - { - "epoch": 0.2419820027688048, - "grad_norm": 0.3090788424015045, - "learning_rate": 0.00018797107100282015, - "loss": 1.0014, - "step": 4195 - }, - { - "epoch": 0.24227041993539455, - "grad_norm": 0.28349974751472473, - "learning_rate": 0.0001879231525497089, - "loss": 0.9426, - "step": 4200 - }, - { - "epoch": 0.2425588371019843, - "grad_norm": 0.3226814270019531, - "learning_rate": 0.00018787514497511104, - "loss": 1.0058, - "step": 4205 - }, - { - "epoch": 0.24284725426857406, - "grad_norm": 0.3090320825576782, - "learning_rate": 0.0001878270483276886, - "loss": 0.9565, - "step": 4210 - }, - { - "epoch": 0.24313567143516382, - "grad_norm": 0.29639485478401184, - "learning_rate": 0.00018777886265619365, - "loss": 0.9994, - "step": 4215 - }, - { - "epoch": 0.24342408860175357, - "grad_norm": 0.30157527327537537, - "learning_rate": 0.00018773058800946858, - "loss": 0.9349, - "step": 4220 - }, - { - "epoch": 0.24371250576834333, - "grad_norm": 0.2847401797771454, - "learning_rate": 0.0001876822244364461, - "loss": 0.9882, - "step": 4225 - }, - { - "epoch": 0.24400092293493308, - "grad_norm": 0.2939082086086273, - "learning_rate": 0.00018763377198614887, - "loss": 0.9545, - "step": 4230 - }, - { - "epoch": 0.24428934010152284, - "grad_norm": 0.30300137400627136, - "learning_rate": 0.00018758523070768973, - "loss": 0.9069, - "step": 4235 - }, - { - "epoch": 0.2445777572681126, - "grad_norm": 0.2980591952800751, - "learning_rate": 0.00018753660065027152, - "loss": 0.9992, - "step": 4240 - }, - { - "epoch": 0.24486617443470235, - "grad_norm": 0.31828731298446655, - "learning_rate": 0.00018748788186318712, - "loss": 0.9711, - "step": 4245 - }, - { - "epoch": 0.2451545916012921, - "grad_norm": 0.31123876571655273, - "learning_rate": 0.00018743907439581933, - "loss": 0.9393, - "step": 4250 - }, - { - "epoch": 0.24544300876788186, - "grad_norm": 0.29812201857566833, - "learning_rate": 0.00018739017829764082, - "loss": 0.9653, - "step": 4255 - }, - { - "epoch": 0.24573142593447161, - "grad_norm": 0.33146384358406067, - "learning_rate": 0.0001873411936182141, - "loss": 0.9758, - "step": 4260 - }, - { - "epoch": 0.24601984310106137, - "grad_norm": 0.3051407039165497, - "learning_rate": 0.0001872921204071915, - "loss": 1.0172, - "step": 4265 - }, - { - "epoch": 0.24630826026765112, - "grad_norm": 0.30195561051368713, - "learning_rate": 0.000187242958714315, - "loss": 0.9868, - "step": 4270 - }, - { - "epoch": 0.24659667743424088, - "grad_norm": 0.2948630750179291, - "learning_rate": 0.00018719370858941644, - "loss": 0.9771, - "step": 4275 - }, - { - "epoch": 0.24688509460083063, - "grad_norm": 0.3198891282081604, - "learning_rate": 0.00018714437008241709, - "loss": 1.04, - "step": 4280 - }, - { - "epoch": 0.2471735117674204, - "grad_norm": 0.3208988606929779, - "learning_rate": 0.000187094943243328, - "loss": 0.9666, - "step": 4285 - }, - { - "epoch": 0.24746192893401014, - "grad_norm": 0.3209957182407379, - "learning_rate": 0.00018704542812224956, - "loss": 0.9374, - "step": 4290 - }, - { - "epoch": 0.2477503461005999, - "grad_norm": 0.3006252348423004, - "learning_rate": 0.00018699582476937185, - "loss": 0.9798, - "step": 4295 - }, - { - "epoch": 0.24803876326718965, - "grad_norm": 0.3490176796913147, - "learning_rate": 0.00018694613323497422, - "loss": 1.0087, - "step": 4300 - }, - { - "epoch": 0.2483271804337794, - "grad_norm": 0.3163358271121979, - "learning_rate": 0.0001868963535694255, - "loss": 1.043, - "step": 4305 - }, - { - "epoch": 0.24861559760036916, - "grad_norm": 0.298026442527771, - "learning_rate": 0.0001868464858231838, - "loss": 1.0404, - "step": 4310 - }, - { - "epoch": 0.24890401476695892, - "grad_norm": 0.3209499418735504, - "learning_rate": 0.00018679653004679655, - "loss": 0.9687, - "step": 4315 - }, - { - "epoch": 0.24919243193354867, - "grad_norm": 0.3158719539642334, - "learning_rate": 0.0001867464862909004, - "loss": 0.9548, - "step": 4320 - }, - { - "epoch": 0.24948084910013843, - "grad_norm": 0.28783926367759705, - "learning_rate": 0.00018669635460622107, - "loss": 0.9042, - "step": 4325 - }, - { - "epoch": 0.24976926626672818, - "grad_norm": 0.2980654835700989, - "learning_rate": 0.00018664613504357366, - "loss": 0.97, - "step": 4330 - }, - { - "epoch": 0.25005768343331797, - "grad_norm": 0.2950812876224518, - "learning_rate": 0.00018659582765386204, - "loss": 1.0261, - "step": 4335 - }, - { - "epoch": 0.2503461005999077, - "grad_norm": 0.2984694540500641, - "learning_rate": 0.0001865454324880794, - "loss": 0.9859, - "step": 4340 - }, - { - "epoch": 0.2506345177664975, - "grad_norm": 0.3119395971298218, - "learning_rate": 0.00018649494959730765, - "loss": 1.03, - "step": 4345 - }, - { - "epoch": 0.2509229349330872, - "grad_norm": 0.3380660116672516, - "learning_rate": 0.00018644437903271778, - "loss": 1.0373, - "step": 4350 - }, - { - "epoch": 0.251211352099677, - "grad_norm": 0.310693621635437, - "learning_rate": 0.0001863937208455696, - "loss": 0.977, - "step": 4355 - }, - { - "epoch": 0.2514997692662667, - "grad_norm": 0.3119440972805023, - "learning_rate": 0.00018634297508721167, - "loss": 0.9384, - "step": 4360 - }, - { - "epoch": 0.2517881864328565, - "grad_norm": 0.3072355389595032, - "learning_rate": 0.00018629214180908144, - "loss": 1.0126, - "step": 4365 - }, - { - "epoch": 0.2520766035994462, - "grad_norm": 0.3056802749633789, - "learning_rate": 0.00018624122106270506, - "loss": 0.9496, - "step": 4370 - }, - { - "epoch": 0.252365020766036, - "grad_norm": 0.34883102774620056, - "learning_rate": 0.00018619021289969717, - "loss": 0.9626, - "step": 4375 - }, - { - "epoch": 0.25265343793262574, - "grad_norm": 0.2876664698123932, - "learning_rate": 0.00018613911737176125, - "loss": 0.9452, - "step": 4380 - }, - { - "epoch": 0.2529418550992155, - "grad_norm": 0.3051524758338928, - "learning_rate": 0.00018608793453068914, - "loss": 0.996, - "step": 4385 - }, - { - "epoch": 0.25323027226580525, - "grad_norm": 0.2734985053539276, - "learning_rate": 0.0001860366644283613, - "loss": 0.9395, - "step": 4390 - }, - { - "epoch": 0.25351868943239503, - "grad_norm": 0.30163031816482544, - "learning_rate": 0.00018598530711674667, - "loss": 0.9608, - "step": 4395 - }, - { - "epoch": 0.25380710659898476, - "grad_norm": 0.2709837555885315, - "learning_rate": 0.00018593386264790243, - "loss": 0.9611, - "step": 4400 - }, - { - "epoch": 0.25409552376557454, - "grad_norm": 0.3166120946407318, - "learning_rate": 0.00018588233107397429, - "loss": 0.8999, - "step": 4405 - }, - { - "epoch": 0.25438394093216427, - "grad_norm": 0.2956826090812683, - "learning_rate": 0.00018583071244719607, - "loss": 0.9097, - "step": 4410 - }, - { - "epoch": 0.25467235809875405, - "grad_norm": 0.31426194310188293, - "learning_rate": 0.00018577900681989, - "loss": 0.941, - "step": 4415 - }, - { - "epoch": 0.2549607752653438, - "grad_norm": 0.2746027410030365, - "learning_rate": 0.0001857272142444664, - "loss": 0.9168, - "step": 4420 - }, - { - "epoch": 0.25524919243193356, - "grad_norm": 0.2936379015445709, - "learning_rate": 0.00018567533477342377, - "loss": 0.9536, - "step": 4425 - }, - { - "epoch": 0.2555376095985233, - "grad_norm": 0.31358134746551514, - "learning_rate": 0.0001856233684593486, - "loss": 0.9569, - "step": 4430 - }, - { - "epoch": 0.25582602676511307, - "grad_norm": 0.31144851446151733, - "learning_rate": 0.0001855713153549155, - "loss": 0.9447, - "step": 4435 - }, - { - "epoch": 0.2561144439317028, - "grad_norm": 0.31088197231292725, - "learning_rate": 0.00018551917551288706, - "loss": 0.9873, - "step": 4440 - }, - { - "epoch": 0.2564028610982926, - "grad_norm": 0.31137150526046753, - "learning_rate": 0.0001854669489861137, - "loss": 0.9769, - "step": 4445 - }, - { - "epoch": 0.2566912782648823, - "grad_norm": 0.3470550775527954, - "learning_rate": 0.0001854146358275338, - "loss": 0.9824, - "step": 4450 - }, - { - "epoch": 0.2569796954314721, - "grad_norm": 0.305550754070282, - "learning_rate": 0.00018536223609017348, - "loss": 1.0573, - "step": 4455 - }, - { - "epoch": 0.2572681125980618, - "grad_norm": 0.30111902952194214, - "learning_rate": 0.00018530974982714667, - "loss": 0.9919, - "step": 4460 - }, - { - "epoch": 0.2575565297646516, - "grad_norm": 0.29458123445510864, - "learning_rate": 0.00018525717709165498, - "loss": 1.0249, - "step": 4465 - }, - { - "epoch": 0.2578449469312413, - "grad_norm": 0.2974050045013428, - "learning_rate": 0.0001852045179369877, - "loss": 1.0155, - "step": 4470 - }, - { - "epoch": 0.2581333640978311, - "grad_norm": 0.27646365761756897, - "learning_rate": 0.00018515177241652163, - "loss": 0.9477, - "step": 4475 - }, - { - "epoch": 0.25842178126442084, - "grad_norm": 0.3065283000469208, - "learning_rate": 0.0001850989405837212, - "loss": 0.9789, - "step": 4480 - }, - { - "epoch": 0.2587101984310106, - "grad_norm": 0.31208351254463196, - "learning_rate": 0.00018504602249213838, - "loss": 1.0209, - "step": 4485 - }, - { - "epoch": 0.25899861559760035, - "grad_norm": 0.27680978178977966, - "learning_rate": 0.0001849930181954124, - "loss": 0.9937, - "step": 4490 - }, - { - "epoch": 0.25928703276419013, - "grad_norm": 0.35537493228912354, - "learning_rate": 0.00018493992774727005, - "loss": 1.019, - "step": 4495 - }, - { - "epoch": 0.25957544993077986, - "grad_norm": 0.2992296814918518, - "learning_rate": 0.00018488675120152532, - "loss": 0.9409, - "step": 4500 - }, - { - "epoch": 0.25986386709736964, - "grad_norm": 0.2907122075557709, - "learning_rate": 0.00018483348861207953, - "loss": 0.9925, - "step": 4505 - }, - { - "epoch": 0.26015228426395937, - "grad_norm": 0.3083319664001465, - "learning_rate": 0.00018478014003292116, - "loss": 0.9494, - "step": 4510 - }, - { - "epoch": 0.26044070143054915, - "grad_norm": 0.2940841615200043, - "learning_rate": 0.00018472670551812596, - "loss": 1.0234, - "step": 4515 - }, - { - "epoch": 0.2607291185971389, - "grad_norm": 0.3526857793331146, - "learning_rate": 0.0001846731851218567, - "loss": 1.0047, - "step": 4520 - }, - { - "epoch": 0.26101753576372866, - "grad_norm": 0.2867284119129181, - "learning_rate": 0.00018461957889836324, - "loss": 0.953, - "step": 4525 - }, - { - "epoch": 0.2613059529303184, - "grad_norm": 0.28662440180778503, - "learning_rate": 0.00018456588690198236, - "loss": 0.9734, - "step": 4530 - }, - { - "epoch": 0.26159437009690817, - "grad_norm": 0.2874925136566162, - "learning_rate": 0.0001845121091871379, - "loss": 1.012, - "step": 4535 - }, - { - "epoch": 0.2618827872634979, - "grad_norm": 0.30890873074531555, - "learning_rate": 0.0001844582458083405, - "loss": 0.9317, - "step": 4540 - }, - { - "epoch": 0.2621712044300877, - "grad_norm": 0.2991410791873932, - "learning_rate": 0.0001844042968201877, - "loss": 0.9488, - "step": 4545 - }, - { - "epoch": 0.26245962159667746, - "grad_norm": 0.29846030473709106, - "learning_rate": 0.0001843502622773637, - "loss": 0.9722, - "step": 4550 - }, - { - "epoch": 0.2627480387632672, - "grad_norm": 0.30086445808410645, - "learning_rate": 0.0001842961422346396, - "loss": 0.9901, - "step": 4555 - }, - { - "epoch": 0.26303645592985697, - "grad_norm": 0.3020675778388977, - "learning_rate": 0.00018424193674687297, - "loss": 1.0275, - "step": 4560 - }, - { - "epoch": 0.2633248730964467, - "grad_norm": 0.3111262023448944, - "learning_rate": 0.00018418764586900817, - "loss": 0.9977, - "step": 4565 - }, - { - "epoch": 0.2636132902630365, - "grad_norm": 0.3167891204357147, - "learning_rate": 0.00018413326965607593, - "loss": 1.0266, - "step": 4570 - }, - { - "epoch": 0.2639017074296262, - "grad_norm": 0.28536850214004517, - "learning_rate": 0.00018407880816319363, - "loss": 0.9475, - "step": 4575 - }, - { - "epoch": 0.264190124596216, - "grad_norm": 0.30811807513237, - "learning_rate": 0.00018402426144556504, - "loss": 0.9549, - "step": 4580 - }, - { - "epoch": 0.2644785417628057, - "grad_norm": 0.2881765365600586, - "learning_rate": 0.0001839696295584803, - "loss": 1.0276, - "step": 4585 - }, - { - "epoch": 0.2647669589293955, - "grad_norm": 0.3339601159095764, - "learning_rate": 0.0001839149125573159, - "loss": 0.9772, - "step": 4590 - }, - { - "epoch": 0.26505537609598523, - "grad_norm": 0.2897505760192871, - "learning_rate": 0.0001838601104975346, - "loss": 1.0897, - "step": 4595 - }, - { - "epoch": 0.265343793262575, - "grad_norm": 0.3119150400161743, - "learning_rate": 0.00018380522343468532, - "loss": 0.9842, - "step": 4600 - }, - { - "epoch": 0.265343793262575, - "step": 4600, - "total_flos": 3.2343958172802744e+18, - "train_loss": 0.0, - "train_runtime": 0.0427, - "train_samples_per_second": 9970.556, - "train_steps_per_second": 304.266 + "epoch": 1.0, + "step": 134, + "total_flos": 3.768774247149732e+17, + "train_loss": 0.5603564016854585, + "train_runtime": 1424.9095, + "train_samples_per_second": 3.001, + "train_steps_per_second": 0.094 } ], "logging_steps": 5, - "max_steps": 13, + "max_steps": 134, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, @@ -6477,12 +227,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 3.2343958172802744e+18, + "total_flos": 3.768774247149732e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null