diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17802 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996550874599656, + "eval_steps": 500, + "global_step": 2536, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003941857600394186, + "grad_norm": 105.2694138675085, + "learning_rate": 7.874015748031497e-08, + "loss": 9.1364, + "step": 1 + }, + { + "epoch": 0.0007883715200788372, + "grad_norm": 96.39150101397924, + "learning_rate": 1.5748031496062994e-07, + "loss": 9.154, + "step": 2 + }, + { + "epoch": 0.0011825572801182557, + "grad_norm": 101.32411852803344, + "learning_rate": 2.362204724409449e-07, + "loss": 9.0663, + "step": 3 + }, + { + "epoch": 0.0015767430401576743, + "grad_norm": 82.40406383837171, + "learning_rate": 3.149606299212599e-07, + "loss": 9.042, + "step": 4 + }, + { + "epoch": 0.001970928800197093, + "grad_norm": 67.15234107674974, + "learning_rate": 3.937007874015748e-07, + "loss": 8.8627, + "step": 5 + }, + { + "epoch": 0.0023651145602365115, + "grad_norm": 50.03571730325678, + "learning_rate": 4.724409448818898e-07, + "loss": 8.7217, + "step": 6 + }, + { + "epoch": 0.00275930032027593, + "grad_norm": 44.44168073728412, + "learning_rate": 5.511811023622048e-07, + "loss": 8.7231, + "step": 7 + }, + { + "epoch": 0.0031534860803153486, + "grad_norm": 58.27436136079177, + "learning_rate": 6.299212598425198e-07, + "loss": 8.6106, + "step": 8 + }, + { + "epoch": 0.003547671840354767, + "grad_norm": 53.84109799579168, + "learning_rate": 7.086614173228346e-07, + "loss": 8.5611, + "step": 9 + }, + { + "epoch": 0.003941857600394186, + "grad_norm": 57.647941933879174, + "learning_rate": 7.874015748031496e-07, + "loss": 8.4481, + "step": 10 + }, + { + "epoch": 0.004336043360433604, + "grad_norm": 53.27276526815267, + "learning_rate": 8.661417322834646e-07, + "loss": 8.2106, + "step": 11 + }, + { + "epoch": 0.004730229120473023, + "grad_norm": 41.91985046682287, + "learning_rate": 9.448818897637796e-07, + "loss": 8.178, + "step": 12 + }, + { + "epoch": 0.0051244148805124415, + "grad_norm": 53.755309589361765, + "learning_rate": 1.0236220472440946e-06, + "loss": 7.9319, + "step": 13 + }, + { + "epoch": 0.00551860064055186, + "grad_norm": 46.32010256680663, + "learning_rate": 1.1023622047244096e-06, + "loss": 7.8888, + "step": 14 + }, + { + "epoch": 0.005912786400591279, + "grad_norm": 49.6775736894897, + "learning_rate": 1.1811023622047246e-06, + "loss": 7.5264, + "step": 15 + }, + { + "epoch": 0.006306972160630697, + "grad_norm": 42.62706429768497, + "learning_rate": 1.2598425196850396e-06, + "loss": 7.3966, + "step": 16 + }, + { + "epoch": 0.006701157920670116, + "grad_norm": 49.62426641360553, + "learning_rate": 1.3385826771653545e-06, + "loss": 7.2773, + "step": 17 + }, + { + "epoch": 0.007095343680709534, + "grad_norm": 37.16995292351442, + "learning_rate": 1.4173228346456693e-06, + "loss": 7.1822, + "step": 18 + }, + { + "epoch": 0.007489529440748953, + "grad_norm": 47.04696207168547, + "learning_rate": 1.4960629921259845e-06, + "loss": 7.0047, + "step": 19 + }, + { + "epoch": 0.007883715200788372, + "grad_norm": 37.94846722638193, + "learning_rate": 1.5748031496062992e-06, + "loss": 6.7177, + "step": 20 + }, + { + "epoch": 0.00827790096082779, + "grad_norm": 38.60623637532149, + "learning_rate": 1.6535433070866144e-06, + "loss": 6.6605, + "step": 21 + }, + { + "epoch": 0.008672086720867209, + "grad_norm": 32.058378304509375, + "learning_rate": 1.7322834645669292e-06, + "loss": 6.4935, + "step": 22 + }, + { + "epoch": 0.009066272480906627, + "grad_norm": 29.645376469665575, + "learning_rate": 1.8110236220472444e-06, + "loss": 6.1995, + "step": 23 + }, + { + "epoch": 0.009460458240946046, + "grad_norm": 27.31573397346269, + "learning_rate": 1.8897637795275591e-06, + "loss": 6.0939, + "step": 24 + }, + { + "epoch": 0.009854644000985464, + "grad_norm": 28.841277709048004, + "learning_rate": 1.968503937007874e-06, + "loss": 5.8683, + "step": 25 + }, + { + "epoch": 0.010248829761024883, + "grad_norm": 28.590239784856998, + "learning_rate": 2.0472440944881893e-06, + "loss": 5.7835, + "step": 26 + }, + { + "epoch": 0.010643015521064302, + "grad_norm": 24.204577484360353, + "learning_rate": 2.125984251968504e-06, + "loss": 5.7053, + "step": 27 + }, + { + "epoch": 0.01103720128110372, + "grad_norm": 26.31176344346586, + "learning_rate": 2.2047244094488192e-06, + "loss": 5.4684, + "step": 28 + }, + { + "epoch": 0.011431387041143139, + "grad_norm": 32.01648516861882, + "learning_rate": 2.283464566929134e-06, + "loss": 5.3315, + "step": 29 + }, + { + "epoch": 0.011825572801182557, + "grad_norm": 19.524646014081327, + "learning_rate": 2.362204724409449e-06, + "loss": 5.3126, + "step": 30 + }, + { + "epoch": 0.012219758561221976, + "grad_norm": 17.63176728075391, + "learning_rate": 2.440944881889764e-06, + "loss": 5.0745, + "step": 31 + }, + { + "epoch": 0.012613944321261394, + "grad_norm": 15.40058495681761, + "learning_rate": 2.519685039370079e-06, + "loss": 5.0836, + "step": 32 + }, + { + "epoch": 0.013008130081300813, + "grad_norm": 15.479272180480063, + "learning_rate": 2.598425196850394e-06, + "loss": 4.7458, + "step": 33 + }, + { + "epoch": 0.013402315841340232, + "grad_norm": 14.695908677216751, + "learning_rate": 2.677165354330709e-06, + "loss": 4.7244, + "step": 34 + }, + { + "epoch": 0.01379650160137965, + "grad_norm": 13.524764594889588, + "learning_rate": 2.755905511811024e-06, + "loss": 4.3899, + "step": 35 + }, + { + "epoch": 0.014190687361419069, + "grad_norm": 13.246003272441062, + "learning_rate": 2.8346456692913386e-06, + "loss": 4.5221, + "step": 36 + }, + { + "epoch": 0.014584873121458487, + "grad_norm": 12.203731349756566, + "learning_rate": 2.9133858267716538e-06, + "loss": 4.3881, + "step": 37 + }, + { + "epoch": 0.014979058881497906, + "grad_norm": 14.042134575473153, + "learning_rate": 2.992125984251969e-06, + "loss": 4.2049, + "step": 38 + }, + { + "epoch": 0.015373244641537324, + "grad_norm": 11.055066543698642, + "learning_rate": 3.0708661417322837e-06, + "loss": 4.3104, + "step": 39 + }, + { + "epoch": 0.015767430401576743, + "grad_norm": 10.153496145045878, + "learning_rate": 3.1496062992125985e-06, + "loss": 4.3375, + "step": 40 + }, + { + "epoch": 0.01616161616161616, + "grad_norm": 10.799528438464218, + "learning_rate": 3.2283464566929136e-06, + "loss": 4.4063, + "step": 41 + }, + { + "epoch": 0.01655580192165558, + "grad_norm": 8.368548564564762, + "learning_rate": 3.307086614173229e-06, + "loss": 3.7956, + "step": 42 + }, + { + "epoch": 0.016949987681695, + "grad_norm": 10.759799829642327, + "learning_rate": 3.3858267716535436e-06, + "loss": 3.9338, + "step": 43 + }, + { + "epoch": 0.017344173441734417, + "grad_norm": 9.554117583184022, + "learning_rate": 3.4645669291338583e-06, + "loss": 3.8938, + "step": 44 + }, + { + "epoch": 0.017738359201773836, + "grad_norm": 23.01084152913365, + "learning_rate": 3.5433070866141735e-06, + "loss": 3.8921, + "step": 45 + }, + { + "epoch": 0.018132544961813254, + "grad_norm": 9.532765765693696, + "learning_rate": 3.6220472440944887e-06, + "loss": 3.9267, + "step": 46 + }, + { + "epoch": 0.018526730721852673, + "grad_norm": 7.676169667219361, + "learning_rate": 3.7007874015748035e-06, + "loss": 3.5909, + "step": 47 + }, + { + "epoch": 0.01892091648189209, + "grad_norm": 12.040351419125447, + "learning_rate": 3.7795275590551182e-06, + "loss": 3.9373, + "step": 48 + }, + { + "epoch": 0.01931510224193151, + "grad_norm": 8.25216993424453, + "learning_rate": 3.858267716535433e-06, + "loss": 3.5314, + "step": 49 + }, + { + "epoch": 0.01970928800197093, + "grad_norm": 7.474412198918091, + "learning_rate": 3.937007874015748e-06, + "loss": 3.506, + "step": 50 + }, + { + "epoch": 0.020103473762010347, + "grad_norm": 7.847621110877795, + "learning_rate": 4.015748031496064e-06, + "loss": 3.5028, + "step": 51 + }, + { + "epoch": 0.020497659522049766, + "grad_norm": 6.570956902449958, + "learning_rate": 4.0944881889763785e-06, + "loss": 3.4612, + "step": 52 + }, + { + "epoch": 0.020891845282089185, + "grad_norm": 5.5766242231172924, + "learning_rate": 4.173228346456693e-06, + "loss": 3.2965, + "step": 53 + }, + { + "epoch": 0.021286031042128603, + "grad_norm": 6.108165687578511, + "learning_rate": 4.251968503937008e-06, + "loss": 3.4297, + "step": 54 + }, + { + "epoch": 0.02168021680216802, + "grad_norm": 5.219670006640724, + "learning_rate": 4.330708661417324e-06, + "loss": 2.9365, + "step": 55 + }, + { + "epoch": 0.02207440256220744, + "grad_norm": 11.909762655268862, + "learning_rate": 4.4094488188976384e-06, + "loss": 3.3342, + "step": 56 + }, + { + "epoch": 0.02246858832224686, + "grad_norm": 6.039053713195223, + "learning_rate": 4.488188976377953e-06, + "loss": 3.1308, + "step": 57 + }, + { + "epoch": 0.022862774082286277, + "grad_norm": 6.330821449415944, + "learning_rate": 4.566929133858268e-06, + "loss": 3.1559, + "step": 58 + }, + { + "epoch": 0.023256959842325696, + "grad_norm": 5.850842944173947, + "learning_rate": 4.645669291338583e-06, + "loss": 3.1376, + "step": 59 + }, + { + "epoch": 0.023651145602365115, + "grad_norm": 6.618904157271684, + "learning_rate": 4.724409448818898e-06, + "loss": 3.1044, + "step": 60 + }, + { + "epoch": 0.024045331362404533, + "grad_norm": 12.768772667010369, + "learning_rate": 4.803149606299213e-06, + "loss": 2.8825, + "step": 61 + }, + { + "epoch": 0.02443951712244395, + "grad_norm": 7.679745085489206, + "learning_rate": 4.881889763779528e-06, + "loss": 3.0757, + "step": 62 + }, + { + "epoch": 0.02483370288248337, + "grad_norm": 4.427650604634613, + "learning_rate": 4.960629921259843e-06, + "loss": 2.8175, + "step": 63 + }, + { + "epoch": 0.02522788864252279, + "grad_norm": 6.028182477121757, + "learning_rate": 5.039370078740158e-06, + "loss": 2.998, + "step": 64 + }, + { + "epoch": 0.025622074402562207, + "grad_norm": 5.50324148915112, + "learning_rate": 5.118110236220473e-06, + "loss": 2.9141, + "step": 65 + }, + { + "epoch": 0.026016260162601626, + "grad_norm": 4.48735111430469, + "learning_rate": 5.196850393700788e-06, + "loss": 2.7909, + "step": 66 + }, + { + "epoch": 0.026410445922641045, + "grad_norm": 5.701752085492088, + "learning_rate": 5.2755905511811025e-06, + "loss": 2.8697, + "step": 67 + }, + { + "epoch": 0.026804631682680463, + "grad_norm": 9.227957681435909, + "learning_rate": 5.354330708661418e-06, + "loss": 2.6822, + "step": 68 + }, + { + "epoch": 0.02719881744271988, + "grad_norm": 5.786678373864676, + "learning_rate": 5.433070866141733e-06, + "loss": 2.7271, + "step": 69 + }, + { + "epoch": 0.0275930032027593, + "grad_norm": 4.652746279810885, + "learning_rate": 5.511811023622048e-06, + "loss": 2.7177, + "step": 70 + }, + { + "epoch": 0.02798718896279872, + "grad_norm": 6.252735777715452, + "learning_rate": 5.590551181102362e-06, + "loss": 2.8251, + "step": 71 + }, + { + "epoch": 0.028381374722838137, + "grad_norm": 5.151704866859134, + "learning_rate": 5.669291338582677e-06, + "loss": 2.6813, + "step": 72 + }, + { + "epoch": 0.028775560482877556, + "grad_norm": 4.337181405580127, + "learning_rate": 5.748031496062993e-06, + "loss": 2.4957, + "step": 73 + }, + { + "epoch": 0.029169746242916975, + "grad_norm": 5.91427046899434, + "learning_rate": 5.8267716535433075e-06, + "loss": 2.6815, + "step": 74 + }, + { + "epoch": 0.029563932002956393, + "grad_norm": 7.660058774479181, + "learning_rate": 5.905511811023622e-06, + "loss": 2.7335, + "step": 75 + }, + { + "epoch": 0.029958117762995812, + "grad_norm": 4.115441568706006, + "learning_rate": 5.984251968503938e-06, + "loss": 2.5424, + "step": 76 + }, + { + "epoch": 0.03035230352303523, + "grad_norm": 5.097053848951776, + "learning_rate": 6.062992125984253e-06, + "loss": 2.5098, + "step": 77 + }, + { + "epoch": 0.03074648928307465, + "grad_norm": 3.609880169600323, + "learning_rate": 6.141732283464567e-06, + "loss": 2.4653, + "step": 78 + }, + { + "epoch": 0.031140675043114067, + "grad_norm": 4.8790844537526326, + "learning_rate": 6.220472440944882e-06, + "loss": 2.5257, + "step": 79 + }, + { + "epoch": 0.031534860803153486, + "grad_norm": 5.766910080666288, + "learning_rate": 6.299212598425197e-06, + "loss": 2.5395, + "step": 80 + }, + { + "epoch": 0.031929046563192905, + "grad_norm": 5.536361935443466, + "learning_rate": 6.3779527559055125e-06, + "loss": 2.5367, + "step": 81 + }, + { + "epoch": 0.03232323232323232, + "grad_norm": 4.770127422423979, + "learning_rate": 6.456692913385827e-06, + "loss": 2.4774, + "step": 82 + }, + { + "epoch": 0.03271741808327174, + "grad_norm": 4.416647274076856, + "learning_rate": 6.535433070866142e-06, + "loss": 2.4903, + "step": 83 + }, + { + "epoch": 0.03311160384331116, + "grad_norm": 4.431530080181854, + "learning_rate": 6.614173228346458e-06, + "loss": 2.3936, + "step": 84 + }, + { + "epoch": 0.03350578960335058, + "grad_norm": 5.6472652822872895, + "learning_rate": 6.692913385826772e-06, + "loss": 2.4404, + "step": 85 + }, + { + "epoch": 0.03389997536339, + "grad_norm": 5.200598323481072, + "learning_rate": 6.771653543307087e-06, + "loss": 2.4376, + "step": 86 + }, + { + "epoch": 0.034294161123429416, + "grad_norm": 4.387657662515284, + "learning_rate": 6.850393700787402e-06, + "loss": 2.3363, + "step": 87 + }, + { + "epoch": 0.034688346883468835, + "grad_norm": 3.2185171323039192, + "learning_rate": 6.929133858267717e-06, + "loss": 2.2646, + "step": 88 + }, + { + "epoch": 0.03508253264350825, + "grad_norm": 8.73223179057534, + "learning_rate": 7.0078740157480315e-06, + "loss": 2.3927, + "step": 89 + }, + { + "epoch": 0.03547671840354767, + "grad_norm": 6.784545315493452, + "learning_rate": 7.086614173228347e-06, + "loss": 2.3697, + "step": 90 + }, + { + "epoch": 0.03587090416358709, + "grad_norm": 4.333450921434643, + "learning_rate": 7.165354330708662e-06, + "loss": 2.304, + "step": 91 + }, + { + "epoch": 0.03626508992362651, + "grad_norm": 5.218824764842207, + "learning_rate": 7.2440944881889774e-06, + "loss": 2.3646, + "step": 92 + }, + { + "epoch": 0.03665927568366593, + "grad_norm": 4.149232430620695, + "learning_rate": 7.322834645669292e-06, + "loss": 2.2622, + "step": 93 + }, + { + "epoch": 0.037053461443705346, + "grad_norm": 4.193773298248102, + "learning_rate": 7.401574803149607e-06, + "loss": 2.2887, + "step": 94 + }, + { + "epoch": 0.037447647203744765, + "grad_norm": 4.456311860549035, + "learning_rate": 7.480314960629922e-06, + "loss": 2.3007, + "step": 95 + }, + { + "epoch": 0.03784183296378418, + "grad_norm": 4.576460153117237, + "learning_rate": 7.5590551181102365e-06, + "loss": 2.3021, + "step": 96 + }, + { + "epoch": 0.0382360187238236, + "grad_norm": 8.479196171237232, + "learning_rate": 7.637795275590551e-06, + "loss": 2.4404, + "step": 97 + }, + { + "epoch": 0.03863020448386302, + "grad_norm": 7.433380505053241, + "learning_rate": 7.716535433070867e-06, + "loss": 2.2858, + "step": 98 + }, + { + "epoch": 0.03902439024390244, + "grad_norm": 9.169489148787575, + "learning_rate": 7.79527559055118e-06, + "loss": 2.2905, + "step": 99 + }, + { + "epoch": 0.03941857600394186, + "grad_norm": 4.505614703608414, + "learning_rate": 7.874015748031496e-06, + "loss": 2.2229, + "step": 100 + }, + { + "epoch": 0.039812761763981276, + "grad_norm": 3.251111002629772, + "learning_rate": 7.952755905511812e-06, + "loss": 2.1951, + "step": 101 + }, + { + "epoch": 0.040206947524020695, + "grad_norm": 4.118590361507865, + "learning_rate": 8.031496062992128e-06, + "loss": 2.271, + "step": 102 + }, + { + "epoch": 0.04060113328406011, + "grad_norm": 6.9488591196561815, + "learning_rate": 8.110236220472441e-06, + "loss": 2.3629, + "step": 103 + }, + { + "epoch": 0.04099531904409953, + "grad_norm": 3.5799197580937454, + "learning_rate": 8.188976377952757e-06, + "loss": 2.1602, + "step": 104 + }, + { + "epoch": 0.04138950480413895, + "grad_norm": 3.698515235577877, + "learning_rate": 8.267716535433071e-06, + "loss": 2.1759, + "step": 105 + }, + { + "epoch": 0.04178369056417837, + "grad_norm": 3.2516137577135646, + "learning_rate": 8.346456692913387e-06, + "loss": 2.2093, + "step": 106 + }, + { + "epoch": 0.04217787632421779, + "grad_norm": 3.910051851712546, + "learning_rate": 8.4251968503937e-06, + "loss": 2.2229, + "step": 107 + }, + { + "epoch": 0.042572062084257206, + "grad_norm": 3.7166583065715137, + "learning_rate": 8.503937007874016e-06, + "loss": 2.0932, + "step": 108 + }, + { + "epoch": 0.042966247844296625, + "grad_norm": 2.6575124301921873, + "learning_rate": 8.582677165354332e-06, + "loss": 2.12, + "step": 109 + }, + { + "epoch": 0.04336043360433604, + "grad_norm": 3.482590385246152, + "learning_rate": 8.661417322834647e-06, + "loss": 2.0901, + "step": 110 + }, + { + "epoch": 0.04375461936437546, + "grad_norm": 4.66548163032443, + "learning_rate": 8.740157480314961e-06, + "loss": 2.0983, + "step": 111 + }, + { + "epoch": 0.04414880512441488, + "grad_norm": 2.813248162118009, + "learning_rate": 8.818897637795277e-06, + "loss": 2.0084, + "step": 112 + }, + { + "epoch": 0.0445429908844543, + "grad_norm": 2.667639210004557, + "learning_rate": 8.89763779527559e-06, + "loss": 1.9983, + "step": 113 + }, + { + "epoch": 0.04493717664449372, + "grad_norm": 3.0839886525609463, + "learning_rate": 8.976377952755906e-06, + "loss": 2.0084, + "step": 114 + }, + { + "epoch": 0.045331362404533136, + "grad_norm": 3.000412565293289, + "learning_rate": 9.05511811023622e-06, + "loss": 1.9718, + "step": 115 + }, + { + "epoch": 0.045725548164572555, + "grad_norm": 4.642416950929853, + "learning_rate": 9.133858267716536e-06, + "loss": 1.9841, + "step": 116 + }, + { + "epoch": 0.04611973392461197, + "grad_norm": 2.3154794311302886, + "learning_rate": 9.212598425196852e-06, + "loss": 1.9743, + "step": 117 + }, + { + "epoch": 0.04651391968465139, + "grad_norm": 2.545829361546042, + "learning_rate": 9.291338582677165e-06, + "loss": 1.9539, + "step": 118 + }, + { + "epoch": 0.04690810544469081, + "grad_norm": 2.974703874097749, + "learning_rate": 9.370078740157481e-06, + "loss": 1.91, + "step": 119 + }, + { + "epoch": 0.04730229120473023, + "grad_norm": 2.797427125263561, + "learning_rate": 9.448818897637797e-06, + "loss": 1.9065, + "step": 120 + }, + { + "epoch": 0.04769647696476965, + "grad_norm": 4.324127605691098, + "learning_rate": 9.52755905511811e-06, + "loss": 1.9863, + "step": 121 + }, + { + "epoch": 0.048090662724809066, + "grad_norm": 3.2983025416162945, + "learning_rate": 9.606299212598426e-06, + "loss": 1.9546, + "step": 122 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 2.2657892364343017, + "learning_rate": 9.68503937007874e-06, + "loss": 1.848, + "step": 123 + }, + { + "epoch": 0.0488790342448879, + "grad_norm": 3.2601787777289437, + "learning_rate": 9.763779527559056e-06, + "loss": 1.9285, + "step": 124 + }, + { + "epoch": 0.04927322000492732, + "grad_norm": 3.5243072214231583, + "learning_rate": 9.842519685039371e-06, + "loss": 1.8762, + "step": 125 + }, + { + "epoch": 0.04966740576496674, + "grad_norm": 3.3017593501688394, + "learning_rate": 9.921259842519685e-06, + "loss": 1.8601, + "step": 126 + }, + { + "epoch": 0.05006159152500616, + "grad_norm": 3.2653646060771444, + "learning_rate": 1e-05, + "loss": 1.8686, + "step": 127 + }, + { + "epoch": 0.05045577728504558, + "grad_norm": 2.681802464673681, + "learning_rate": 1.0078740157480316e-05, + "loss": 1.8302, + "step": 128 + }, + { + "epoch": 0.050849963045084996, + "grad_norm": 3.133929350491433, + "learning_rate": 1.015748031496063e-05, + "loss": 1.8372, + "step": 129 + }, + { + "epoch": 0.051244148805124415, + "grad_norm": 2.534354682692382, + "learning_rate": 1.0236220472440946e-05, + "loss": 1.8715, + "step": 130 + }, + { + "epoch": 0.05163833456516383, + "grad_norm": 3.0493154042368023, + "learning_rate": 1.031496062992126e-05, + "loss": 1.8485, + "step": 131 + }, + { + "epoch": 0.05203252032520325, + "grad_norm": 2.0799972512373834, + "learning_rate": 1.0393700787401575e-05, + "loss": 1.7866, + "step": 132 + }, + { + "epoch": 0.05242670608524267, + "grad_norm": 1.598403007988912, + "learning_rate": 1.047244094488189e-05, + "loss": 1.8013, + "step": 133 + }, + { + "epoch": 0.05282089184528209, + "grad_norm": 1.91178664275519, + "learning_rate": 1.0551181102362205e-05, + "loss": 1.8741, + "step": 134 + }, + { + "epoch": 0.05321507760532151, + "grad_norm": 2.1365165713401906, + "learning_rate": 1.0629921259842522e-05, + "loss": 1.7989, + "step": 135 + }, + { + "epoch": 0.053609263365360926, + "grad_norm": 2.6948885430012655, + "learning_rate": 1.0708661417322836e-05, + "loss": 1.7984, + "step": 136 + }, + { + "epoch": 0.054003449125400345, + "grad_norm": 1.8504724810176718, + "learning_rate": 1.0787401574803152e-05, + "loss": 1.7789, + "step": 137 + }, + { + "epoch": 0.05439763488543976, + "grad_norm": 1.992151255132755, + "learning_rate": 1.0866141732283466e-05, + "loss": 1.803, + "step": 138 + }, + { + "epoch": 0.05479182064547918, + "grad_norm": 3.10045850302244, + "learning_rate": 1.0944881889763781e-05, + "loss": 1.823, + "step": 139 + }, + { + "epoch": 0.0551860064055186, + "grad_norm": 2.2624346551381085, + "learning_rate": 1.1023622047244095e-05, + "loss": 1.7608, + "step": 140 + }, + { + "epoch": 0.05558019216555802, + "grad_norm": 1.9683772470424854, + "learning_rate": 1.1102362204724411e-05, + "loss": 1.8037, + "step": 141 + }, + { + "epoch": 0.05597437792559744, + "grad_norm": 3.26220140428376, + "learning_rate": 1.1181102362204725e-05, + "loss": 1.7765, + "step": 142 + }, + { + "epoch": 0.056368563685636856, + "grad_norm": 4.4068981319414595, + "learning_rate": 1.125984251968504e-05, + "loss": 1.8472, + "step": 143 + }, + { + "epoch": 0.056762749445676275, + "grad_norm": 1.6987954071831348, + "learning_rate": 1.1338582677165354e-05, + "loss": 1.7572, + "step": 144 + }, + { + "epoch": 0.057156935205715693, + "grad_norm": 1.847159040073359, + "learning_rate": 1.141732283464567e-05, + "loss": 1.6803, + "step": 145 + }, + { + "epoch": 0.05755112096575511, + "grad_norm": 2.6708041585740596, + "learning_rate": 1.1496062992125985e-05, + "loss": 1.8088, + "step": 146 + }, + { + "epoch": 0.05794530672579453, + "grad_norm": 1.9604986339037445, + "learning_rate": 1.15748031496063e-05, + "loss": 1.7155, + "step": 147 + }, + { + "epoch": 0.05833949248583395, + "grad_norm": 1.6691911028581192, + "learning_rate": 1.1653543307086615e-05, + "loss": 1.7748, + "step": 148 + }, + { + "epoch": 0.05873367824587337, + "grad_norm": 7.3318925396826895, + "learning_rate": 1.1732283464566929e-05, + "loss": 1.7572, + "step": 149 + }, + { + "epoch": 0.059127864005912786, + "grad_norm": 2.283850168056605, + "learning_rate": 1.1811023622047245e-05, + "loss": 1.7774, + "step": 150 + }, + { + "epoch": 0.059522049765952205, + "grad_norm": 1.8019088514589012, + "learning_rate": 1.1889763779527562e-05, + "loss": 1.7786, + "step": 151 + }, + { + "epoch": 0.059916235525991624, + "grad_norm": 1.3816061587980675, + "learning_rate": 1.1968503937007876e-05, + "loss": 1.7504, + "step": 152 + }, + { + "epoch": 0.06031042128603104, + "grad_norm": 5.720763322290118, + "learning_rate": 1.2047244094488191e-05, + "loss": 1.8016, + "step": 153 + }, + { + "epoch": 0.06070460704607046, + "grad_norm": 3.3964912544422994, + "learning_rate": 1.2125984251968505e-05, + "loss": 1.6964, + "step": 154 + }, + { + "epoch": 0.06109879280610988, + "grad_norm": 1.7844098526259298, + "learning_rate": 1.2204724409448821e-05, + "loss": 1.7561, + "step": 155 + }, + { + "epoch": 0.0614929785661493, + "grad_norm": 1.6826530766646766, + "learning_rate": 1.2283464566929135e-05, + "loss": 1.7069, + "step": 156 + }, + { + "epoch": 0.061887164326188716, + "grad_norm": 3.4647919464333152, + "learning_rate": 1.236220472440945e-05, + "loss": 1.7096, + "step": 157 + }, + { + "epoch": 0.062281350086228135, + "grad_norm": 2.0613781006838243, + "learning_rate": 1.2440944881889764e-05, + "loss": 1.732, + "step": 158 + }, + { + "epoch": 0.06267553584626756, + "grad_norm": 1.9503601214626853, + "learning_rate": 1.251968503937008e-05, + "loss": 1.7402, + "step": 159 + }, + { + "epoch": 0.06306972160630697, + "grad_norm": 1.8504549835287638, + "learning_rate": 1.2598425196850394e-05, + "loss": 1.7003, + "step": 160 + }, + { + "epoch": 0.0634639073663464, + "grad_norm": 2.07948846446986, + "learning_rate": 1.267716535433071e-05, + "loss": 1.7004, + "step": 161 + }, + { + "epoch": 0.06385809312638581, + "grad_norm": 1.7485726412453775, + "learning_rate": 1.2755905511811025e-05, + "loss": 1.725, + "step": 162 + }, + { + "epoch": 0.06425227888642523, + "grad_norm": 1.7868478014046527, + "learning_rate": 1.2834645669291339e-05, + "loss": 1.6828, + "step": 163 + }, + { + "epoch": 0.06464646464646465, + "grad_norm": 1.4524583527842783, + "learning_rate": 1.2913385826771655e-05, + "loss": 1.726, + "step": 164 + }, + { + "epoch": 0.06504065040650407, + "grad_norm": 1.5085438907961388, + "learning_rate": 1.2992125984251968e-05, + "loss": 1.6417, + "step": 165 + }, + { + "epoch": 0.06543483616654348, + "grad_norm": 1.5307066166089378, + "learning_rate": 1.3070866141732284e-05, + "loss": 1.6291, + "step": 166 + }, + { + "epoch": 0.06582902192658291, + "grad_norm": 1.5549360763645417, + "learning_rate": 1.3149606299212601e-05, + "loss": 1.6966, + "step": 167 + }, + { + "epoch": 0.06622320768662232, + "grad_norm": 2.1633140111873272, + "learning_rate": 1.3228346456692915e-05, + "loss": 1.5821, + "step": 168 + }, + { + "epoch": 0.06661739344666175, + "grad_norm": 1.4726739949688163, + "learning_rate": 1.3307086614173231e-05, + "loss": 1.6008, + "step": 169 + }, + { + "epoch": 0.06701157920670116, + "grad_norm": 1.933336638607143, + "learning_rate": 1.3385826771653545e-05, + "loss": 1.6237, + "step": 170 + }, + { + "epoch": 0.06740576496674058, + "grad_norm": 1.53709942550425, + "learning_rate": 1.346456692913386e-05, + "loss": 1.6603, + "step": 171 + }, + { + "epoch": 0.06779995072678, + "grad_norm": 5.838182266578105, + "learning_rate": 1.3543307086614174e-05, + "loss": 1.7374, + "step": 172 + }, + { + "epoch": 0.06819413648681942, + "grad_norm": 2.1077670495936105, + "learning_rate": 1.362204724409449e-05, + "loss": 1.6751, + "step": 173 + }, + { + "epoch": 0.06858832224685883, + "grad_norm": 1.79478201657228, + "learning_rate": 1.3700787401574804e-05, + "loss": 1.6147, + "step": 174 + }, + { + "epoch": 0.06898250800689826, + "grad_norm": 1.3332167033318783, + "learning_rate": 1.377952755905512e-05, + "loss": 1.6174, + "step": 175 + }, + { + "epoch": 0.06937669376693767, + "grad_norm": 1.3613261661051188, + "learning_rate": 1.3858267716535433e-05, + "loss": 1.6226, + "step": 176 + }, + { + "epoch": 0.0697708795269771, + "grad_norm": 1.4747645759596355, + "learning_rate": 1.3937007874015749e-05, + "loss": 1.6831, + "step": 177 + }, + { + "epoch": 0.0701650652870165, + "grad_norm": 1.2750429533681837, + "learning_rate": 1.4015748031496063e-05, + "loss": 1.7002, + "step": 178 + }, + { + "epoch": 0.07055925104705593, + "grad_norm": 1.5316341355433367, + "learning_rate": 1.4094488188976379e-05, + "loss": 1.6778, + "step": 179 + }, + { + "epoch": 0.07095343680709534, + "grad_norm": 1.5302517303234198, + "learning_rate": 1.4173228346456694e-05, + "loss": 1.661, + "step": 180 + }, + { + "epoch": 0.07134762256713477, + "grad_norm": 1.4890855169186785, + "learning_rate": 1.4251968503937008e-05, + "loss": 1.6873, + "step": 181 + }, + { + "epoch": 0.07174180832717418, + "grad_norm": 1.4685898866854017, + "learning_rate": 1.4330708661417324e-05, + "loss": 1.6183, + "step": 182 + }, + { + "epoch": 0.0721359940872136, + "grad_norm": 1.1931151423557926, + "learning_rate": 1.440944881889764e-05, + "loss": 1.6106, + "step": 183 + }, + { + "epoch": 0.07253017984725302, + "grad_norm": 1.2548801700230896, + "learning_rate": 1.4488188976377955e-05, + "loss": 1.6201, + "step": 184 + }, + { + "epoch": 0.07292436560729244, + "grad_norm": 1.316626084569457, + "learning_rate": 1.456692913385827e-05, + "loss": 1.6652, + "step": 185 + }, + { + "epoch": 0.07331855136733186, + "grad_norm": 5.515174587786105, + "learning_rate": 1.4645669291338584e-05, + "loss": 1.6672, + "step": 186 + }, + { + "epoch": 0.07371273712737128, + "grad_norm": 1.2435134387010485, + "learning_rate": 1.47244094488189e-05, + "loss": 1.5948, + "step": 187 + }, + { + "epoch": 0.07410692288741069, + "grad_norm": 1.27329799921956, + "learning_rate": 1.4803149606299214e-05, + "loss": 1.6548, + "step": 188 + }, + { + "epoch": 0.07450110864745012, + "grad_norm": 1.2399973778980402, + "learning_rate": 1.488188976377953e-05, + "loss": 1.604, + "step": 189 + }, + { + "epoch": 0.07489529440748953, + "grad_norm": 2.394011363721175, + "learning_rate": 1.4960629921259843e-05, + "loss": 1.6027, + "step": 190 + }, + { + "epoch": 0.07528948016752895, + "grad_norm": 1.3778750181373447, + "learning_rate": 1.5039370078740159e-05, + "loss": 1.6389, + "step": 191 + }, + { + "epoch": 0.07568366592756837, + "grad_norm": 1.5441433369147584, + "learning_rate": 1.5118110236220473e-05, + "loss": 1.6183, + "step": 192 + }, + { + "epoch": 0.07607785168760779, + "grad_norm": 4.415312664776792, + "learning_rate": 1.5196850393700789e-05, + "loss": 1.5881, + "step": 193 + }, + { + "epoch": 0.0764720374476472, + "grad_norm": 1.6220189908817373, + "learning_rate": 1.5275590551181102e-05, + "loss": 1.689, + "step": 194 + }, + { + "epoch": 0.07686622320768663, + "grad_norm": 1.2264711147522527, + "learning_rate": 1.5354330708661416e-05, + "loss": 1.5776, + "step": 195 + }, + { + "epoch": 0.07726040896772604, + "grad_norm": 1.2490481285394455, + "learning_rate": 1.5433070866141734e-05, + "loss": 1.6122, + "step": 196 + }, + { + "epoch": 0.07765459472776547, + "grad_norm": 1.2303899509527259, + "learning_rate": 1.5511811023622048e-05, + "loss": 1.5495, + "step": 197 + }, + { + "epoch": 0.07804878048780488, + "grad_norm": 3.4482635126365997, + "learning_rate": 1.559055118110236e-05, + "loss": 1.6351, + "step": 198 + }, + { + "epoch": 0.0784429662478443, + "grad_norm": 1.4430016707011335, + "learning_rate": 1.566929133858268e-05, + "loss": 1.5224, + "step": 199 + }, + { + "epoch": 0.07883715200788372, + "grad_norm": 1.258723675384828, + "learning_rate": 1.5748031496062993e-05, + "loss": 1.5626, + "step": 200 + }, + { + "epoch": 0.07923133776792314, + "grad_norm": 1.5678661529755662, + "learning_rate": 1.582677165354331e-05, + "loss": 1.5783, + "step": 201 + }, + { + "epoch": 0.07962552352796255, + "grad_norm": 2.1867650050329535, + "learning_rate": 1.5905511811023624e-05, + "loss": 1.5969, + "step": 202 + }, + { + "epoch": 0.08001970928800198, + "grad_norm": 1.2889311434591015, + "learning_rate": 1.5984251968503938e-05, + "loss": 1.564, + "step": 203 + }, + { + "epoch": 0.08041389504804139, + "grad_norm": 1.1654066224514485, + "learning_rate": 1.6062992125984255e-05, + "loss": 1.5517, + "step": 204 + }, + { + "epoch": 0.08080808080808081, + "grad_norm": 1.2834026840027142, + "learning_rate": 1.614173228346457e-05, + "loss": 1.5784, + "step": 205 + }, + { + "epoch": 0.08120226656812023, + "grad_norm": 1.097147109752616, + "learning_rate": 1.6220472440944883e-05, + "loss": 1.593, + "step": 206 + }, + { + "epoch": 0.08159645232815965, + "grad_norm": 1.0826077251947002, + "learning_rate": 1.6299212598425197e-05, + "loss": 1.6672, + "step": 207 + }, + { + "epoch": 0.08199063808819906, + "grad_norm": 1.1105586301185173, + "learning_rate": 1.6377952755905514e-05, + "loss": 1.6279, + "step": 208 + }, + { + "epoch": 0.08238482384823849, + "grad_norm": 1.0509746948712066, + "learning_rate": 1.6456692913385828e-05, + "loss": 1.5676, + "step": 209 + }, + { + "epoch": 0.0827790096082779, + "grad_norm": 1.0983909936032894, + "learning_rate": 1.6535433070866142e-05, + "loss": 1.5829, + "step": 210 + }, + { + "epoch": 0.08317319536831733, + "grad_norm": 5.99007589257119, + "learning_rate": 1.6614173228346456e-05, + "loss": 1.7761, + "step": 211 + }, + { + "epoch": 0.08356738112835674, + "grad_norm": 1.2452212459257412, + "learning_rate": 1.6692913385826773e-05, + "loss": 1.6174, + "step": 212 + }, + { + "epoch": 0.08396156688839616, + "grad_norm": 1.2716752881032753, + "learning_rate": 1.6771653543307087e-05, + "loss": 1.5855, + "step": 213 + }, + { + "epoch": 0.08435575264843558, + "grad_norm": 1.1250735671327408, + "learning_rate": 1.68503937007874e-05, + "loss": 1.6358, + "step": 214 + }, + { + "epoch": 0.084749938408475, + "grad_norm": 1.2260081131211942, + "learning_rate": 1.692913385826772e-05, + "loss": 1.5142, + "step": 215 + }, + { + "epoch": 0.08514412416851441, + "grad_norm": 1.1674035474423037, + "learning_rate": 1.7007874015748032e-05, + "loss": 1.57, + "step": 216 + }, + { + "epoch": 0.08553830992855384, + "grad_norm": 1.2049471298049268, + "learning_rate": 1.708661417322835e-05, + "loss": 1.535, + "step": 217 + }, + { + "epoch": 0.08593249568859325, + "grad_norm": 1.0593135540735228, + "learning_rate": 1.7165354330708663e-05, + "loss": 1.5262, + "step": 218 + }, + { + "epoch": 0.08632668144863268, + "grad_norm": 1.2230277479432223, + "learning_rate": 1.7244094488188977e-05, + "loss": 1.4963, + "step": 219 + }, + { + "epoch": 0.08672086720867209, + "grad_norm": 1.0841400801567742, + "learning_rate": 1.7322834645669295e-05, + "loss": 1.464, + "step": 220 + }, + { + "epoch": 0.08711505296871151, + "grad_norm": 1.0657721135903946, + "learning_rate": 1.740157480314961e-05, + "loss": 1.5183, + "step": 221 + }, + { + "epoch": 0.08750923872875092, + "grad_norm": 1.0176332279317757, + "learning_rate": 1.7480314960629923e-05, + "loss": 1.5272, + "step": 222 + }, + { + "epoch": 0.08790342448879035, + "grad_norm": 1.0202676847155607, + "learning_rate": 1.7559055118110236e-05, + "loss": 1.5327, + "step": 223 + }, + { + "epoch": 0.08829761024882976, + "grad_norm": 6.425041690617794, + "learning_rate": 1.7637795275590554e-05, + "loss": 1.5531, + "step": 224 + }, + { + "epoch": 0.08869179600886919, + "grad_norm": 1.1786231403068714, + "learning_rate": 1.7716535433070868e-05, + "loss": 1.5453, + "step": 225 + }, + { + "epoch": 0.0890859817689086, + "grad_norm": 1.2325207985267532, + "learning_rate": 1.779527559055118e-05, + "loss": 1.6243, + "step": 226 + }, + { + "epoch": 0.08948016752894802, + "grad_norm": 2.8120821758652292, + "learning_rate": 1.7874015748031495e-05, + "loss": 1.5169, + "step": 227 + }, + { + "epoch": 0.08987435328898744, + "grad_norm": 1.1463382537995392, + "learning_rate": 1.7952755905511813e-05, + "loss": 1.5332, + "step": 228 + }, + { + "epoch": 0.09026853904902686, + "grad_norm": 1.0849881708965645, + "learning_rate": 1.8031496062992127e-05, + "loss": 1.5723, + "step": 229 + }, + { + "epoch": 0.09066272480906627, + "grad_norm": 1.1666290000579271, + "learning_rate": 1.811023622047244e-05, + "loss": 1.5618, + "step": 230 + }, + { + "epoch": 0.0910569105691057, + "grad_norm": 1.2015436620694524, + "learning_rate": 1.8188976377952758e-05, + "loss": 1.4479, + "step": 231 + }, + { + "epoch": 0.09145109632914511, + "grad_norm": 1.1770257502445032, + "learning_rate": 1.8267716535433072e-05, + "loss": 1.4907, + "step": 232 + }, + { + "epoch": 0.09184528208918454, + "grad_norm": 1.1626480865358226, + "learning_rate": 1.834645669291339e-05, + "loss": 1.5504, + "step": 233 + }, + { + "epoch": 0.09223946784922395, + "grad_norm": 1.06078382485064, + "learning_rate": 1.8425196850393703e-05, + "loss": 1.4953, + "step": 234 + }, + { + "epoch": 0.09263365360926337, + "grad_norm": 1.0930777847490591, + "learning_rate": 1.8503937007874017e-05, + "loss": 1.5751, + "step": 235 + }, + { + "epoch": 0.09302783936930278, + "grad_norm": 1.0032128686122703, + "learning_rate": 1.858267716535433e-05, + "loss": 1.5573, + "step": 236 + }, + { + "epoch": 0.09342202512934221, + "grad_norm": 1.316223586320374, + "learning_rate": 1.8661417322834648e-05, + "loss": 1.5121, + "step": 237 + }, + { + "epoch": 0.09381621088938162, + "grad_norm": 1.2482520651605957, + "learning_rate": 1.8740157480314962e-05, + "loss": 1.5444, + "step": 238 + }, + { + "epoch": 0.09421039664942105, + "grad_norm": 1.0596918045491734, + "learning_rate": 1.8818897637795276e-05, + "loss": 1.5212, + "step": 239 + }, + { + "epoch": 0.09460458240946046, + "grad_norm": 10.230035305602996, + "learning_rate": 1.8897637795275593e-05, + "loss": 1.5136, + "step": 240 + }, + { + "epoch": 0.09499876816949988, + "grad_norm": 1.7311033327684602, + "learning_rate": 1.8976377952755907e-05, + "loss": 1.5087, + "step": 241 + }, + { + "epoch": 0.0953929539295393, + "grad_norm": 1.3327399439783965, + "learning_rate": 1.905511811023622e-05, + "loss": 1.5182, + "step": 242 + }, + { + "epoch": 0.09578713968957872, + "grad_norm": 1.0615025753084397, + "learning_rate": 1.9133858267716535e-05, + "loss": 1.5321, + "step": 243 + }, + { + "epoch": 0.09618132544961813, + "grad_norm": 1.174065978180721, + "learning_rate": 1.9212598425196852e-05, + "loss": 1.4981, + "step": 244 + }, + { + "epoch": 0.09657551120965756, + "grad_norm": 1.0837767684996553, + "learning_rate": 1.9291338582677166e-05, + "loss": 1.4733, + "step": 245 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 1.0744329648400928, + "learning_rate": 1.937007874015748e-05, + "loss": 1.5172, + "step": 246 + }, + { + "epoch": 0.0973638827297364, + "grad_norm": 1.0479477955815488, + "learning_rate": 1.9448818897637797e-05, + "loss": 1.4767, + "step": 247 + }, + { + "epoch": 0.0977580684897758, + "grad_norm": 0.9622167177031952, + "learning_rate": 1.952755905511811e-05, + "loss": 1.5212, + "step": 248 + }, + { + "epoch": 0.09815225424981523, + "grad_norm": 2.2109867243739867, + "learning_rate": 1.960629921259843e-05, + "loss": 1.534, + "step": 249 + }, + { + "epoch": 0.09854644000985464, + "grad_norm": 1.0330950105773389, + "learning_rate": 1.9685039370078743e-05, + "loss": 1.4988, + "step": 250 + }, + { + "epoch": 0.09894062576989407, + "grad_norm": 1.2543876260436326, + "learning_rate": 1.9763779527559057e-05, + "loss": 1.5515, + "step": 251 + }, + { + "epoch": 0.09933481152993348, + "grad_norm": 1.0907032902576081, + "learning_rate": 1.984251968503937e-05, + "loss": 1.4944, + "step": 252 + }, + { + "epoch": 0.0997289972899729, + "grad_norm": 0.9800946085411166, + "learning_rate": 1.9921259842519688e-05, + "loss": 1.4594, + "step": 253 + }, + { + "epoch": 0.10012318305001232, + "grad_norm": 1.005840927677052, + "learning_rate": 2e-05, + "loss": 1.5125, + "step": 254 + }, + { + "epoch": 0.10051736881005174, + "grad_norm": 0.9877177677204181, + "learning_rate": 1.9999990523708736e-05, + "loss": 1.4953, + "step": 255 + }, + { + "epoch": 0.10091155457009116, + "grad_norm": 1.101690731617668, + "learning_rate": 1.999996209485289e-05, + "loss": 1.5291, + "step": 256 + }, + { + "epoch": 0.10130574033013058, + "grad_norm": 1.056828743252167, + "learning_rate": 1.9999914713486344e-05, + "loss": 1.546, + "step": 257 + }, + { + "epoch": 0.10169992609016999, + "grad_norm": 1.0379730842348571, + "learning_rate": 1.9999848379698906e-05, + "loss": 1.5252, + "step": 258 + }, + { + "epoch": 0.10209411185020942, + "grad_norm": 0.9403586150467369, + "learning_rate": 1.999976309361629e-05, + "loss": 1.4487, + "step": 259 + }, + { + "epoch": 0.10248829761024883, + "grad_norm": 0.9899974982933676, + "learning_rate": 1.9999658855400135e-05, + "loss": 1.4721, + "step": 260 + }, + { + "epoch": 0.10288248337028826, + "grad_norm": 1.8364244542987356, + "learning_rate": 1.9999535665248e-05, + "loss": 1.5609, + "step": 261 + }, + { + "epoch": 0.10327666913032767, + "grad_norm": 1.0844452490408925, + "learning_rate": 1.9999393523393365e-05, + "loss": 1.4418, + "step": 262 + }, + { + "epoch": 0.10367085489036709, + "grad_norm": 0.9972732800206876, + "learning_rate": 1.9999232430105618e-05, + "loss": 1.4595, + "step": 263 + }, + { + "epoch": 0.1040650406504065, + "grad_norm": 1.0507646311810663, + "learning_rate": 1.999905238569008e-05, + "loss": 1.5172, + "step": 264 + }, + { + "epoch": 0.10445922641044593, + "grad_norm": 1.095556225355519, + "learning_rate": 1.999885339048798e-05, + "loss": 1.4543, + "step": 265 + }, + { + "epoch": 0.10485341217048534, + "grad_norm": 1.5429221372847546, + "learning_rate": 1.999863544487646e-05, + "loss": 1.4856, + "step": 266 + }, + { + "epoch": 0.10524759793052477, + "grad_norm": 1.2099357188247561, + "learning_rate": 1.9998398549268594e-05, + "loss": 1.5493, + "step": 267 + }, + { + "epoch": 0.10564178369056418, + "grad_norm": 0.935834153327994, + "learning_rate": 1.999814270411335e-05, + "loss": 1.4679, + "step": 268 + }, + { + "epoch": 0.1060359694506036, + "grad_norm": 0.9438202964074678, + "learning_rate": 1.9997867909895626e-05, + "loss": 1.4995, + "step": 269 + }, + { + "epoch": 0.10643015521064302, + "grad_norm": 1.033515015322255, + "learning_rate": 1.9997574167136225e-05, + "loss": 1.5551, + "step": 270 + }, + { + "epoch": 0.10682434097068244, + "grad_norm": 0.9370254571893236, + "learning_rate": 1.9997261476391867e-05, + "loss": 1.4224, + "step": 271 + }, + { + "epoch": 0.10721852673072185, + "grad_norm": 0.8669854368917412, + "learning_rate": 1.999692983825518e-05, + "loss": 1.4123, + "step": 272 + }, + { + "epoch": 0.10761271249076128, + "grad_norm": 0.944767717267722, + "learning_rate": 1.999657925335471e-05, + "loss": 1.4617, + "step": 273 + }, + { + "epoch": 0.10800689825080069, + "grad_norm": 0.8918613394976922, + "learning_rate": 1.9996209722354896e-05, + "loss": 1.4717, + "step": 274 + }, + { + "epoch": 0.10840108401084012, + "grad_norm": 0.8601703235721511, + "learning_rate": 1.99958212459561e-05, + "loss": 1.4932, + "step": 275 + }, + { + "epoch": 0.10879526977087953, + "grad_norm": 0.8947009718973543, + "learning_rate": 1.9995413824894593e-05, + "loss": 1.4279, + "step": 276 + }, + { + "epoch": 0.10918945553091895, + "grad_norm": 0.9310105648146282, + "learning_rate": 1.9994987459942528e-05, + "loss": 1.4802, + "step": 277 + }, + { + "epoch": 0.10958364129095836, + "grad_norm": 0.8501846281501174, + "learning_rate": 1.9994542151907988e-05, + "loss": 1.4749, + "step": 278 + }, + { + "epoch": 0.10997782705099779, + "grad_norm": 1.0075642218200616, + "learning_rate": 1.999407790163494e-05, + "loss": 1.4024, + "step": 279 + }, + { + "epoch": 0.1103720128110372, + "grad_norm": 0.8724020295218536, + "learning_rate": 1.9993594710003262e-05, + "loss": 1.4781, + "step": 280 + }, + { + "epoch": 0.11076619857107663, + "grad_norm": 0.9028708477460494, + "learning_rate": 1.9993092577928725e-05, + "loss": 1.4662, + "step": 281 + }, + { + "epoch": 0.11116038433111604, + "grad_norm": 0.9000611147078907, + "learning_rate": 1.9992571506362997e-05, + "loss": 1.5075, + "step": 282 + }, + { + "epoch": 0.11155457009115546, + "grad_norm": 0.8987129723251234, + "learning_rate": 1.9992031496293652e-05, + "loss": 1.4287, + "step": 283 + }, + { + "epoch": 0.11194875585119488, + "grad_norm": 0.9407581537583124, + "learning_rate": 1.999147254874414e-05, + "loss": 1.4692, + "step": 284 + }, + { + "epoch": 0.1123429416112343, + "grad_norm": 0.8489305931721897, + "learning_rate": 1.999089466477381e-05, + "loss": 1.4033, + "step": 285 + }, + { + "epoch": 0.11273712737127371, + "grad_norm": 0.9701130113270408, + "learning_rate": 1.999029784547791e-05, + "loss": 1.4633, + "step": 286 + }, + { + "epoch": 0.11313131313131314, + "grad_norm": 0.9645372818337129, + "learning_rate": 1.9989682091987558e-05, + "loss": 1.4762, + "step": 287 + }, + { + "epoch": 0.11352549889135255, + "grad_norm": 0.8958997231087552, + "learning_rate": 1.9989047405469772e-05, + "loss": 1.4915, + "step": 288 + }, + { + "epoch": 0.11391968465139198, + "grad_norm": 0.8671815258371959, + "learning_rate": 1.9988393787127444e-05, + "loss": 1.4463, + "step": 289 + }, + { + "epoch": 0.11431387041143139, + "grad_norm": 0.8618517053204878, + "learning_rate": 1.9987721238199345e-05, + "loss": 1.4234, + "step": 290 + }, + { + "epoch": 0.11470805617147081, + "grad_norm": 0.8902785836218885, + "learning_rate": 1.9987029759960142e-05, + "loss": 1.4214, + "step": 291 + }, + { + "epoch": 0.11510224193151022, + "grad_norm": 0.8858117437885646, + "learning_rate": 1.9986319353720353e-05, + "loss": 1.3894, + "step": 292 + }, + { + "epoch": 0.11549642769154965, + "grad_norm": 0.8611263833038788, + "learning_rate": 1.9985590020826382e-05, + "loss": 1.4862, + "step": 293 + }, + { + "epoch": 0.11589061345158906, + "grad_norm": 0.8533778158931522, + "learning_rate": 1.9984841762660508e-05, + "loss": 1.4738, + "step": 294 + }, + { + "epoch": 0.11628479921162849, + "grad_norm": 0.9054080637678216, + "learning_rate": 1.998407458064087e-05, + "loss": 1.4873, + "step": 295 + }, + { + "epoch": 0.1166789849716679, + "grad_norm": 0.8562878911122067, + "learning_rate": 1.9983288476221482e-05, + "loss": 1.4897, + "step": 296 + }, + { + "epoch": 0.11707317073170732, + "grad_norm": 0.8857579006622172, + "learning_rate": 1.9982483450892206e-05, + "loss": 1.4916, + "step": 297 + }, + { + "epoch": 0.11746735649174674, + "grad_norm": 0.8253228858932441, + "learning_rate": 1.9981659506178778e-05, + "loss": 1.3489, + "step": 298 + }, + { + "epoch": 0.11786154225178616, + "grad_norm": 0.9323194384008091, + "learning_rate": 1.9980816643642787e-05, + "loss": 1.5008, + "step": 299 + }, + { + "epoch": 0.11825572801182557, + "grad_norm": 1.0570822985529353, + "learning_rate": 1.9979954864881672e-05, + "loss": 1.4554, + "step": 300 + }, + { + "epoch": 0.118649913771865, + "grad_norm": 0.9247735264199164, + "learning_rate": 1.997907417152873e-05, + "loss": 1.4352, + "step": 301 + }, + { + "epoch": 0.11904409953190441, + "grad_norm": 0.9467585491612563, + "learning_rate": 1.9978174565253096e-05, + "loss": 1.4937, + "step": 302 + }, + { + "epoch": 0.11943828529194384, + "grad_norm": 0.9054242752625036, + "learning_rate": 1.9977256047759765e-05, + "loss": 1.4672, + "step": 303 + }, + { + "epoch": 0.11983247105198325, + "grad_norm": 0.8664782098266539, + "learning_rate": 1.997631862078956e-05, + "loss": 1.4183, + "step": 304 + }, + { + "epoch": 0.12022665681202267, + "grad_norm": 0.8736218550959834, + "learning_rate": 1.9975362286119145e-05, + "loss": 1.4379, + "step": 305 + }, + { + "epoch": 0.12062084257206208, + "grad_norm": 0.899159416016424, + "learning_rate": 1.9974387045561022e-05, + "loss": 1.4688, + "step": 306 + }, + { + "epoch": 0.12101502833210151, + "grad_norm": 0.9132102225776563, + "learning_rate": 1.997339290096353e-05, + "loss": 1.4195, + "step": 307 + }, + { + "epoch": 0.12140921409214092, + "grad_norm": 0.9022509743935889, + "learning_rate": 1.9972379854210824e-05, + "loss": 1.5341, + "step": 308 + }, + { + "epoch": 0.12180339985218035, + "grad_norm": 0.8909667554707213, + "learning_rate": 1.997134790722289e-05, + "loss": 1.3896, + "step": 309 + }, + { + "epoch": 0.12219758561221976, + "grad_norm": 0.810957265048853, + "learning_rate": 1.9970297061955533e-05, + "loss": 1.3607, + "step": 310 + }, + { + "epoch": 0.12259177137225918, + "grad_norm": 0.8624805968721132, + "learning_rate": 1.996922732040038e-05, + "loss": 1.433, + "step": 311 + }, + { + "epoch": 0.1229859571322986, + "grad_norm": 0.9012262047132807, + "learning_rate": 1.9968138684584862e-05, + "loss": 1.4337, + "step": 312 + }, + { + "epoch": 0.12338014289233802, + "grad_norm": 0.8600494551649118, + "learning_rate": 1.9967031156572233e-05, + "loss": 1.3947, + "step": 313 + }, + { + "epoch": 0.12377432865237743, + "grad_norm": 0.8744528870589704, + "learning_rate": 1.9965904738461534e-05, + "loss": 1.4945, + "step": 314 + }, + { + "epoch": 0.12416851441241686, + "grad_norm": 0.8875872891561535, + "learning_rate": 1.9964759432387626e-05, + "loss": 1.4542, + "step": 315 + }, + { + "epoch": 0.12456270017245627, + "grad_norm": 0.8538438066807553, + "learning_rate": 1.9963595240521158e-05, + "loss": 1.4219, + "step": 316 + }, + { + "epoch": 0.1249568859324957, + "grad_norm": 0.8583935860681176, + "learning_rate": 1.9962412165068575e-05, + "loss": 1.3834, + "step": 317 + }, + { + "epoch": 0.12535107169253512, + "grad_norm": 0.9046850234763439, + "learning_rate": 1.996121020827211e-05, + "loss": 1.4378, + "step": 318 + }, + { + "epoch": 0.12574525745257453, + "grad_norm": 0.8757680720234807, + "learning_rate": 1.9959989372409777e-05, + "loss": 1.4239, + "step": 319 + }, + { + "epoch": 0.12613944321261394, + "grad_norm": 1.1494791062386092, + "learning_rate": 1.9958749659795382e-05, + "loss": 1.407, + "step": 320 + }, + { + "epoch": 0.12653362897265336, + "grad_norm": 0.8689927196254672, + "learning_rate": 1.99574910727785e-05, + "loss": 1.3873, + "step": 321 + }, + { + "epoch": 0.1269278147326928, + "grad_norm": 0.8754813889657387, + "learning_rate": 1.995621361374447e-05, + "loss": 1.522, + "step": 322 + }, + { + "epoch": 0.1273220004927322, + "grad_norm": 0.8486986093611717, + "learning_rate": 1.9954917285114418e-05, + "loss": 1.3494, + "step": 323 + }, + { + "epoch": 0.12771618625277162, + "grad_norm": 0.9722206329399001, + "learning_rate": 1.9953602089345215e-05, + "loss": 1.4088, + "step": 324 + }, + { + "epoch": 0.12811037201281103, + "grad_norm": 0.8967214452714534, + "learning_rate": 1.9952268028929497e-05, + "loss": 1.4024, + "step": 325 + }, + { + "epoch": 0.12850455777285047, + "grad_norm": 0.964703154180979, + "learning_rate": 1.995091510639566e-05, + "loss": 1.4126, + "step": 326 + }, + { + "epoch": 0.12889874353288988, + "grad_norm": 0.9392746691898846, + "learning_rate": 1.9949543324307828e-05, + "loss": 1.405, + "step": 327 + }, + { + "epoch": 0.1292929292929293, + "grad_norm": 0.7628618547760365, + "learning_rate": 1.9948152685265896e-05, + "loss": 1.3899, + "step": 328 + }, + { + "epoch": 0.1296871150529687, + "grad_norm": 0.8699311844515389, + "learning_rate": 1.9946743191905473e-05, + "loss": 1.3766, + "step": 329 + }, + { + "epoch": 0.13008130081300814, + "grad_norm": 0.935450510994964, + "learning_rate": 1.9945314846897922e-05, + "loss": 1.3913, + "step": 330 + }, + { + "epoch": 0.13047548657304756, + "grad_norm": 0.8529532741122805, + "learning_rate": 1.9943867652950323e-05, + "loss": 1.3947, + "step": 331 + }, + { + "epoch": 0.13086967233308697, + "grad_norm": 0.9341157491415716, + "learning_rate": 1.9942401612805478e-05, + "loss": 1.4517, + "step": 332 + }, + { + "epoch": 0.13126385809312638, + "grad_norm": 0.8302844629086936, + "learning_rate": 1.9940916729241918e-05, + "loss": 1.3977, + "step": 333 + }, + { + "epoch": 0.13165804385316582, + "grad_norm": 0.8260253123890825, + "learning_rate": 1.9939413005073873e-05, + "loss": 1.4048, + "step": 334 + }, + { + "epoch": 0.13205222961320523, + "grad_norm": 0.8509245010253166, + "learning_rate": 1.9937890443151294e-05, + "loss": 1.3836, + "step": 335 + }, + { + "epoch": 0.13244641537324464, + "grad_norm": 0.9759926385519552, + "learning_rate": 1.9936349046359833e-05, + "loss": 1.4606, + "step": 336 + }, + { + "epoch": 0.13284060113328405, + "grad_norm": 0.8472765912232332, + "learning_rate": 1.9934788817620827e-05, + "loss": 1.3585, + "step": 337 + }, + { + "epoch": 0.1332347868933235, + "grad_norm": 0.8448284766692432, + "learning_rate": 1.9933209759891318e-05, + "loss": 1.3559, + "step": 338 + }, + { + "epoch": 0.1336289726533629, + "grad_norm": 0.8980105866822069, + "learning_rate": 1.9931611876164024e-05, + "loss": 1.3884, + "step": 339 + }, + { + "epoch": 0.13402315841340232, + "grad_norm": 0.8035875577985496, + "learning_rate": 1.9929995169467346e-05, + "loss": 1.4183, + "step": 340 + }, + { + "epoch": 0.13441734417344173, + "grad_norm": 0.8436688045262849, + "learning_rate": 1.992835964286537e-05, + "loss": 1.3847, + "step": 341 + }, + { + "epoch": 0.13481152993348117, + "grad_norm": 0.9086794949433027, + "learning_rate": 1.992670529945783e-05, + "loss": 1.454, + "step": 342 + }, + { + "epoch": 0.13520571569352058, + "grad_norm": 0.8037193631752932, + "learning_rate": 1.9925032142380144e-05, + "loss": 1.4566, + "step": 343 + }, + { + "epoch": 0.13559990145356, + "grad_norm": 0.9238628826502602, + "learning_rate": 1.992334017480337e-05, + "loss": 1.4551, + "step": 344 + }, + { + "epoch": 0.1359940872135994, + "grad_norm": 0.8954578526881097, + "learning_rate": 1.9921629399934224e-05, + "loss": 1.3993, + "step": 345 + }, + { + "epoch": 0.13638827297363884, + "grad_norm": 0.8298423164388818, + "learning_rate": 1.9919899821015066e-05, + "loss": 1.4251, + "step": 346 + }, + { + "epoch": 0.13678245873367825, + "grad_norm": 0.9558363388772838, + "learning_rate": 1.99181514413239e-05, + "loss": 1.4025, + "step": 347 + }, + { + "epoch": 0.13717664449371766, + "grad_norm": 0.8459196123850001, + "learning_rate": 1.9916384264174354e-05, + "loss": 1.3976, + "step": 348 + }, + { + "epoch": 0.13757083025375708, + "grad_norm": 0.9082414240992348, + "learning_rate": 1.9914598292915684e-05, + "loss": 1.4128, + "step": 349 + }, + { + "epoch": 0.13796501601379652, + "grad_norm": 0.8807624601189884, + "learning_rate": 1.9912793530932765e-05, + "loss": 1.4642, + "step": 350 + }, + { + "epoch": 0.13835920177383593, + "grad_norm": 0.8479509653794212, + "learning_rate": 1.991096998164609e-05, + "loss": 1.4292, + "step": 351 + }, + { + "epoch": 0.13875338753387534, + "grad_norm": 0.8571495642628604, + "learning_rate": 1.9909127648511758e-05, + "loss": 1.4185, + "step": 352 + }, + { + "epoch": 0.13914757329391475, + "grad_norm": 0.8394513200646011, + "learning_rate": 1.9907266535021465e-05, + "loss": 1.3907, + "step": 353 + }, + { + "epoch": 0.1395417590539542, + "grad_norm": 0.8719559245356892, + "learning_rate": 1.9905386644702495e-05, + "loss": 1.4522, + "step": 354 + }, + { + "epoch": 0.1399359448139936, + "grad_norm": 0.8304933398455792, + "learning_rate": 1.9903487981117732e-05, + "loss": 1.37, + "step": 355 + }, + { + "epoch": 0.140330130574033, + "grad_norm": 1.0554645194699375, + "learning_rate": 1.990157054786563e-05, + "loss": 1.3502, + "step": 356 + }, + { + "epoch": 0.14072431633407242, + "grad_norm": 0.7811763156565412, + "learning_rate": 1.9899634348580226e-05, + "loss": 1.3615, + "step": 357 + }, + { + "epoch": 0.14111850209411186, + "grad_norm": 0.941990212474433, + "learning_rate": 1.9897679386931115e-05, + "loss": 1.3639, + "step": 358 + }, + { + "epoch": 0.14151268785415128, + "grad_norm": 0.814954847959052, + "learning_rate": 1.989570566662345e-05, + "loss": 1.3888, + "step": 359 + }, + { + "epoch": 0.1419068736141907, + "grad_norm": 0.8608043228373365, + "learning_rate": 1.9893713191397944e-05, + "loss": 1.3935, + "step": 360 + }, + { + "epoch": 0.1423010593742301, + "grad_norm": 0.890892455025287, + "learning_rate": 1.9891701965030855e-05, + "loss": 1.4008, + "step": 361 + }, + { + "epoch": 0.14269524513426954, + "grad_norm": 0.8356857849278824, + "learning_rate": 1.9889671991333976e-05, + "loss": 1.4298, + "step": 362 + }, + { + "epoch": 0.14308943089430895, + "grad_norm": 0.9106567824779971, + "learning_rate": 1.9887623274154623e-05, + "loss": 1.3618, + "step": 363 + }, + { + "epoch": 0.14348361665434836, + "grad_norm": 0.9437928820477995, + "learning_rate": 1.9885555817375656e-05, + "loss": 1.4348, + "step": 364 + }, + { + "epoch": 0.14387780241438777, + "grad_norm": 0.8738867727854848, + "learning_rate": 1.988346962491543e-05, + "loss": 1.4119, + "step": 365 + }, + { + "epoch": 0.1442719881744272, + "grad_norm": 0.8544123455118898, + "learning_rate": 1.9881364700727827e-05, + "loss": 1.3921, + "step": 366 + }, + { + "epoch": 0.14466617393446662, + "grad_norm": 0.8937019344654401, + "learning_rate": 1.9879241048802213e-05, + "loss": 1.3936, + "step": 367 + }, + { + "epoch": 0.14506035969450604, + "grad_norm": 0.8284420958345725, + "learning_rate": 1.987709867316346e-05, + "loss": 1.4026, + "step": 368 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.989819294325302, + "learning_rate": 1.9874937577871928e-05, + "loss": 1.389, + "step": 369 + }, + { + "epoch": 0.1458487312145849, + "grad_norm": 0.7893349138684312, + "learning_rate": 1.9872757767023445e-05, + "loss": 1.3721, + "step": 370 + }, + { + "epoch": 0.1462429169746243, + "grad_norm": 0.7968967018164466, + "learning_rate": 1.9870559244749317e-05, + "loss": 1.4324, + "step": 371 + }, + { + "epoch": 0.1466371027346637, + "grad_norm": 0.8953034923734662, + "learning_rate": 1.9868342015216312e-05, + "loss": 1.466, + "step": 372 + }, + { + "epoch": 0.14703128849470312, + "grad_norm": 0.8501443759421378, + "learning_rate": 1.986610608262665e-05, + "loss": 1.3055, + "step": 373 + }, + { + "epoch": 0.14742547425474256, + "grad_norm": 0.8315201315122736, + "learning_rate": 1.9863851451218006e-05, + "loss": 1.3872, + "step": 374 + }, + { + "epoch": 0.14781966001478197, + "grad_norm": 0.8236250547602466, + "learning_rate": 1.9861578125263484e-05, + "loss": 1.3778, + "step": 375 + }, + { + "epoch": 0.14821384577482138, + "grad_norm": 0.8467290646865842, + "learning_rate": 1.9859286109071626e-05, + "loss": 1.3848, + "step": 376 + }, + { + "epoch": 0.1486080315348608, + "grad_norm": 0.8755206588442915, + "learning_rate": 1.98569754069864e-05, + "loss": 1.4124, + "step": 377 + }, + { + "epoch": 0.14900221729490024, + "grad_norm": 0.8238920848534587, + "learning_rate": 1.9854646023387173e-05, + "loss": 1.3724, + "step": 378 + }, + { + "epoch": 0.14939640305493965, + "grad_norm": 0.8349137252265575, + "learning_rate": 1.985229796268873e-05, + "loss": 1.3722, + "step": 379 + }, + { + "epoch": 0.14979058881497906, + "grad_norm": 0.8217741172908753, + "learning_rate": 1.9849931229341258e-05, + "loss": 1.4549, + "step": 380 + }, + { + "epoch": 0.15018477457501847, + "grad_norm": 0.9356658298644844, + "learning_rate": 1.9847545827830327e-05, + "loss": 1.3605, + "step": 381 + }, + { + "epoch": 0.1505789603350579, + "grad_norm": 0.8507506609004069, + "learning_rate": 1.9845141762676885e-05, + "loss": 1.3447, + "step": 382 + }, + { + "epoch": 0.15097314609509732, + "grad_norm": 0.8752380208196286, + "learning_rate": 1.984271903843726e-05, + "loss": 1.4148, + "step": 383 + }, + { + "epoch": 0.15136733185513673, + "grad_norm": 0.9244928793694986, + "learning_rate": 1.9840277659703138e-05, + "loss": 1.4949, + "step": 384 + }, + { + "epoch": 0.15176151761517614, + "grad_norm": 0.7660534270592588, + "learning_rate": 1.983781763110156e-05, + "loss": 1.345, + "step": 385 + }, + { + "epoch": 0.15215570337521558, + "grad_norm": 0.84775600235801, + "learning_rate": 1.983533895729492e-05, + "loss": 1.4457, + "step": 386 + }, + { + "epoch": 0.152549889135255, + "grad_norm": 0.823703175205359, + "learning_rate": 1.9832841642980948e-05, + "loss": 1.4155, + "step": 387 + }, + { + "epoch": 0.1529440748952944, + "grad_norm": 0.779646685693002, + "learning_rate": 1.983032569289269e-05, + "loss": 1.459, + "step": 388 + }, + { + "epoch": 0.15333826065533382, + "grad_norm": 0.8240076846457852, + "learning_rate": 1.9827791111798526e-05, + "loss": 1.3924, + "step": 389 + }, + { + "epoch": 0.15373244641537326, + "grad_norm": 0.8625913690976503, + "learning_rate": 1.9825237904502143e-05, + "loss": 1.3492, + "step": 390 + }, + { + "epoch": 0.15412663217541267, + "grad_norm": 0.8365353230811579, + "learning_rate": 1.9822666075842527e-05, + "loss": 1.4228, + "step": 391 + }, + { + "epoch": 0.15452081793545208, + "grad_norm": 0.8259908671120344, + "learning_rate": 1.9820075630693955e-05, + "loss": 1.4015, + "step": 392 + }, + { + "epoch": 0.1549150036954915, + "grad_norm": 0.8637531603835769, + "learning_rate": 1.9817466573965996e-05, + "loss": 1.4159, + "step": 393 + }, + { + "epoch": 0.15530918945553093, + "grad_norm": 0.7939363512701786, + "learning_rate": 1.981483891060348e-05, + "loss": 1.304, + "step": 394 + }, + { + "epoch": 0.15570337521557034, + "grad_norm": 0.8866031449788612, + "learning_rate": 1.981219264558651e-05, + "loss": 1.3626, + "step": 395 + }, + { + "epoch": 0.15609756097560976, + "grad_norm": 0.8228072983791562, + "learning_rate": 1.9809527783930444e-05, + "loss": 1.3833, + "step": 396 + }, + { + "epoch": 0.15649174673564917, + "grad_norm": 0.7978736951343444, + "learning_rate": 1.980684433068588e-05, + "loss": 1.3489, + "step": 397 + }, + { + "epoch": 0.1568859324956886, + "grad_norm": 0.8786273761217978, + "learning_rate": 1.9804142290938654e-05, + "loss": 1.3743, + "step": 398 + }, + { + "epoch": 0.15728011825572802, + "grad_norm": 0.86249011323067, + "learning_rate": 1.9801421669809833e-05, + "loss": 1.3764, + "step": 399 + }, + { + "epoch": 0.15767430401576743, + "grad_norm": 0.8732648413397713, + "learning_rate": 1.9798682472455694e-05, + "loss": 1.4046, + "step": 400 + }, + { + "epoch": 0.15806848977580684, + "grad_norm": 0.8151084661992906, + "learning_rate": 1.979592470406772e-05, + "loss": 1.368, + "step": 401 + }, + { + "epoch": 0.15846267553584628, + "grad_norm": 0.9192834088778115, + "learning_rate": 1.97931483698726e-05, + "loss": 1.4211, + "step": 402 + }, + { + "epoch": 0.1588568612958857, + "grad_norm": 0.8163024312946099, + "learning_rate": 1.9790353475132206e-05, + "loss": 1.3405, + "step": 403 + }, + { + "epoch": 0.1592510470559251, + "grad_norm": 0.8199261685516072, + "learning_rate": 1.9787540025143576e-05, + "loss": 1.4079, + "step": 404 + }, + { + "epoch": 0.15964523281596452, + "grad_norm": 0.8218955327149928, + "learning_rate": 1.9784708025238935e-05, + "loss": 1.3838, + "step": 405 + }, + { + "epoch": 0.16003941857600396, + "grad_norm": 0.8208820007455779, + "learning_rate": 1.9781857480785645e-05, + "loss": 1.3688, + "step": 406 + }, + { + "epoch": 0.16043360433604337, + "grad_norm": 0.8771326041021362, + "learning_rate": 1.977898839718623e-05, + "loss": 1.4101, + "step": 407 + }, + { + "epoch": 0.16082779009608278, + "grad_norm": 0.7558042393459081, + "learning_rate": 1.9776100779878344e-05, + "loss": 1.425, + "step": 408 + }, + { + "epoch": 0.1612219758561222, + "grad_norm": 0.8739591869924033, + "learning_rate": 1.9773194634334764e-05, + "loss": 1.379, + "step": 409 + }, + { + "epoch": 0.16161616161616163, + "grad_norm": 0.7847266820417704, + "learning_rate": 1.977026996606339e-05, + "loss": 1.3367, + "step": 410 + }, + { + "epoch": 0.16201034737620104, + "grad_norm": 0.8477635650808805, + "learning_rate": 1.9767326780607218e-05, + "loss": 1.3511, + "step": 411 + }, + { + "epoch": 0.16240453313624045, + "grad_norm": 0.8632845728066261, + "learning_rate": 1.976436508354435e-05, + "loss": 1.3313, + "step": 412 + }, + { + "epoch": 0.16279871889627986, + "grad_norm": 0.7873959773662924, + "learning_rate": 1.9761384880487967e-05, + "loss": 1.3409, + "step": 413 + }, + { + "epoch": 0.1631929046563193, + "grad_norm": 0.818419644861465, + "learning_rate": 1.9758386177086324e-05, + "loss": 1.4273, + "step": 414 + }, + { + "epoch": 0.16358709041635872, + "grad_norm": 0.8843790656491963, + "learning_rate": 1.9755368979022734e-05, + "loss": 1.4058, + "step": 415 + }, + { + "epoch": 0.16398127617639813, + "grad_norm": 0.8545938358336401, + "learning_rate": 1.9752333292015565e-05, + "loss": 1.4021, + "step": 416 + }, + { + "epoch": 0.16437546193643754, + "grad_norm": 0.9263197519347521, + "learning_rate": 1.9749279121818235e-05, + "loss": 1.3893, + "step": 417 + }, + { + "epoch": 0.16476964769647698, + "grad_norm": 0.7667419924633587, + "learning_rate": 1.9746206474219182e-05, + "loss": 1.3335, + "step": 418 + }, + { + "epoch": 0.1651638334565164, + "grad_norm": 0.8481486595457164, + "learning_rate": 1.9743115355041868e-05, + "loss": 1.3288, + "step": 419 + }, + { + "epoch": 0.1655580192165558, + "grad_norm": 0.7727894220848658, + "learning_rate": 1.9740005770144762e-05, + "loss": 1.333, + "step": 420 + }, + { + "epoch": 0.1659522049765952, + "grad_norm": 0.8607077475883066, + "learning_rate": 1.9736877725421325e-05, + "loss": 1.4611, + "step": 421 + }, + { + "epoch": 0.16634639073663465, + "grad_norm": 0.7998454699496479, + "learning_rate": 1.9733731226800016e-05, + "loss": 1.3622, + "step": 422 + }, + { + "epoch": 0.16674057649667406, + "grad_norm": 0.7314193043164695, + "learning_rate": 1.9730566280244256e-05, + "loss": 1.3375, + "step": 423 + }, + { + "epoch": 0.16713476225671348, + "grad_norm": 0.777752765207413, + "learning_rate": 1.9727382891752446e-05, + "loss": 1.38, + "step": 424 + }, + { + "epoch": 0.1675289480167529, + "grad_norm": 0.8338395199460101, + "learning_rate": 1.9724181067357918e-05, + "loss": 1.3022, + "step": 425 + }, + { + "epoch": 0.16792313377679233, + "grad_norm": 0.8380585348678756, + "learning_rate": 1.9720960813128966e-05, + "loss": 1.3745, + "step": 426 + }, + { + "epoch": 0.16831731953683174, + "grad_norm": 0.8412709090344273, + "learning_rate": 1.9717722135168796e-05, + "loss": 1.3487, + "step": 427 + }, + { + "epoch": 0.16871150529687115, + "grad_norm": 0.8188807655558134, + "learning_rate": 1.9714465039615545e-05, + "loss": 1.4046, + "step": 428 + }, + { + "epoch": 0.16910569105691056, + "grad_norm": 0.7873789728209534, + "learning_rate": 1.9711189532642244e-05, + "loss": 1.3695, + "step": 429 + }, + { + "epoch": 0.16949987681695, + "grad_norm": 0.8380079010888628, + "learning_rate": 1.9707895620456832e-05, + "loss": 1.4121, + "step": 430 + }, + { + "epoch": 0.1698940625769894, + "grad_norm": 0.7464093486132232, + "learning_rate": 1.9704583309302115e-05, + "loss": 1.3383, + "step": 431 + }, + { + "epoch": 0.17028824833702882, + "grad_norm": 0.7745574128518233, + "learning_rate": 1.970125260545579e-05, + "loss": 1.4293, + "step": 432 + }, + { + "epoch": 0.17068243409706824, + "grad_norm": 0.7923250648359519, + "learning_rate": 1.9697903515230387e-05, + "loss": 1.3816, + "step": 433 + }, + { + "epoch": 0.17107661985710768, + "grad_norm": 0.7828760994144639, + "learning_rate": 1.9694536044973303e-05, + "loss": 1.3682, + "step": 434 + }, + { + "epoch": 0.1714708056171471, + "grad_norm": 0.7535267581618733, + "learning_rate": 1.9691150201066765e-05, + "loss": 1.4415, + "step": 435 + }, + { + "epoch": 0.1718649913771865, + "grad_norm": 0.7719938628460055, + "learning_rate": 1.9687745989927823e-05, + "loss": 1.3261, + "step": 436 + }, + { + "epoch": 0.1722591771372259, + "grad_norm": 0.7985396893057591, + "learning_rate": 1.968432341800833e-05, + "loss": 1.3384, + "step": 437 + }, + { + "epoch": 0.17265336289726535, + "grad_norm": 0.7864913353035174, + "learning_rate": 1.9680882491794953e-05, + "loss": 1.4198, + "step": 438 + }, + { + "epoch": 0.17304754865730476, + "grad_norm": 0.7652857695438825, + "learning_rate": 1.9677423217809127e-05, + "loss": 1.4451, + "step": 439 + }, + { + "epoch": 0.17344173441734417, + "grad_norm": 0.7779886907598241, + "learning_rate": 1.9673945602607073e-05, + "loss": 1.445, + "step": 440 + }, + { + "epoch": 0.17383592017738358, + "grad_norm": 0.7526833753446838, + "learning_rate": 1.967044965277977e-05, + "loss": 1.3715, + "step": 441 + }, + { + "epoch": 0.17423010593742302, + "grad_norm": 0.7613651093452684, + "learning_rate": 1.9666935374952946e-05, + "loss": 1.3418, + "step": 442 + }, + { + "epoch": 0.17462429169746244, + "grad_norm": 0.7407113533991782, + "learning_rate": 1.9663402775787066e-05, + "loss": 1.3176, + "step": 443 + }, + { + "epoch": 0.17501847745750185, + "grad_norm": 0.8511077778073948, + "learning_rate": 1.9659851861977316e-05, + "loss": 1.3712, + "step": 444 + }, + { + "epoch": 0.17541266321754126, + "grad_norm": 0.7637296441923789, + "learning_rate": 1.965628264025359e-05, + "loss": 1.3138, + "step": 445 + }, + { + "epoch": 0.1758068489775807, + "grad_norm": 0.7688575868311163, + "learning_rate": 1.9652695117380496e-05, + "loss": 1.3478, + "step": 446 + }, + { + "epoch": 0.1762010347376201, + "grad_norm": 0.8112254863467798, + "learning_rate": 1.9649089300157307e-05, + "loss": 1.3199, + "step": 447 + }, + { + "epoch": 0.17659522049765952, + "grad_norm": 0.7773958932143377, + "learning_rate": 1.9645465195417986e-05, + "loss": 1.3729, + "step": 448 + }, + { + "epoch": 0.17698940625769893, + "grad_norm": 0.7925758880473086, + "learning_rate": 1.9641822810031135e-05, + "loss": 1.3545, + "step": 449 + }, + { + "epoch": 0.17738359201773837, + "grad_norm": 0.7629015638547695, + "learning_rate": 1.9638162150900028e-05, + "loss": 1.3425, + "step": 450 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.7832983576510374, + "learning_rate": 1.9634483224962555e-05, + "loss": 1.3347, + "step": 451 + }, + { + "epoch": 0.1781719635378172, + "grad_norm": 0.8341313973861934, + "learning_rate": 1.963078603919123e-05, + "loss": 1.3995, + "step": 452 + }, + { + "epoch": 0.1785661492978566, + "grad_norm": 0.7778224652767618, + "learning_rate": 1.9627070600593172e-05, + "loss": 1.2996, + "step": 453 + }, + { + "epoch": 0.17896033505789605, + "grad_norm": 0.8243076810986155, + "learning_rate": 1.96233369162101e-05, + "loss": 1.3893, + "step": 454 + }, + { + "epoch": 0.17935452081793546, + "grad_norm": 0.8654955959896804, + "learning_rate": 1.9619584993118308e-05, + "loss": 1.3232, + "step": 455 + }, + { + "epoch": 0.17974870657797487, + "grad_norm": 0.804527846282048, + "learning_rate": 1.9615814838428662e-05, + "loss": 1.3656, + "step": 456 + }, + { + "epoch": 0.18014289233801428, + "grad_norm": 0.7962448753036495, + "learning_rate": 1.961202645928658e-05, + "loss": 1.3637, + "step": 457 + }, + { + "epoch": 0.18053707809805372, + "grad_norm": 0.8354245092920538, + "learning_rate": 1.960821986287201e-05, + "loss": 1.3867, + "step": 458 + }, + { + "epoch": 0.18093126385809313, + "grad_norm": 0.8345477417237376, + "learning_rate": 1.960439505639945e-05, + "loss": 1.3931, + "step": 459 + }, + { + "epoch": 0.18132544961813254, + "grad_norm": 0.9026625490600573, + "learning_rate": 1.9600552047117883e-05, + "loss": 1.3355, + "step": 460 + }, + { + "epoch": 0.18171963537817196, + "grad_norm": 0.7381101689953861, + "learning_rate": 1.9596690842310807e-05, + "loss": 1.3469, + "step": 461 + }, + { + "epoch": 0.1821138211382114, + "grad_norm": 0.8146270963359201, + "learning_rate": 1.9592811449296206e-05, + "loss": 1.3754, + "step": 462 + }, + { + "epoch": 0.1825080068982508, + "grad_norm": 0.7583095033222406, + "learning_rate": 1.9588913875426532e-05, + "loss": 1.3674, + "step": 463 + }, + { + "epoch": 0.18290219265829022, + "grad_norm": 0.7547653358304839, + "learning_rate": 1.9584998128088686e-05, + "loss": 1.3402, + "step": 464 + }, + { + "epoch": 0.18329637841832963, + "grad_norm": 0.8068714500814903, + "learning_rate": 1.958106421470403e-05, + "loss": 1.3792, + "step": 465 + }, + { + "epoch": 0.18369056417836907, + "grad_norm": 0.7623764190926223, + "learning_rate": 1.957711214272834e-05, + "loss": 1.3683, + "step": 466 + }, + { + "epoch": 0.18408474993840848, + "grad_norm": 0.7327762464326012, + "learning_rate": 1.957314191965182e-05, + "loss": 1.3321, + "step": 467 + }, + { + "epoch": 0.1844789356984479, + "grad_norm": 0.8050214138929509, + "learning_rate": 1.9569153552999057e-05, + "loss": 1.4045, + "step": 468 + }, + { + "epoch": 0.1848731214584873, + "grad_norm": 0.7931062968671917, + "learning_rate": 1.9565147050329046e-05, + "loss": 1.3676, + "step": 469 + }, + { + "epoch": 0.18526730721852674, + "grad_norm": 0.7329041782778525, + "learning_rate": 1.9561122419235137e-05, + "loss": 1.3468, + "step": 470 + }, + { + "epoch": 0.18566149297856616, + "grad_norm": 0.7706739838708203, + "learning_rate": 1.955707966734505e-05, + "loss": 1.3456, + "step": 471 + }, + { + "epoch": 0.18605567873860557, + "grad_norm": 0.7721590455864087, + "learning_rate": 1.9553018802320843e-05, + "loss": 1.383, + "step": 472 + }, + { + "epoch": 0.18644986449864498, + "grad_norm": 0.7426283570331748, + "learning_rate": 1.95489398318589e-05, + "loss": 1.3125, + "step": 473 + }, + { + "epoch": 0.18684405025868442, + "grad_norm": 2.063311743166772, + "learning_rate": 1.9544842763689928e-05, + "loss": 1.4202, + "step": 474 + }, + { + "epoch": 0.18723823601872383, + "grad_norm": 0.7311089489840802, + "learning_rate": 1.954072760557893e-05, + "loss": 1.2622, + "step": 475 + }, + { + "epoch": 0.18763242177876324, + "grad_norm": 0.781806989985732, + "learning_rate": 1.953659436532519e-05, + "loss": 1.3805, + "step": 476 + }, + { + "epoch": 0.18802660753880265, + "grad_norm": 0.8019278871709516, + "learning_rate": 1.9532443050762265e-05, + "loss": 1.3006, + "step": 477 + }, + { + "epoch": 0.1884207932988421, + "grad_norm": 0.7493676971003281, + "learning_rate": 1.9528273669757974e-05, + "loss": 1.2912, + "step": 478 + }, + { + "epoch": 0.1888149790588815, + "grad_norm": 0.8268984543433072, + "learning_rate": 1.9524086230214366e-05, + "loss": 1.3565, + "step": 479 + }, + { + "epoch": 0.18920916481892092, + "grad_norm": 0.7801443400096512, + "learning_rate": 1.951988074006772e-05, + "loss": 1.371, + "step": 480 + }, + { + "epoch": 0.18960335057896033, + "grad_norm": 0.7539695626008661, + "learning_rate": 1.9515657207288528e-05, + "loss": 1.3721, + "step": 481 + }, + { + "epoch": 0.18999753633899977, + "grad_norm": 0.7703572570935576, + "learning_rate": 1.9511415639881474e-05, + "loss": 1.4442, + "step": 482 + }, + { + "epoch": 0.19039172209903918, + "grad_norm": 0.7742745558792156, + "learning_rate": 1.9507156045885423e-05, + "loss": 1.2905, + "step": 483 + }, + { + "epoch": 0.1907859078590786, + "grad_norm": 0.7359869825956976, + "learning_rate": 1.950287843337341e-05, + "loss": 1.3254, + "step": 484 + }, + { + "epoch": 0.191180093619118, + "grad_norm": 0.7544568408416208, + "learning_rate": 1.9498582810452607e-05, + "loss": 1.3154, + "step": 485 + }, + { + "epoch": 0.19157427937915744, + "grad_norm": 0.7769753768513467, + "learning_rate": 1.949426918526434e-05, + "loss": 1.3628, + "step": 486 + }, + { + "epoch": 0.19196846513919685, + "grad_norm": 0.7834189136520097, + "learning_rate": 1.9489937565984033e-05, + "loss": 1.3554, + "step": 487 + }, + { + "epoch": 0.19236265089923626, + "grad_norm": 0.7796538796113698, + "learning_rate": 1.948558796082123e-05, + "loss": 1.2925, + "step": 488 + }, + { + "epoch": 0.19275683665927568, + "grad_norm": 1.0372440968179562, + "learning_rate": 1.9481220378019553e-05, + "loss": 1.309, + "step": 489 + }, + { + "epoch": 0.19315102241931512, + "grad_norm": 0.727717117732363, + "learning_rate": 1.9476834825856696e-05, + "loss": 1.353, + "step": 490 + }, + { + "epoch": 0.19354520817935453, + "grad_norm": 0.7330989067981496, + "learning_rate": 1.947243131264442e-05, + "loss": 1.3326, + "step": 491 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.8625663326931535, + "learning_rate": 1.9468009846728515e-05, + "loss": 1.3795, + "step": 492 + }, + { + "epoch": 0.19433357969943335, + "grad_norm": 0.7442872681943762, + "learning_rate": 1.9463570436488803e-05, + "loss": 1.3343, + "step": 493 + }, + { + "epoch": 0.1947277654594728, + "grad_norm": 0.7892831285816906, + "learning_rate": 1.9459113090339107e-05, + "loss": 1.4112, + "step": 494 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 0.7915084905242407, + "learning_rate": 1.945463781672726e-05, + "loss": 1.3867, + "step": 495 + }, + { + "epoch": 0.1955161369795516, + "grad_norm": 0.7558768011341099, + "learning_rate": 1.945014462413505e-05, + "loss": 1.2735, + "step": 496 + }, + { + "epoch": 0.19591032273959103, + "grad_norm": 0.7918551795385935, + "learning_rate": 1.9445633521078246e-05, + "loss": 1.366, + "step": 497 + }, + { + "epoch": 0.19630450849963046, + "grad_norm": 0.7632462761447605, + "learning_rate": 1.944110451610655e-05, + "loss": 1.2919, + "step": 498 + }, + { + "epoch": 0.19669869425966988, + "grad_norm": 0.8619242283408518, + "learning_rate": 1.9436557617803594e-05, + "loss": 1.3433, + "step": 499 + }, + { + "epoch": 0.1970928800197093, + "grad_norm": 0.7486074296088833, + "learning_rate": 1.943199283478693e-05, + "loss": 1.3718, + "step": 500 + }, + { + "epoch": 0.1974870657797487, + "grad_norm": 0.7844981757900801, + "learning_rate": 1.9427410175707993e-05, + "loss": 1.3615, + "step": 501 + }, + { + "epoch": 0.19788125153978814, + "grad_norm": 0.7861270837445861, + "learning_rate": 1.942280964925211e-05, + "loss": 1.4269, + "step": 502 + }, + { + "epoch": 0.19827543729982755, + "grad_norm": 0.7771387444238573, + "learning_rate": 1.9418191264138468e-05, + "loss": 1.3861, + "step": 503 + }, + { + "epoch": 0.19866962305986696, + "grad_norm": 0.7840229669644916, + "learning_rate": 1.94135550291201e-05, + "loss": 1.3508, + "step": 504 + }, + { + "epoch": 0.19906380881990637, + "grad_norm": 0.7578091088675099, + "learning_rate": 1.940890095298386e-05, + "loss": 1.3, + "step": 505 + }, + { + "epoch": 0.1994579945799458, + "grad_norm": 0.7955186622031103, + "learning_rate": 1.9404229044550432e-05, + "loss": 1.3877, + "step": 506 + }, + { + "epoch": 0.19985218033998522, + "grad_norm": 0.7600697521641491, + "learning_rate": 1.939953931267429e-05, + "loss": 1.3083, + "step": 507 + }, + { + "epoch": 0.20024636610002464, + "grad_norm": 0.7997760910789501, + "learning_rate": 1.9394831766243688e-05, + "loss": 1.3574, + "step": 508 + }, + { + "epoch": 0.20064055186006405, + "grad_norm": 0.8324601470930124, + "learning_rate": 1.9390106414180635e-05, + "loss": 1.3314, + "step": 509 + }, + { + "epoch": 0.2010347376201035, + "grad_norm": 0.7986181347574611, + "learning_rate": 1.9385363265440896e-05, + "loss": 1.3701, + "step": 510 + }, + { + "epoch": 0.2014289233801429, + "grad_norm": 0.8390387581661004, + "learning_rate": 1.9380602329013967e-05, + "loss": 1.3278, + "step": 511 + }, + { + "epoch": 0.2018231091401823, + "grad_norm": 0.7756267484264265, + "learning_rate": 1.937582361392305e-05, + "loss": 1.2902, + "step": 512 + }, + { + "epoch": 0.20221729490022172, + "grad_norm": 0.8280742083628098, + "learning_rate": 1.9371027129225042e-05, + "loss": 1.3954, + "step": 513 + }, + { + "epoch": 0.20261148066026116, + "grad_norm": 0.7557033928381056, + "learning_rate": 1.9366212884010523e-05, + "loss": 1.3245, + "step": 514 + }, + { + "epoch": 0.20300566642030057, + "grad_norm": 0.7339490880913666, + "learning_rate": 1.9361380887403726e-05, + "loss": 1.3314, + "step": 515 + }, + { + "epoch": 0.20339985218033999, + "grad_norm": 0.759110598024447, + "learning_rate": 1.935653114856254e-05, + "loss": 1.3075, + "step": 516 + }, + { + "epoch": 0.2037940379403794, + "grad_norm": 0.7330136521742119, + "learning_rate": 1.9351663676678465e-05, + "loss": 1.3105, + "step": 517 + }, + { + "epoch": 0.20418822370041884, + "grad_norm": 0.8396501916315762, + "learning_rate": 1.9346778480976626e-05, + "loss": 1.3555, + "step": 518 + }, + { + "epoch": 0.20458240946045825, + "grad_norm": 0.7833213499224854, + "learning_rate": 1.9341875570715723e-05, + "loss": 1.393, + "step": 519 + }, + { + "epoch": 0.20497659522049766, + "grad_norm": 0.788388912099959, + "learning_rate": 1.9336954955188042e-05, + "loss": 1.3548, + "step": 520 + }, + { + "epoch": 0.20537078098053707, + "grad_norm": 0.7944142250573871, + "learning_rate": 1.9332016643719413e-05, + "loss": 1.3167, + "step": 521 + }, + { + "epoch": 0.2057649667405765, + "grad_norm": 0.7185170009516036, + "learning_rate": 1.932706064566922e-05, + "loss": 1.2763, + "step": 522 + }, + { + "epoch": 0.20615915250061592, + "grad_norm": 0.7625422306230389, + "learning_rate": 1.9322086970430355e-05, + "loss": 1.2991, + "step": 523 + }, + { + "epoch": 0.20655333826065533, + "grad_norm": 0.7528804400146271, + "learning_rate": 1.9317095627429215e-05, + "loss": 1.2744, + "step": 524 + }, + { + "epoch": 0.20694752402069475, + "grad_norm": 0.7235339004181085, + "learning_rate": 1.931208662612569e-05, + "loss": 1.3023, + "step": 525 + }, + { + "epoch": 0.20734170978073418, + "grad_norm": 0.7485454145610042, + "learning_rate": 1.930705997601313e-05, + "loss": 1.2737, + "step": 526 + }, + { + "epoch": 0.2077358955407736, + "grad_norm": 0.7616817297855956, + "learning_rate": 1.9302015686618328e-05, + "loss": 1.3331, + "step": 527 + }, + { + "epoch": 0.208130081300813, + "grad_norm": 0.7224963273000136, + "learning_rate": 1.929695376750152e-05, + "loss": 1.3113, + "step": 528 + }, + { + "epoch": 0.20852426706085242, + "grad_norm": 0.7117066935208167, + "learning_rate": 1.9291874228256355e-05, + "loss": 1.3536, + "step": 529 + }, + { + "epoch": 0.20891845282089186, + "grad_norm": 0.7620668487908003, + "learning_rate": 1.928677707850986e-05, + "loss": 1.3847, + "step": 530 + }, + { + "epoch": 0.20931263858093127, + "grad_norm": 0.7762645227174237, + "learning_rate": 1.9281662327922458e-05, + "loss": 1.3838, + "step": 531 + }, + { + "epoch": 0.20970682434097068, + "grad_norm": 0.7486355068094747, + "learning_rate": 1.9276529986187925e-05, + "loss": 1.2929, + "step": 532 + }, + { + "epoch": 0.2101010101010101, + "grad_norm": 0.7850761598989443, + "learning_rate": 1.9271380063033368e-05, + "loss": 1.3511, + "step": 533 + }, + { + "epoch": 0.21049519586104953, + "grad_norm": 0.7306901593960397, + "learning_rate": 1.9266212568219223e-05, + "loss": 1.3223, + "step": 534 + }, + { + "epoch": 0.21088938162108894, + "grad_norm": 0.8035850088778281, + "learning_rate": 1.9261027511539227e-05, + "loss": 1.3615, + "step": 535 + }, + { + "epoch": 0.21128356738112836, + "grad_norm": 0.7359933674500054, + "learning_rate": 1.9255824902820403e-05, + "loss": 1.3733, + "step": 536 + }, + { + "epoch": 0.21167775314116777, + "grad_norm": 0.7361755019126336, + "learning_rate": 1.9250604751923035e-05, + "loss": 1.2759, + "step": 537 + }, + { + "epoch": 0.2120719389012072, + "grad_norm": 0.7731391184456793, + "learning_rate": 1.9245367068740664e-05, + "loss": 1.3493, + "step": 538 + }, + { + "epoch": 0.21246612466124662, + "grad_norm": 0.7070141898804634, + "learning_rate": 1.9240111863200047e-05, + "loss": 1.3316, + "step": 539 + }, + { + "epoch": 0.21286031042128603, + "grad_norm": 0.7047293130221922, + "learning_rate": 1.9234839145261154e-05, + "loss": 1.309, + "step": 540 + }, + { + "epoch": 0.21325449618132544, + "grad_norm": 0.7787357081571815, + "learning_rate": 1.9229548924917146e-05, + "loss": 1.3572, + "step": 541 + }, + { + "epoch": 0.21364868194136488, + "grad_norm": 0.7390906175625679, + "learning_rate": 1.9224241212194364e-05, + "loss": 1.3855, + "step": 542 + }, + { + "epoch": 0.2140428677014043, + "grad_norm": 0.7348457458913636, + "learning_rate": 1.9218916017152292e-05, + "loss": 1.3093, + "step": 543 + }, + { + "epoch": 0.2144370534614437, + "grad_norm": 0.752656550237857, + "learning_rate": 1.9213573349883545e-05, + "loss": 1.4028, + "step": 544 + }, + { + "epoch": 0.21483123922148312, + "grad_norm": 0.7244840658804366, + "learning_rate": 1.9208213220513866e-05, + "loss": 1.2963, + "step": 545 + }, + { + "epoch": 0.21522542498152256, + "grad_norm": 0.770992566259173, + "learning_rate": 1.9202835639202075e-05, + "loss": 1.2926, + "step": 546 + }, + { + "epoch": 0.21561961074156197, + "grad_norm": 0.7643194008638872, + "learning_rate": 1.919744061614008e-05, + "loss": 1.3145, + "step": 547 + }, + { + "epoch": 0.21601379650160138, + "grad_norm": 0.7366196627549643, + "learning_rate": 1.9192028161552848e-05, + "loss": 1.3536, + "step": 548 + }, + { + "epoch": 0.2164079822616408, + "grad_norm": 0.6968551530472608, + "learning_rate": 1.9186598285698373e-05, + "loss": 1.3063, + "step": 549 + }, + { + "epoch": 0.21680216802168023, + "grad_norm": 0.7641280477443396, + "learning_rate": 1.9181150998867674e-05, + "loss": 1.3252, + "step": 550 + }, + { + "epoch": 0.21719635378171964, + "grad_norm": 0.7864006183375085, + "learning_rate": 1.9175686311384763e-05, + "loss": 1.2925, + "step": 551 + }, + { + "epoch": 0.21759053954175905, + "grad_norm": 0.7510317585657532, + "learning_rate": 1.917020423360664e-05, + "loss": 1.3147, + "step": 552 + }, + { + "epoch": 0.21798472530179847, + "grad_norm": 0.759753668019818, + "learning_rate": 1.9164704775923258e-05, + "loss": 1.2949, + "step": 553 + }, + { + "epoch": 0.2183789110618379, + "grad_norm": 0.7730004582439941, + "learning_rate": 1.9159187948757503e-05, + "loss": 1.2885, + "step": 554 + }, + { + "epoch": 0.21877309682187732, + "grad_norm": 0.7672020235507695, + "learning_rate": 1.915365376256519e-05, + "loss": 1.3914, + "step": 555 + }, + { + "epoch": 0.21916728258191673, + "grad_norm": 0.752157061906444, + "learning_rate": 1.9148102227835033e-05, + "loss": 1.3487, + "step": 556 + }, + { + "epoch": 0.21956146834195614, + "grad_norm": 0.7278798351850428, + "learning_rate": 1.9142533355088628e-05, + "loss": 1.3303, + "step": 557 + }, + { + "epoch": 0.21995565410199558, + "grad_norm": 0.7104471440585667, + "learning_rate": 1.9136947154880413e-05, + "loss": 1.3193, + "step": 558 + }, + { + "epoch": 0.220349839862035, + "grad_norm": 0.7800638989095695, + "learning_rate": 1.9131343637797695e-05, + "loss": 1.3536, + "step": 559 + }, + { + "epoch": 0.2207440256220744, + "grad_norm": 0.7109099389345059, + "learning_rate": 1.9125722814460582e-05, + "loss": 1.2976, + "step": 560 + }, + { + "epoch": 0.22113821138211381, + "grad_norm": 0.709861315894559, + "learning_rate": 1.912008469552198e-05, + "loss": 1.3534, + "step": 561 + }, + { + "epoch": 0.22153239714215325, + "grad_norm": 0.7625065746820054, + "learning_rate": 1.9114429291667583e-05, + "loss": 1.3593, + "step": 562 + }, + { + "epoch": 0.22192658290219267, + "grad_norm": 0.8957024180712038, + "learning_rate": 1.9108756613615846e-05, + "loss": 1.2796, + "step": 563 + }, + { + "epoch": 0.22232076866223208, + "grad_norm": 0.756013792651535, + "learning_rate": 1.9103066672117957e-05, + "loss": 1.2989, + "step": 564 + }, + { + "epoch": 0.2227149544222715, + "grad_norm": 0.7162732062615748, + "learning_rate": 1.9097359477957825e-05, + "loss": 1.2601, + "step": 565 + }, + { + "epoch": 0.22310914018231093, + "grad_norm": 0.7436938571603158, + "learning_rate": 1.9091635041952052e-05, + "loss": 1.3151, + "step": 566 + }, + { + "epoch": 0.22350332594235034, + "grad_norm": 0.7610549683893325, + "learning_rate": 1.9085893374949926e-05, + "loss": 1.2972, + "step": 567 + }, + { + "epoch": 0.22389751170238975, + "grad_norm": 0.7558082450692344, + "learning_rate": 1.9080134487833393e-05, + "loss": 1.3793, + "step": 568 + }, + { + "epoch": 0.22429169746242916, + "grad_norm": 0.7719491717906157, + "learning_rate": 1.9074358391517026e-05, + "loss": 1.3779, + "step": 569 + }, + { + "epoch": 0.2246858832224686, + "grad_norm": 0.7374690493690355, + "learning_rate": 1.9068565096948017e-05, + "loss": 1.3406, + "step": 570 + }, + { + "epoch": 0.225080068982508, + "grad_norm": 0.7538369331733002, + "learning_rate": 1.9062754615106162e-05, + "loss": 1.2936, + "step": 571 + }, + { + "epoch": 0.22547425474254743, + "grad_norm": 0.7296271125635926, + "learning_rate": 1.905692695700382e-05, + "loss": 1.3447, + "step": 572 + }, + { + "epoch": 0.22586844050258684, + "grad_norm": 0.8084596790033229, + "learning_rate": 1.905108213368591e-05, + "loss": 1.2637, + "step": 573 + }, + { + "epoch": 0.22626262626262628, + "grad_norm": 0.7557777464040102, + "learning_rate": 1.904522015622988e-05, + "loss": 1.3563, + "step": 574 + }, + { + "epoch": 0.2266568120226657, + "grad_norm": 0.7483236106401496, + "learning_rate": 1.9039341035745696e-05, + "loss": 1.2815, + "step": 575 + }, + { + "epoch": 0.2270509977827051, + "grad_norm": 0.8169659004896286, + "learning_rate": 1.9033444783375806e-05, + "loss": 1.2968, + "step": 576 + }, + { + "epoch": 0.2274451835427445, + "grad_norm": 0.7564345089200964, + "learning_rate": 1.9027531410295128e-05, + "loss": 1.2903, + "step": 577 + }, + { + "epoch": 0.22783936930278395, + "grad_norm": 0.740064034653702, + "learning_rate": 1.9021600927711037e-05, + "loss": 1.3115, + "step": 578 + }, + { + "epoch": 0.22823355506282336, + "grad_norm": 0.7536666281291825, + "learning_rate": 1.9015653346863322e-05, + "loss": 1.2815, + "step": 579 + }, + { + "epoch": 0.22862774082286277, + "grad_norm": 0.7332255399421099, + "learning_rate": 1.900968867902419e-05, + "loss": 1.2896, + "step": 580 + }, + { + "epoch": 0.22902192658290219, + "grad_norm": 0.7215272966131613, + "learning_rate": 1.9003706935498233e-05, + "loss": 1.3181, + "step": 581 + }, + { + "epoch": 0.22941611234294162, + "grad_norm": 0.8275893204395051, + "learning_rate": 1.8997708127622384e-05, + "loss": 1.293, + "step": 582 + }, + { + "epoch": 0.22981029810298104, + "grad_norm": 0.7495958353788804, + "learning_rate": 1.8991692266765947e-05, + "loss": 1.2679, + "step": 583 + }, + { + "epoch": 0.23020448386302045, + "grad_norm": 0.7772101723875109, + "learning_rate": 1.8985659364330522e-05, + "loss": 1.325, + "step": 584 + }, + { + "epoch": 0.23059866962305986, + "grad_norm": 0.7489454768012945, + "learning_rate": 1.8979609431750025e-05, + "loss": 1.2757, + "step": 585 + }, + { + "epoch": 0.2309928553830993, + "grad_norm": 0.7612569479113607, + "learning_rate": 1.8973542480490636e-05, + "loss": 1.3161, + "step": 586 + }, + { + "epoch": 0.2313870411431387, + "grad_norm": 0.8016105305619344, + "learning_rate": 1.89674585220508e-05, + "loss": 1.3373, + "step": 587 + }, + { + "epoch": 0.23178122690317812, + "grad_norm": 0.7552521095717978, + "learning_rate": 1.8961357567961182e-05, + "loss": 1.3341, + "step": 588 + }, + { + "epoch": 0.23217541266321753, + "grad_norm": 0.8077575349160561, + "learning_rate": 1.8955239629784667e-05, + "loss": 1.3828, + "step": 589 + }, + { + "epoch": 0.23256959842325697, + "grad_norm": 0.7734481164743204, + "learning_rate": 1.8949104719116334e-05, + "loss": 1.2494, + "step": 590 + }, + { + "epoch": 0.23296378418329639, + "grad_norm": 0.7239243239882402, + "learning_rate": 1.8942952847583417e-05, + "loss": 1.3492, + "step": 591 + }, + { + "epoch": 0.2333579699433358, + "grad_norm": 0.7392668666857419, + "learning_rate": 1.8936784026845304e-05, + "loss": 1.2988, + "step": 592 + }, + { + "epoch": 0.2337521557033752, + "grad_norm": 0.737345549169784, + "learning_rate": 1.8930598268593503e-05, + "loss": 1.3593, + "step": 593 + }, + { + "epoch": 0.23414634146341465, + "grad_norm": 0.7739820026696098, + "learning_rate": 1.8924395584551624e-05, + "loss": 1.2917, + "step": 594 + }, + { + "epoch": 0.23454052722345406, + "grad_norm": 0.7370299572384036, + "learning_rate": 1.891817598647535e-05, + "loss": 1.3188, + "step": 595 + }, + { + "epoch": 0.23493471298349347, + "grad_norm": 0.7045735291814132, + "learning_rate": 1.8911939486152433e-05, + "loss": 1.2999, + "step": 596 + }, + { + "epoch": 0.23532889874353288, + "grad_norm": 0.7318502745854408, + "learning_rate": 1.8905686095402648e-05, + "loss": 1.2973, + "step": 597 + }, + { + "epoch": 0.23572308450357232, + "grad_norm": 0.6992717345016547, + "learning_rate": 1.8899415826077784e-05, + "loss": 1.2562, + "step": 598 + }, + { + "epoch": 0.23611727026361173, + "grad_norm": 0.7855449422876546, + "learning_rate": 1.8893128690061625e-05, + "loss": 1.3331, + "step": 599 + }, + { + "epoch": 0.23651145602365115, + "grad_norm": 0.7330330982965301, + "learning_rate": 1.8886824699269916e-05, + "loss": 1.2719, + "step": 600 + }, + { + "epoch": 0.23690564178369056, + "grad_norm": 0.7235999574209688, + "learning_rate": 1.888050386565034e-05, + "loss": 1.2848, + "step": 601 + }, + { + "epoch": 0.23729982754373, + "grad_norm": 0.7259572083243264, + "learning_rate": 1.8874166201182526e-05, + "loss": 1.2901, + "step": 602 + }, + { + "epoch": 0.2376940133037694, + "grad_norm": 0.738733374260345, + "learning_rate": 1.8867811717877966e-05, + "loss": 1.2949, + "step": 603 + }, + { + "epoch": 0.23808819906380882, + "grad_norm": 0.7293917944233541, + "learning_rate": 1.886144042778006e-05, + "loss": 1.2738, + "step": 604 + }, + { + "epoch": 0.23848238482384823, + "grad_norm": 0.7004391383451308, + "learning_rate": 1.885505234296404e-05, + "loss": 1.2703, + "step": 605 + }, + { + "epoch": 0.23887657058388767, + "grad_norm": 0.7664560785377862, + "learning_rate": 1.884864747553698e-05, + "loss": 1.3647, + "step": 606 + }, + { + "epoch": 0.23927075634392708, + "grad_norm": 0.8048750538355759, + "learning_rate": 1.8842225837637765e-05, + "loss": 1.4858, + "step": 607 + }, + { + "epoch": 0.2396649421039665, + "grad_norm": 0.7886892188335735, + "learning_rate": 1.8835787441437043e-05, + "loss": 1.3808, + "step": 608 + }, + { + "epoch": 0.2400591278640059, + "grad_norm": 0.700691895354596, + "learning_rate": 1.8829332299137245e-05, + "loss": 1.3073, + "step": 609 + }, + { + "epoch": 0.24045331362404535, + "grad_norm": 0.749597801010302, + "learning_rate": 1.882286042297254e-05, + "loss": 1.3656, + "step": 610 + }, + { + "epoch": 0.24084749938408476, + "grad_norm": 0.7481923330312744, + "learning_rate": 1.881637182520879e-05, + "loss": 1.3272, + "step": 611 + }, + { + "epoch": 0.24124168514412417, + "grad_norm": 0.6957757781146582, + "learning_rate": 1.880986651814357e-05, + "loss": 1.2368, + "step": 612 + }, + { + "epoch": 0.24163587090416358, + "grad_norm": 0.7428959152728734, + "learning_rate": 1.8803344514106123e-05, + "loss": 1.3561, + "step": 613 + }, + { + "epoch": 0.24203005666420302, + "grad_norm": 0.733482247697521, + "learning_rate": 1.8796805825457324e-05, + "loss": 1.3296, + "step": 614 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.7941648551428049, + "learning_rate": 1.8790250464589676e-05, + "loss": 1.3018, + "step": 615 + }, + { + "epoch": 0.24281842818428184, + "grad_norm": 0.7864984021030504, + "learning_rate": 1.8783678443927282e-05, + "loss": 1.3507, + "step": 616 + }, + { + "epoch": 0.24321261394432125, + "grad_norm": 0.7607319722931054, + "learning_rate": 1.8777089775925822e-05, + "loss": 1.3028, + "step": 617 + }, + { + "epoch": 0.2436067997043607, + "grad_norm": 0.7531520087715251, + "learning_rate": 1.8770484473072518e-05, + "loss": 1.337, + "step": 618 + }, + { + "epoch": 0.2440009854644001, + "grad_norm": 0.7227583108021773, + "learning_rate": 1.8763862547886133e-05, + "loss": 1.3006, + "step": 619 + }, + { + "epoch": 0.24439517122443952, + "grad_norm": 0.7244215425325586, + "learning_rate": 1.8757224012916913e-05, + "loss": 1.3111, + "step": 620 + }, + { + "epoch": 0.24478935698447893, + "grad_norm": 0.726809176042967, + "learning_rate": 1.8750568880746606e-05, + "loss": 1.2595, + "step": 621 + }, + { + "epoch": 0.24518354274451837, + "grad_norm": 0.7409190065458727, + "learning_rate": 1.87438971639884e-05, + "loss": 1.2985, + "step": 622 + }, + { + "epoch": 0.24557772850455778, + "grad_norm": 0.7027463402470976, + "learning_rate": 1.8737208875286933e-05, + "loss": 1.2993, + "step": 623 + }, + { + "epoch": 0.2459719142645972, + "grad_norm": 0.7354741797652073, + "learning_rate": 1.8730504027318223e-05, + "loss": 1.3101, + "step": 624 + }, + { + "epoch": 0.2463661000246366, + "grad_norm": 0.7151055215992336, + "learning_rate": 1.87237826327897e-05, + "loss": 1.3016, + "step": 625 + }, + { + "epoch": 0.24676028578467604, + "grad_norm": 0.7346955837306206, + "learning_rate": 1.871704470444014e-05, + "loss": 1.3026, + "step": 626 + }, + { + "epoch": 0.24715447154471545, + "grad_norm": 0.7087046803059532, + "learning_rate": 1.8710290255039654e-05, + "loss": 1.3149, + "step": 627 + }, + { + "epoch": 0.24754865730475487, + "grad_norm": 0.7301865796459245, + "learning_rate": 1.870351929738967e-05, + "loss": 1.2857, + "step": 628 + }, + { + "epoch": 0.24794284306479428, + "grad_norm": 0.7189028712874932, + "learning_rate": 1.86967318443229e-05, + "loss": 1.3185, + "step": 629 + }, + { + "epoch": 0.24833702882483372, + "grad_norm": 0.6879300842588244, + "learning_rate": 1.8689927908703325e-05, + "loss": 1.2882, + "step": 630 + }, + { + "epoch": 0.24873121458487313, + "grad_norm": 0.6980954368807367, + "learning_rate": 1.8683107503426158e-05, + "loss": 1.2522, + "step": 631 + }, + { + "epoch": 0.24912540034491254, + "grad_norm": 0.7545776954574633, + "learning_rate": 1.8676270641417824e-05, + "loss": 1.322, + "step": 632 + }, + { + "epoch": 0.24951958610495195, + "grad_norm": 0.7115077185501087, + "learning_rate": 1.8669417335635946e-05, + "loss": 1.2723, + "step": 633 + }, + { + "epoch": 0.2499137718649914, + "grad_norm": 0.7379949770472353, + "learning_rate": 1.866254759906931e-05, + "loss": 1.4362, + "step": 634 + }, + { + "epoch": 0.2503079576250308, + "grad_norm": 0.7573308426125499, + "learning_rate": 1.8655661444737835e-05, + "loss": 1.3177, + "step": 635 + }, + { + "epoch": 0.25070214338507024, + "grad_norm": 0.7257743669215548, + "learning_rate": 1.864875888569257e-05, + "loss": 1.3062, + "step": 636 + }, + { + "epoch": 0.25109632914510965, + "grad_norm": 0.6940203952508667, + "learning_rate": 1.864183993501564e-05, + "loss": 1.2652, + "step": 637 + }, + { + "epoch": 0.25149051490514907, + "grad_norm": 0.8172564591114041, + "learning_rate": 1.863490460582025e-05, + "loss": 1.3199, + "step": 638 + }, + { + "epoch": 0.2518847006651885, + "grad_norm": 0.7226317764207526, + "learning_rate": 1.8627952911250632e-05, + "loss": 1.3106, + "step": 639 + }, + { + "epoch": 0.2522788864252279, + "grad_norm": 0.7438657902645007, + "learning_rate": 1.8620984864482046e-05, + "loss": 1.2981, + "step": 640 + }, + { + "epoch": 0.2526730721852673, + "grad_norm": 0.7422399467375352, + "learning_rate": 1.8614000478720743e-05, + "loss": 1.3406, + "step": 641 + }, + { + "epoch": 0.2530672579453067, + "grad_norm": 0.7811618617681046, + "learning_rate": 1.860699976720393e-05, + "loss": 1.3105, + "step": 642 + }, + { + "epoch": 0.2534614437053461, + "grad_norm": 0.7398963519463426, + "learning_rate": 1.8599982743199775e-05, + "loss": 1.3194, + "step": 643 + }, + { + "epoch": 0.2538556294653856, + "grad_norm": 0.7614275275857106, + "learning_rate": 1.859294942000734e-05, + "loss": 1.2825, + "step": 644 + }, + { + "epoch": 0.254249815225425, + "grad_norm": 0.7495597529607684, + "learning_rate": 1.85858998109566e-05, + "loss": 1.2941, + "step": 645 + }, + { + "epoch": 0.2546440009854644, + "grad_norm": 0.76715001759035, + "learning_rate": 1.857883392940837e-05, + "loss": 1.3126, + "step": 646 + }, + { + "epoch": 0.2550381867455038, + "grad_norm": 0.7357189271424588, + "learning_rate": 1.8571751788754336e-05, + "loss": 1.3363, + "step": 647 + }, + { + "epoch": 0.25543237250554324, + "grad_norm": 0.7382893718452418, + "learning_rate": 1.856465340241697e-05, + "loss": 1.2237, + "step": 648 + }, + { + "epoch": 0.25582655826558265, + "grad_norm": 0.7377308175335368, + "learning_rate": 1.8557538783849555e-05, + "loss": 1.2561, + "step": 649 + }, + { + "epoch": 0.25622074402562206, + "grad_norm": 0.7792573574030509, + "learning_rate": 1.8550407946536127e-05, + "loss": 1.2835, + "step": 650 + }, + { + "epoch": 0.25661492978566147, + "grad_norm": 0.8268845473577122, + "learning_rate": 1.8543260903991467e-05, + "loss": 1.2624, + "step": 651 + }, + { + "epoch": 0.25700911554570094, + "grad_norm": 0.7139020431429061, + "learning_rate": 1.8536097669761066e-05, + "loss": 1.2767, + "step": 652 + }, + { + "epoch": 0.25740330130574035, + "grad_norm": 0.836771495489938, + "learning_rate": 1.85289182574211e-05, + "loss": 1.2564, + "step": 653 + }, + { + "epoch": 0.25779748706577976, + "grad_norm": 0.7744188165849301, + "learning_rate": 1.8521722680578413e-05, + "loss": 1.3551, + "step": 654 + }, + { + "epoch": 0.2581916728258192, + "grad_norm": 0.7733400605257766, + "learning_rate": 1.851451095287048e-05, + "loss": 1.3511, + "step": 655 + }, + { + "epoch": 0.2585858585858586, + "grad_norm": 0.7813471536798385, + "learning_rate": 1.850728308796539e-05, + "loss": 1.2426, + "step": 656 + }, + { + "epoch": 0.258980044345898, + "grad_norm": 0.7708022669200939, + "learning_rate": 1.8500039099561807e-05, + "loss": 1.2708, + "step": 657 + }, + { + "epoch": 0.2593742301059374, + "grad_norm": 0.7838881723591813, + "learning_rate": 1.8492779001388964e-05, + "loss": 1.3396, + "step": 658 + }, + { + "epoch": 0.2597684158659768, + "grad_norm": 0.7443818910969162, + "learning_rate": 1.8485502807206624e-05, + "loss": 1.3021, + "step": 659 + }, + { + "epoch": 0.2601626016260163, + "grad_norm": 0.7268444207695822, + "learning_rate": 1.847821053080505e-05, + "loss": 1.3232, + "step": 660 + }, + { + "epoch": 0.2605567873860557, + "grad_norm": 0.7145438455342924, + "learning_rate": 1.8470902186004995e-05, + "loss": 1.2762, + "step": 661 + }, + { + "epoch": 0.2609509731460951, + "grad_norm": 0.798127221257281, + "learning_rate": 1.8463577786657653e-05, + "loss": 1.3434, + "step": 662 + }, + { + "epoch": 0.2613451589061345, + "grad_norm": 0.8286302645386731, + "learning_rate": 1.845623734664465e-05, + "loss": 1.3648, + "step": 663 + }, + { + "epoch": 0.26173934466617393, + "grad_norm": 0.7056475119658424, + "learning_rate": 1.8448880879878026e-05, + "loss": 1.2664, + "step": 664 + }, + { + "epoch": 0.26213353042621335, + "grad_norm": 0.7486227238349661, + "learning_rate": 1.844150840030018e-05, + "loss": 1.3144, + "step": 665 + }, + { + "epoch": 0.26252771618625276, + "grad_norm": 0.7252618893757948, + "learning_rate": 1.8434119921883865e-05, + "loss": 1.2523, + "step": 666 + }, + { + "epoch": 0.26292190194629217, + "grad_norm": 0.7522705686940889, + "learning_rate": 1.8426715458632154e-05, + "loss": 1.3312, + "step": 667 + }, + { + "epoch": 0.26331608770633164, + "grad_norm": 0.7442803975025406, + "learning_rate": 1.8419295024578417e-05, + "loss": 1.3162, + "step": 668 + }, + { + "epoch": 0.26371027346637105, + "grad_norm": 0.7428662761759469, + "learning_rate": 1.8411858633786298e-05, + "loss": 1.3616, + "step": 669 + }, + { + "epoch": 0.26410445922641046, + "grad_norm": 0.6883090253519637, + "learning_rate": 1.8404406300349673e-05, + "loss": 1.2775, + "step": 670 + }, + { + "epoch": 0.26449864498644987, + "grad_norm": 0.7298650894749236, + "learning_rate": 1.8396938038392636e-05, + "loss": 1.2973, + "step": 671 + }, + { + "epoch": 0.2648928307464893, + "grad_norm": 0.7210785949379522, + "learning_rate": 1.838945386206948e-05, + "loss": 1.2651, + "step": 672 + }, + { + "epoch": 0.2652870165065287, + "grad_norm": 0.7455429622427832, + "learning_rate": 1.8381953785564653e-05, + "loss": 1.2784, + "step": 673 + }, + { + "epoch": 0.2656812022665681, + "grad_norm": 0.7101554754335506, + "learning_rate": 1.8374437823092726e-05, + "loss": 1.2153, + "step": 674 + }, + { + "epoch": 0.2660753880266075, + "grad_norm": 0.7052828798902647, + "learning_rate": 1.836690598889839e-05, + "loss": 1.2874, + "step": 675 + }, + { + "epoch": 0.266469573786647, + "grad_norm": 0.7102957673047738, + "learning_rate": 1.835935829725643e-05, + "loss": 1.3323, + "step": 676 + }, + { + "epoch": 0.2668637595466864, + "grad_norm": 0.7113208099408921, + "learning_rate": 1.8351794762471656e-05, + "loss": 1.2808, + "step": 677 + }, + { + "epoch": 0.2672579453067258, + "grad_norm": 0.713012458638494, + "learning_rate": 1.8344215398878925e-05, + "loss": 1.2499, + "step": 678 + }, + { + "epoch": 0.2676521310667652, + "grad_norm": 0.7458478391351581, + "learning_rate": 1.833662022084309e-05, + "loss": 1.2379, + "step": 679 + }, + { + "epoch": 0.26804631682680463, + "grad_norm": 0.6955091694637261, + "learning_rate": 1.8329009242758977e-05, + "loss": 1.2148, + "step": 680 + }, + { + "epoch": 0.26844050258684404, + "grad_norm": 0.7331960366798272, + "learning_rate": 1.832138247905135e-05, + "loss": 1.3051, + "step": 681 + }, + { + "epoch": 0.26883468834688345, + "grad_norm": 0.7207567261465225, + "learning_rate": 1.8313739944174894e-05, + "loss": 1.3065, + "step": 682 + }, + { + "epoch": 0.26922887410692287, + "grad_norm": 0.7148277245246873, + "learning_rate": 1.8306081652614192e-05, + "loss": 1.2788, + "step": 683 + }, + { + "epoch": 0.26962305986696233, + "grad_norm": 0.7155577906316034, + "learning_rate": 1.829840761888368e-05, + "loss": 1.2429, + "step": 684 + }, + { + "epoch": 0.27001724562700175, + "grad_norm": 0.696356161317749, + "learning_rate": 1.829071785752764e-05, + "loss": 1.2729, + "step": 685 + }, + { + "epoch": 0.27041143138704116, + "grad_norm": 0.7128716614175701, + "learning_rate": 1.8283012383120148e-05, + "loss": 1.3227, + "step": 686 + }, + { + "epoch": 0.27080561714708057, + "grad_norm": 0.7465800322640285, + "learning_rate": 1.827529121026507e-05, + "loss": 1.3252, + "step": 687 + }, + { + "epoch": 0.27119980290712, + "grad_norm": 0.8172136430700996, + "learning_rate": 1.8267554353596027e-05, + "loss": 1.2756, + "step": 688 + }, + { + "epoch": 0.2715939886671594, + "grad_norm": 0.7347557447163089, + "learning_rate": 1.8259801827776358e-05, + "loss": 1.2878, + "step": 689 + }, + { + "epoch": 0.2719881744271988, + "grad_norm": 0.6960464962207745, + "learning_rate": 1.82520336474991e-05, + "loss": 1.2508, + "step": 690 + }, + { + "epoch": 0.2723823601872382, + "grad_norm": 0.7323542648353354, + "learning_rate": 1.8244249827486962e-05, + "loss": 1.3276, + "step": 691 + }, + { + "epoch": 0.2727765459472777, + "grad_norm": 0.7334410491777583, + "learning_rate": 1.8236450382492293e-05, + "loss": 1.2446, + "step": 692 + }, + { + "epoch": 0.2731707317073171, + "grad_norm": 0.7700697100142729, + "learning_rate": 1.8228635327297054e-05, + "loss": 1.2647, + "step": 693 + }, + { + "epoch": 0.2735649174673565, + "grad_norm": 0.6868021899359485, + "learning_rate": 1.8220804676712797e-05, + "loss": 1.2585, + "step": 694 + }, + { + "epoch": 0.2739591032273959, + "grad_norm": 0.7056110870773941, + "learning_rate": 1.8212958445580623e-05, + "loss": 1.2978, + "step": 695 + }, + { + "epoch": 0.27435328898743533, + "grad_norm": 0.7042929029435405, + "learning_rate": 1.8205096648771166e-05, + "loss": 1.2778, + "step": 696 + }, + { + "epoch": 0.27474747474747474, + "grad_norm": 0.7960978757280552, + "learning_rate": 1.8197219301184565e-05, + "loss": 1.3364, + "step": 697 + }, + { + "epoch": 0.27514166050751415, + "grad_norm": 0.7288353276886701, + "learning_rate": 1.818932641775043e-05, + "loss": 1.3099, + "step": 698 + }, + { + "epoch": 0.27553584626755356, + "grad_norm": 0.7479924057933423, + "learning_rate": 1.81814180134278e-05, + "loss": 1.3429, + "step": 699 + }, + { + "epoch": 0.27593003202759303, + "grad_norm": 0.7715814930725846, + "learning_rate": 1.817349410320516e-05, + "loss": 1.2634, + "step": 700 + }, + { + "epoch": 0.27632421778763244, + "grad_norm": 0.7186502326915973, + "learning_rate": 1.816555470210036e-05, + "loss": 1.2677, + "step": 701 + }, + { + "epoch": 0.27671840354767185, + "grad_norm": 0.6963815556934851, + "learning_rate": 1.815759982516061e-05, + "loss": 1.2738, + "step": 702 + }, + { + "epoch": 0.27711258930771127, + "grad_norm": 0.725935134036574, + "learning_rate": 1.8149629487462466e-05, + "loss": 1.3357, + "step": 703 + }, + { + "epoch": 0.2775067750677507, + "grad_norm": 0.7440336010726357, + "learning_rate": 1.814164370411177e-05, + "loss": 1.3394, + "step": 704 + }, + { + "epoch": 0.2779009608277901, + "grad_norm": 0.7144497832774677, + "learning_rate": 1.8133642490243642e-05, + "loss": 1.3247, + "step": 705 + }, + { + "epoch": 0.2782951465878295, + "grad_norm": 0.7330387391854017, + "learning_rate": 1.8125625861022455e-05, + "loss": 1.3037, + "step": 706 + }, + { + "epoch": 0.2786893323478689, + "grad_norm": 0.7408644571783576, + "learning_rate": 1.8117593831641788e-05, + "loss": 1.2714, + "step": 707 + }, + { + "epoch": 0.2790835181079084, + "grad_norm": 0.7538056025050238, + "learning_rate": 1.810954641732441e-05, + "loss": 1.2744, + "step": 708 + }, + { + "epoch": 0.2794777038679478, + "grad_norm": 0.7178383604389642, + "learning_rate": 1.8101483633322255e-05, + "loss": 1.3522, + "step": 709 + }, + { + "epoch": 0.2798718896279872, + "grad_norm": 0.7286512088304942, + "learning_rate": 1.8093405494916373e-05, + "loss": 1.2913, + "step": 710 + }, + { + "epoch": 0.2802660753880266, + "grad_norm": 0.7524538518197109, + "learning_rate": 1.8085312017416926e-05, + "loss": 1.3544, + "step": 711 + }, + { + "epoch": 0.280660261148066, + "grad_norm": 0.7789095889944275, + "learning_rate": 1.8077203216163145e-05, + "loss": 1.3328, + "step": 712 + }, + { + "epoch": 0.28105444690810544, + "grad_norm": 0.7027682398341476, + "learning_rate": 1.8069079106523303e-05, + "loss": 1.316, + "step": 713 + }, + { + "epoch": 0.28144863266814485, + "grad_norm": 0.71974038692439, + "learning_rate": 1.8060939703894684e-05, + "loss": 1.3089, + "step": 714 + }, + { + "epoch": 0.28184281842818426, + "grad_norm": 0.750073440309824, + "learning_rate": 1.805278502370356e-05, + "loss": 1.28, + "step": 715 + }, + { + "epoch": 0.28223700418822373, + "grad_norm": 0.7157617956836964, + "learning_rate": 1.8044615081405153e-05, + "loss": 1.2604, + "step": 716 + }, + { + "epoch": 0.28263118994826314, + "grad_norm": 0.7094277876635081, + "learning_rate": 1.8036429892483615e-05, + "loss": 1.2041, + "step": 717 + }, + { + "epoch": 0.28302537570830255, + "grad_norm": 0.6869213238799484, + "learning_rate": 1.8028229472451994e-05, + "loss": 1.2326, + "step": 718 + }, + { + "epoch": 0.28341956146834196, + "grad_norm": 0.7609339774943211, + "learning_rate": 1.80200138368522e-05, + "loss": 1.2778, + "step": 719 + }, + { + "epoch": 0.2838137472283814, + "grad_norm": 0.7445388720919836, + "learning_rate": 1.801178300125499e-05, + "loss": 1.3466, + "step": 720 + }, + { + "epoch": 0.2842079329884208, + "grad_norm": 0.75543054063603, + "learning_rate": 1.800353698125992e-05, + "loss": 1.2684, + "step": 721 + }, + { + "epoch": 0.2846021187484602, + "grad_norm": 0.7126562502264812, + "learning_rate": 1.7995275792495327e-05, + "loss": 1.3145, + "step": 722 + }, + { + "epoch": 0.2849963045084996, + "grad_norm": 0.750515516790499, + "learning_rate": 1.7986999450618295e-05, + "loss": 1.2766, + "step": 723 + }, + { + "epoch": 0.2853904902685391, + "grad_norm": 0.7302431877687291, + "learning_rate": 1.7978707971314636e-05, + "loss": 1.2127, + "step": 724 + }, + { + "epoch": 0.2857846760285785, + "grad_norm": 0.7122551920492798, + "learning_rate": 1.797040137029884e-05, + "loss": 1.2589, + "step": 725 + }, + { + "epoch": 0.2861788617886179, + "grad_norm": 0.7938703124948006, + "learning_rate": 1.796207966331406e-05, + "loss": 1.3729, + "step": 726 + }, + { + "epoch": 0.2865730475486573, + "grad_norm": 0.7541217984200421, + "learning_rate": 1.7953742866132082e-05, + "loss": 1.2927, + "step": 727 + }, + { + "epoch": 0.2869672333086967, + "grad_norm": 0.7255479166779722, + "learning_rate": 1.794539099455329e-05, + "loss": 1.3431, + "step": 728 + }, + { + "epoch": 0.28736141906873613, + "grad_norm": 0.7453202011835943, + "learning_rate": 1.7937024064406637e-05, + "loss": 1.2764, + "step": 729 + }, + { + "epoch": 0.28775560482877555, + "grad_norm": 0.7449089241310055, + "learning_rate": 1.7928642091549616e-05, + "loss": 1.2666, + "step": 730 + }, + { + "epoch": 0.28814979058881496, + "grad_norm": 0.688535746874336, + "learning_rate": 1.792024509186823e-05, + "loss": 1.2396, + "step": 731 + }, + { + "epoch": 0.2885439763488544, + "grad_norm": 0.7179660403513343, + "learning_rate": 1.7911833081276962e-05, + "loss": 1.2404, + "step": 732 + }, + { + "epoch": 0.28893816210889384, + "grad_norm": 0.6957846541829211, + "learning_rate": 1.7903406075718744e-05, + "loss": 1.3032, + "step": 733 + }, + { + "epoch": 0.28933234786893325, + "grad_norm": 0.7453327673964074, + "learning_rate": 1.7894964091164932e-05, + "loss": 1.3043, + "step": 734 + }, + { + "epoch": 0.28972653362897266, + "grad_norm": 0.6889929678498284, + "learning_rate": 1.788650714361526e-05, + "loss": 1.2273, + "step": 735 + }, + { + "epoch": 0.29012071938901207, + "grad_norm": 0.7514828515828875, + "learning_rate": 1.787803524909783e-05, + "loss": 1.232, + "step": 736 + }, + { + "epoch": 0.2905149051490515, + "grad_norm": 0.69838877253169, + "learning_rate": 1.7869548423669075e-05, + "loss": 1.1814, + "step": 737 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.7028140683366864, + "learning_rate": 1.7861046683413717e-05, + "loss": 1.3324, + "step": 738 + }, + { + "epoch": 0.2913032766691303, + "grad_norm": 0.7609333767596239, + "learning_rate": 1.785253004444475e-05, + "loss": 1.3309, + "step": 739 + }, + { + "epoch": 0.2916974624291698, + "grad_norm": 0.6993070009969047, + "learning_rate": 1.78439985229034e-05, + "loss": 1.2958, + "step": 740 + }, + { + "epoch": 0.2920916481892092, + "grad_norm": 0.7895491591304246, + "learning_rate": 1.7835452134959112e-05, + "loss": 1.2721, + "step": 741 + }, + { + "epoch": 0.2924858339492486, + "grad_norm": 0.7484581002135297, + "learning_rate": 1.7826890896809492e-05, + "loss": 1.2696, + "step": 742 + }, + { + "epoch": 0.292880019709288, + "grad_norm": 0.7180118235912724, + "learning_rate": 1.78183148246803e-05, + "loss": 1.3026, + "step": 743 + }, + { + "epoch": 0.2932742054693274, + "grad_norm": 0.7821323900052215, + "learning_rate": 1.7809723934825405e-05, + "loss": 1.244, + "step": 744 + }, + { + "epoch": 0.29366839122936683, + "grad_norm": 0.731279597221484, + "learning_rate": 1.7801118243526764e-05, + "loss": 1.2841, + "step": 745 + }, + { + "epoch": 0.29406257698940624, + "grad_norm": 0.7328987907210074, + "learning_rate": 1.7792497767094384e-05, + "loss": 1.2574, + "step": 746 + }, + { + "epoch": 0.29445676274944566, + "grad_norm": 0.7546401708479835, + "learning_rate": 1.7783862521866296e-05, + "loss": 1.2514, + "step": 747 + }, + { + "epoch": 0.2948509485094851, + "grad_norm": 0.6961282567593424, + "learning_rate": 1.7775212524208513e-05, + "loss": 1.2659, + "step": 748 + }, + { + "epoch": 0.29524513426952453, + "grad_norm": 0.7069163112336031, + "learning_rate": 1.776654779051502e-05, + "loss": 1.2231, + "step": 749 + }, + { + "epoch": 0.29563932002956395, + "grad_norm": 0.7257978049323676, + "learning_rate": 1.775786833720773e-05, + "loss": 1.2728, + "step": 750 + }, + { + "epoch": 0.29603350578960336, + "grad_norm": 0.7560009441390841, + "learning_rate": 1.7749174180736443e-05, + "loss": 1.2819, + "step": 751 + }, + { + "epoch": 0.29642769154964277, + "grad_norm": 0.6956575266835414, + "learning_rate": 1.7740465337578823e-05, + "loss": 1.3005, + "step": 752 + }, + { + "epoch": 0.2968218773096822, + "grad_norm": 0.7079492136542035, + "learning_rate": 1.7731741824240385e-05, + "loss": 1.227, + "step": 753 + }, + { + "epoch": 0.2972160630697216, + "grad_norm": 0.7184097566051775, + "learning_rate": 1.7723003657254447e-05, + "loss": 1.2924, + "step": 754 + }, + { + "epoch": 0.297610248829761, + "grad_norm": 0.6854141387606205, + "learning_rate": 1.771425085318208e-05, + "loss": 1.2557, + "step": 755 + }, + { + "epoch": 0.29800443458980047, + "grad_norm": 0.6879860581907943, + "learning_rate": 1.7705483428612114e-05, + "loss": 1.2204, + "step": 756 + }, + { + "epoch": 0.2983986203498399, + "grad_norm": 0.7067053556944854, + "learning_rate": 1.7696701400161077e-05, + "loss": 1.2709, + "step": 757 + }, + { + "epoch": 0.2987928061098793, + "grad_norm": 0.6684898845941895, + "learning_rate": 1.768790478447319e-05, + "loss": 1.2379, + "step": 758 + }, + { + "epoch": 0.2991869918699187, + "grad_norm": 0.7669440743034426, + "learning_rate": 1.7679093598220305e-05, + "loss": 1.2965, + "step": 759 + }, + { + "epoch": 0.2995811776299581, + "grad_norm": 0.7264067182866932, + "learning_rate": 1.7670267858101895e-05, + "loss": 1.3299, + "step": 760 + }, + { + "epoch": 0.29997536338999753, + "grad_norm": 0.7154874058477277, + "learning_rate": 1.766142758084502e-05, + "loss": 1.2714, + "step": 761 + }, + { + "epoch": 0.30036954915003694, + "grad_norm": 0.7339691526122842, + "learning_rate": 1.7652572783204286e-05, + "loss": 1.2567, + "step": 762 + }, + { + "epoch": 0.30076373491007635, + "grad_norm": 0.7113428916700398, + "learning_rate": 1.764370348196183e-05, + "loss": 1.2466, + "step": 763 + }, + { + "epoch": 0.3011579206701158, + "grad_norm": 0.7468376219349876, + "learning_rate": 1.7634819693927254e-05, + "loss": 1.2894, + "step": 764 + }, + { + "epoch": 0.30155210643015523, + "grad_norm": 0.706632725084111, + "learning_rate": 1.762592143593764e-05, + "loss": 1.2872, + "step": 765 + }, + { + "epoch": 0.30194629219019464, + "grad_norm": 0.6794782711352044, + "learning_rate": 1.761700872485748e-05, + "loss": 1.2807, + "step": 766 + }, + { + "epoch": 0.30234047795023405, + "grad_norm": 0.7244853098320986, + "learning_rate": 1.7608081577578665e-05, + "loss": 1.2835, + "step": 767 + }, + { + "epoch": 0.30273466371027347, + "grad_norm": 0.778447414784227, + "learning_rate": 1.759914001102045e-05, + "loss": 1.2765, + "step": 768 + }, + { + "epoch": 0.3031288494703129, + "grad_norm": 0.6969578931450477, + "learning_rate": 1.7590184042129406e-05, + "loss": 1.231, + "step": 769 + }, + { + "epoch": 0.3035230352303523, + "grad_norm": 0.6772342269604559, + "learning_rate": 1.758121368787941e-05, + "loss": 1.2599, + "step": 770 + }, + { + "epoch": 0.3039172209903917, + "grad_norm": 0.7659352446323853, + "learning_rate": 1.7572228965271595e-05, + "loss": 1.2728, + "step": 771 + }, + { + "epoch": 0.30431140675043117, + "grad_norm": 0.7140083484092759, + "learning_rate": 1.756322989133434e-05, + "loss": 1.273, + "step": 772 + }, + { + "epoch": 0.3047055925104706, + "grad_norm": 0.7580395855737478, + "learning_rate": 1.7554216483123205e-05, + "loss": 1.257, + "step": 773 + }, + { + "epoch": 0.30509977827051, + "grad_norm": 0.7139671098163918, + "learning_rate": 1.7545188757720933e-05, + "loss": 1.2526, + "step": 774 + }, + { + "epoch": 0.3054939640305494, + "grad_norm": 0.7180915186637021, + "learning_rate": 1.753614673223739e-05, + "loss": 1.284, + "step": 775 + }, + { + "epoch": 0.3058881497905888, + "grad_norm": 0.6906674260442509, + "learning_rate": 1.7527090423809553e-05, + "loss": 1.3048, + "step": 776 + }, + { + "epoch": 0.3062823355506282, + "grad_norm": 0.6975851458655973, + "learning_rate": 1.7518019849601466e-05, + "loss": 1.2902, + "step": 777 + }, + { + "epoch": 0.30667652131066764, + "grad_norm": 0.7046928833082814, + "learning_rate": 1.7508935026804202e-05, + "loss": 1.2339, + "step": 778 + }, + { + "epoch": 0.30707070707070705, + "grad_norm": 0.7051521547776037, + "learning_rate": 1.749983597263586e-05, + "loss": 1.2921, + "step": 779 + }, + { + "epoch": 0.3074648928307465, + "grad_norm": 0.6736469006003648, + "learning_rate": 1.749072270434148e-05, + "loss": 1.271, + "step": 780 + }, + { + "epoch": 0.30785907859078593, + "grad_norm": 1.9120037647074484, + "learning_rate": 1.7481595239193073e-05, + "loss": 1.2196, + "step": 781 + }, + { + "epoch": 0.30825326435082534, + "grad_norm": 0.72077851804003, + "learning_rate": 1.747245359448954e-05, + "loss": 1.2623, + "step": 782 + }, + { + "epoch": 0.30864745011086475, + "grad_norm": 0.6879089595866057, + "learning_rate": 1.7463297787556656e-05, + "loss": 1.2604, + "step": 783 + }, + { + "epoch": 0.30904163587090416, + "grad_norm": 0.7126887694269388, + "learning_rate": 1.745412783574704e-05, + "loss": 1.2688, + "step": 784 + }, + { + "epoch": 0.3094358216309436, + "grad_norm": 0.6783349938024574, + "learning_rate": 1.744494375644012e-05, + "loss": 1.2142, + "step": 785 + }, + { + "epoch": 0.309830007390983, + "grad_norm": 0.7591782870663694, + "learning_rate": 1.7435745567042096e-05, + "loss": 1.3246, + "step": 786 + }, + { + "epoch": 0.3102241931510224, + "grad_norm": 0.7137080648341777, + "learning_rate": 1.7426533284985912e-05, + "loss": 1.256, + "step": 787 + }, + { + "epoch": 0.31061837891106187, + "grad_norm": 0.712242808651282, + "learning_rate": 1.7417306927731226e-05, + "loss": 1.2504, + "step": 788 + }, + { + "epoch": 0.3110125646711013, + "grad_norm": 0.7706834788124493, + "learning_rate": 1.7408066512764365e-05, + "loss": 1.2842, + "step": 789 + }, + { + "epoch": 0.3114067504311407, + "grad_norm": 0.6756575206343757, + "learning_rate": 1.73988120575983e-05, + "loss": 1.2302, + "step": 790 + }, + { + "epoch": 0.3118009361911801, + "grad_norm": 0.7172786293209685, + "learning_rate": 1.7389543579772613e-05, + "loss": 1.2746, + "step": 791 + }, + { + "epoch": 0.3121951219512195, + "grad_norm": 0.7114990921157863, + "learning_rate": 1.738026109685347e-05, + "loss": 1.247, + "step": 792 + }, + { + "epoch": 0.3125893077112589, + "grad_norm": 0.7464653029721845, + "learning_rate": 1.737096462643357e-05, + "loss": 1.2843, + "step": 793 + }, + { + "epoch": 0.31298349347129834, + "grad_norm": 0.7246251283451155, + "learning_rate": 1.736165418613212e-05, + "loss": 1.2896, + "step": 794 + }, + { + "epoch": 0.31337767923133775, + "grad_norm": 0.709039744798614, + "learning_rate": 1.7352329793594817e-05, + "loss": 1.2729, + "step": 795 + }, + { + "epoch": 0.3137718649913772, + "grad_norm": 0.7184347792609641, + "learning_rate": 1.7342991466493785e-05, + "loss": 1.3516, + "step": 796 + }, + { + "epoch": 0.3141660507514166, + "grad_norm": 0.677698026889925, + "learning_rate": 1.7333639222527572e-05, + "loss": 1.2565, + "step": 797 + }, + { + "epoch": 0.31456023651145604, + "grad_norm": 0.7345054222302991, + "learning_rate": 1.732427307942109e-05, + "loss": 1.2509, + "step": 798 + }, + { + "epoch": 0.31495442227149545, + "grad_norm": 0.7766755838188357, + "learning_rate": 1.7314893054925604e-05, + "loss": 1.2766, + "step": 799 + }, + { + "epoch": 0.31534860803153486, + "grad_norm": 0.8110496899704974, + "learning_rate": 1.730549916681868e-05, + "loss": 1.3387, + "step": 800 + }, + { + "epoch": 0.31574279379157427, + "grad_norm": 0.7332603361275668, + "learning_rate": 1.7296091432904164e-05, + "loss": 1.3232, + "step": 801 + }, + { + "epoch": 0.3161369795516137, + "grad_norm": 0.7406352642846648, + "learning_rate": 1.728666987101214e-05, + "loss": 1.2996, + "step": 802 + }, + { + "epoch": 0.3165311653116531, + "grad_norm": 0.7257385239706662, + "learning_rate": 1.7277234498998897e-05, + "loss": 1.2809, + "step": 803 + }, + { + "epoch": 0.31692535107169256, + "grad_norm": 0.7450615958562268, + "learning_rate": 1.726778533474691e-05, + "loss": 1.2937, + "step": 804 + }, + { + "epoch": 0.317319536831732, + "grad_norm": 0.7062517786301892, + "learning_rate": 1.725832239616478e-05, + "loss": 1.3006, + "step": 805 + }, + { + "epoch": 0.3177137225917714, + "grad_norm": 0.7080667822251828, + "learning_rate": 1.724884570118722e-05, + "loss": 1.2349, + "step": 806 + }, + { + "epoch": 0.3181079083518108, + "grad_norm": 0.7066931019098044, + "learning_rate": 1.723935526777502e-05, + "loss": 1.2272, + "step": 807 + }, + { + "epoch": 0.3185020941118502, + "grad_norm": 0.6946668338018744, + "learning_rate": 1.722985111391499e-05, + "loss": 1.2962, + "step": 808 + }, + { + "epoch": 0.3188962798718896, + "grad_norm": 0.6796597060520128, + "learning_rate": 1.7220333257619967e-05, + "loss": 1.3037, + "step": 809 + }, + { + "epoch": 0.31929046563192903, + "grad_norm": 1.6609616990291973, + "learning_rate": 1.721080171692874e-05, + "loss": 1.3676, + "step": 810 + }, + { + "epoch": 0.31968465139196844, + "grad_norm": 0.7455397950852571, + "learning_rate": 1.720125650990605e-05, + "loss": 1.2693, + "step": 811 + }, + { + "epoch": 0.3200788371520079, + "grad_norm": 1.8102002609851213, + "learning_rate": 1.7191697654642517e-05, + "loss": 1.443, + "step": 812 + }, + { + "epoch": 0.3204730229120473, + "grad_norm": 1.6105677014342337, + "learning_rate": 1.7182125169254646e-05, + "loss": 1.3548, + "step": 813 + }, + { + "epoch": 0.32086720867208673, + "grad_norm": 1.9398768550889596, + "learning_rate": 1.717253907188477e-05, + "loss": 1.3585, + "step": 814 + }, + { + "epoch": 0.32126139443212615, + "grad_norm": 1.628604424489859, + "learning_rate": 1.716293938070102e-05, + "loss": 1.3206, + "step": 815 + }, + { + "epoch": 0.32165558019216556, + "grad_norm": 2.801181103409832, + "learning_rate": 1.7153326113897286e-05, + "loss": 1.4204, + "step": 816 + }, + { + "epoch": 0.32204976595220497, + "grad_norm": 1.0130939786005846, + "learning_rate": 1.7143699289693193e-05, + "loss": 1.2738, + "step": 817 + }, + { + "epoch": 0.3224439517122444, + "grad_norm": 6.872564216473981, + "learning_rate": 1.7134058926334063e-05, + "loss": 1.262, + "step": 818 + }, + { + "epoch": 0.3228381374722838, + "grad_norm": 1.4200836123074054, + "learning_rate": 1.7124405042090865e-05, + "loss": 1.3799, + "step": 819 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 5.08400535142629, + "learning_rate": 1.711473765526021e-05, + "loss": 1.3092, + "step": 820 + }, + { + "epoch": 0.32362650899236267, + "grad_norm": 1.5849311506474677, + "learning_rate": 1.7105056784164295e-05, + "loss": 1.2599, + "step": 821 + }, + { + "epoch": 0.3240206947524021, + "grad_norm": 1.0013431185133732, + "learning_rate": 1.7095362447150866e-05, + "loss": 1.3207, + "step": 822 + }, + { + "epoch": 0.3244148805124415, + "grad_norm": 0.6866727508748066, + "learning_rate": 1.7085654662593192e-05, + "loss": 1.2265, + "step": 823 + }, + { + "epoch": 0.3248090662724809, + "grad_norm": 0.7423237770798616, + "learning_rate": 1.7075933448890037e-05, + "loss": 1.2494, + "step": 824 + }, + { + "epoch": 0.3252032520325203, + "grad_norm": 0.7327984292482648, + "learning_rate": 1.706619882446561e-05, + "loss": 1.2826, + "step": 825 + }, + { + "epoch": 0.32559743779255973, + "grad_norm": 0.8307141447009255, + "learning_rate": 1.7056450807769543e-05, + "loss": 1.3328, + "step": 826 + }, + { + "epoch": 0.32599162355259914, + "grad_norm": 0.7685568008883157, + "learning_rate": 1.7046689417276836e-05, + "loss": 1.2668, + "step": 827 + }, + { + "epoch": 0.3263858093126386, + "grad_norm": 0.7143149682827579, + "learning_rate": 1.7036914671487854e-05, + "loss": 1.3147, + "step": 828 + }, + { + "epoch": 0.326779995072678, + "grad_norm": 0.7441227072240346, + "learning_rate": 1.7027126588928255e-05, + "loss": 1.2662, + "step": 829 + }, + { + "epoch": 0.32717418083271743, + "grad_norm": 0.8549422472836754, + "learning_rate": 1.701732518814899e-05, + "loss": 1.2276, + "step": 830 + }, + { + "epoch": 0.32756836659275684, + "grad_norm": 0.7104822685684634, + "learning_rate": 1.7007510487726247e-05, + "loss": 1.2174, + "step": 831 + }, + { + "epoch": 0.32796255235279625, + "grad_norm": 0.7990258038527759, + "learning_rate": 1.699768250626141e-05, + "loss": 1.2084, + "step": 832 + }, + { + "epoch": 0.32835673811283567, + "grad_norm": 0.7941920583151476, + "learning_rate": 1.698784126238105e-05, + "loss": 1.3014, + "step": 833 + }, + { + "epoch": 0.3287509238728751, + "grad_norm": 0.7565823644252784, + "learning_rate": 1.697798677473686e-05, + "loss": 1.3198, + "step": 834 + }, + { + "epoch": 0.3291451096329145, + "grad_norm": 0.776895609925856, + "learning_rate": 1.6968119062005644e-05, + "loss": 1.3171, + "step": 835 + }, + { + "epoch": 0.32953929539295396, + "grad_norm": 0.7511145926401521, + "learning_rate": 1.6958238142889258e-05, + "loss": 1.2645, + "step": 836 + }, + { + "epoch": 0.32993348115299337, + "grad_norm": 0.8590843085742348, + "learning_rate": 1.6948344036114604e-05, + "loss": 1.2381, + "step": 837 + }, + { + "epoch": 0.3303276669130328, + "grad_norm": 0.7298728955089272, + "learning_rate": 1.6938436760433565e-05, + "loss": 1.2919, + "step": 838 + }, + { + "epoch": 0.3307218526730722, + "grad_norm": 0.723873691001796, + "learning_rate": 1.6928516334622988e-05, + "loss": 1.2859, + "step": 839 + }, + { + "epoch": 0.3311160384331116, + "grad_norm": 0.6739547357750979, + "learning_rate": 1.6918582777484642e-05, + "loss": 1.2698, + "step": 840 + }, + { + "epoch": 0.331510224193151, + "grad_norm": 0.7603942315040987, + "learning_rate": 1.690863610784518e-05, + "loss": 1.3326, + "step": 841 + }, + { + "epoch": 0.3319044099531904, + "grad_norm": 0.7428516273827751, + "learning_rate": 1.689867634455612e-05, + "loss": 1.3044, + "step": 842 + }, + { + "epoch": 0.33229859571322984, + "grad_norm": 0.6987204595473288, + "learning_rate": 1.6888703506493774e-05, + "loss": 1.2418, + "step": 843 + }, + { + "epoch": 0.3326927814732693, + "grad_norm": 0.6798174720438129, + "learning_rate": 1.687871761255925e-05, + "loss": 1.2692, + "step": 844 + }, + { + "epoch": 0.3330869672333087, + "grad_norm": 0.6812029162107662, + "learning_rate": 1.6868718681678397e-05, + "loss": 1.2651, + "step": 845 + }, + { + "epoch": 0.33348115299334813, + "grad_norm": 5.833213521596053, + "learning_rate": 1.6858706732801767e-05, + "loss": 1.2184, + "step": 846 + }, + { + "epoch": 0.33387533875338754, + "grad_norm": 2.1210809511503856, + "learning_rate": 1.6848681784904597e-05, + "loss": 1.3386, + "step": 847 + }, + { + "epoch": 0.33426952451342695, + "grad_norm": 4.2587995536151135, + "learning_rate": 1.6838643856986746e-05, + "loss": 1.2538, + "step": 848 + }, + { + "epoch": 0.33466371027346636, + "grad_norm": 0.814091566447592, + "learning_rate": 1.682859296807268e-05, + "loss": 1.2472, + "step": 849 + }, + { + "epoch": 0.3350578960335058, + "grad_norm": 0.7308070804439674, + "learning_rate": 1.6818529137211427e-05, + "loss": 1.222, + "step": 850 + }, + { + "epoch": 0.3354520817935452, + "grad_norm": 0.733680332929859, + "learning_rate": 1.680845238347655e-05, + "loss": 1.2992, + "step": 851 + }, + { + "epoch": 0.33584626755358465, + "grad_norm": 0.7265681835122267, + "learning_rate": 1.6798362725966102e-05, + "loss": 1.2956, + "step": 852 + }, + { + "epoch": 0.33624045331362407, + "grad_norm": 0.7402397151917712, + "learning_rate": 1.6788260183802586e-05, + "loss": 1.3171, + "step": 853 + }, + { + "epoch": 0.3366346390736635, + "grad_norm": 0.7137092615288991, + "learning_rate": 1.6778144776132927e-05, + "loss": 1.2102, + "step": 854 + }, + { + "epoch": 0.3370288248337029, + "grad_norm": 0.7156854110239057, + "learning_rate": 1.6768016522128435e-05, + "loss": 1.3038, + "step": 855 + }, + { + "epoch": 0.3374230105937423, + "grad_norm": 0.711623409866771, + "learning_rate": 1.675787544098477e-05, + "loss": 1.2436, + "step": 856 + }, + { + "epoch": 0.3378171963537817, + "grad_norm": 0.7171571327488878, + "learning_rate": 1.6747721551921894e-05, + "loss": 1.2316, + "step": 857 + }, + { + "epoch": 0.3382113821138211, + "grad_norm": 0.8547583498487163, + "learning_rate": 1.6737554874184058e-05, + "loss": 1.2736, + "step": 858 + }, + { + "epoch": 0.33860556787386054, + "grad_norm": 0.7302470996316592, + "learning_rate": 1.6727375427039734e-05, + "loss": 1.3211, + "step": 859 + }, + { + "epoch": 0.3389997536339, + "grad_norm": 0.8374723063663263, + "learning_rate": 1.671718322978161e-05, + "loss": 1.22, + "step": 860 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.6807758814646102, + "learning_rate": 1.6706978301726523e-05, + "loss": 1.1737, + "step": 861 + }, + { + "epoch": 0.3397881251539788, + "grad_norm": 0.8925191795209313, + "learning_rate": 1.6696760662215457e-05, + "loss": 1.3089, + "step": 862 + }, + { + "epoch": 0.34018231091401824, + "grad_norm": 0.7669197207119955, + "learning_rate": 1.6686530330613472e-05, + "loss": 1.2567, + "step": 863 + }, + { + "epoch": 0.34057649667405765, + "grad_norm": 0.7863821853939692, + "learning_rate": 1.6676287326309684e-05, + "loss": 1.2913, + "step": 864 + }, + { + "epoch": 0.34097068243409706, + "grad_norm": 0.7288234899543948, + "learning_rate": 1.6666031668717246e-05, + "loss": 1.2282, + "step": 865 + }, + { + "epoch": 0.3413648681941365, + "grad_norm": 0.7392427569586649, + "learning_rate": 1.6655763377273258e-05, + "loss": 1.2523, + "step": 866 + }, + { + "epoch": 0.3417590539541759, + "grad_norm": 0.773906001259452, + "learning_rate": 1.6645482471438805e-05, + "loss": 1.2792, + "step": 867 + }, + { + "epoch": 0.34215323971421535, + "grad_norm": 0.7307235011238918, + "learning_rate": 1.6635188970698843e-05, + "loss": 1.2767, + "step": 868 + }, + { + "epoch": 0.34254742547425476, + "grad_norm": 0.7781474135830119, + "learning_rate": 1.662488289456222e-05, + "loss": 1.2846, + "step": 869 + }, + { + "epoch": 0.3429416112342942, + "grad_norm": 0.7962078143230832, + "learning_rate": 1.661456426256161e-05, + "loss": 1.256, + "step": 870 + }, + { + "epoch": 0.3433357969943336, + "grad_norm": 0.6984713096930648, + "learning_rate": 1.660423309425349e-05, + "loss": 1.2114, + "step": 871 + }, + { + "epoch": 0.343729982754373, + "grad_norm": 0.9653083144870128, + "learning_rate": 1.6593889409218084e-05, + "loss": 1.27, + "step": 872 + }, + { + "epoch": 0.3441241685144124, + "grad_norm": 0.7327421492980511, + "learning_rate": 1.6583533227059353e-05, + "loss": 1.2789, + "step": 873 + }, + { + "epoch": 0.3445183542744518, + "grad_norm": 0.7398126983540253, + "learning_rate": 1.657316456740494e-05, + "loss": 1.3085, + "step": 874 + }, + { + "epoch": 0.34491254003449123, + "grad_norm": 0.7299557711967728, + "learning_rate": 1.656278344990612e-05, + "loss": 1.2173, + "step": 875 + }, + { + "epoch": 0.3453067257945307, + "grad_norm": 0.6863138322240955, + "learning_rate": 1.6552389894237806e-05, + "loss": 1.2902, + "step": 876 + }, + { + "epoch": 0.3457009115545701, + "grad_norm": 0.7199868674478724, + "learning_rate": 1.6541983920098462e-05, + "loss": 1.2807, + "step": 877 + }, + { + "epoch": 0.3460950973146095, + "grad_norm": 0.7634746076633273, + "learning_rate": 1.6531565547210095e-05, + "loss": 1.2891, + "step": 878 + }, + { + "epoch": 0.34648928307464893, + "grad_norm": 0.7334440482002302, + "learning_rate": 1.6521134795318214e-05, + "loss": 1.2927, + "step": 879 + }, + { + "epoch": 0.34688346883468835, + "grad_norm": 0.7223249271668641, + "learning_rate": 1.6510691684191795e-05, + "loss": 1.328, + "step": 880 + }, + { + "epoch": 0.34727765459472776, + "grad_norm": 0.7283270674703335, + "learning_rate": 1.650023623362322e-05, + "loss": 1.2518, + "step": 881 + }, + { + "epoch": 0.34767184035476717, + "grad_norm": 0.6859617703188744, + "learning_rate": 1.648976846342827e-05, + "loss": 1.2036, + "step": 882 + }, + { + "epoch": 0.3480660261148066, + "grad_norm": 0.743057000636584, + "learning_rate": 1.647928839344608e-05, + "loss": 1.1975, + "step": 883 + }, + { + "epoch": 0.34846021187484605, + "grad_norm": 0.8879799533842352, + "learning_rate": 1.6468796043539082e-05, + "loss": 1.2689, + "step": 884 + }, + { + "epoch": 0.34885439763488546, + "grad_norm": 0.8750572793943686, + "learning_rate": 1.645829143359299e-05, + "loss": 1.2318, + "step": 885 + }, + { + "epoch": 0.34924858339492487, + "grad_norm": 0.7446142347770219, + "learning_rate": 1.6447774583516756e-05, + "loss": 1.2977, + "step": 886 + }, + { + "epoch": 0.3496427691549643, + "grad_norm": 0.7504423660668825, + "learning_rate": 1.6437245513242523e-05, + "loss": 1.2924, + "step": 887 + }, + { + "epoch": 0.3500369549150037, + "grad_norm": 0.7101861154635718, + "learning_rate": 1.6426704242725603e-05, + "loss": 1.2577, + "step": 888 + }, + { + "epoch": 0.3504311406750431, + "grad_norm": 0.747939528808994, + "learning_rate": 1.6416150791944422e-05, + "loss": 1.258, + "step": 889 + }, + { + "epoch": 0.3508253264350825, + "grad_norm": 0.8886537060733, + "learning_rate": 1.640558518090049e-05, + "loss": 1.2302, + "step": 890 + }, + { + "epoch": 0.35121951219512193, + "grad_norm": 0.7590526147979498, + "learning_rate": 1.639500742961838e-05, + "loss": 1.2814, + "step": 891 + }, + { + "epoch": 0.3516136979551614, + "grad_norm": 0.7361888142899841, + "learning_rate": 1.6384417558145654e-05, + "loss": 1.284, + "step": 892 + }, + { + "epoch": 0.3520078837152008, + "grad_norm": 0.7328949864046489, + "learning_rate": 1.637381558655286e-05, + "loss": 1.2238, + "step": 893 + }, + { + "epoch": 0.3524020694752402, + "grad_norm": 0.7763585243100655, + "learning_rate": 1.6363201534933465e-05, + "loss": 1.2669, + "step": 894 + }, + { + "epoch": 0.35279625523527963, + "grad_norm": 0.7724373870079227, + "learning_rate": 1.635257542340384e-05, + "loss": 1.2572, + "step": 895 + }, + { + "epoch": 0.35319044099531904, + "grad_norm": 0.7384217206450774, + "learning_rate": 1.6341937272103213e-05, + "loss": 1.2394, + "step": 896 + }, + { + "epoch": 0.35358462675535846, + "grad_norm": 0.910247717689576, + "learning_rate": 1.6331287101193625e-05, + "loss": 1.2368, + "step": 897 + }, + { + "epoch": 0.35397881251539787, + "grad_norm": 0.7158162891901805, + "learning_rate": 1.6320624930859905e-05, + "loss": 1.2402, + "step": 898 + }, + { + "epoch": 0.3543729982754373, + "grad_norm": 0.8329732085362143, + "learning_rate": 1.6309950781309612e-05, + "loss": 1.2966, + "step": 899 + }, + { + "epoch": 0.35476718403547675, + "grad_norm": 0.8155246854171831, + "learning_rate": 1.6299264672773025e-05, + "loss": 1.2497, + "step": 900 + }, + { + "epoch": 0.35516136979551616, + "grad_norm": 0.7837030128107672, + "learning_rate": 1.6288566625503076e-05, + "loss": 1.2868, + "step": 901 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.7235353047172081, + "learning_rate": 1.627785665977532e-05, + "loss": 1.3201, + "step": 902 + }, + { + "epoch": 0.355949741315595, + "grad_norm": 0.7380179619209855, + "learning_rate": 1.6267134795887914e-05, + "loss": 1.3081, + "step": 903 + }, + { + "epoch": 0.3563439270756344, + "grad_norm": 0.7592157290500411, + "learning_rate": 1.6256401054161565e-05, + "loss": 1.1903, + "step": 904 + }, + { + "epoch": 0.3567381128356738, + "grad_norm": 0.7467318769646345, + "learning_rate": 1.6245655454939474e-05, + "loss": 1.2442, + "step": 905 + }, + { + "epoch": 0.3571322985957132, + "grad_norm": 0.8375209294354106, + "learning_rate": 1.6234898018587336e-05, + "loss": 1.3645, + "step": 906 + }, + { + "epoch": 0.3575264843557526, + "grad_norm": 0.6897682274849407, + "learning_rate": 1.622412876549327e-05, + "loss": 1.2427, + "step": 907 + }, + { + "epoch": 0.3579206701157921, + "grad_norm": 0.6863050257352118, + "learning_rate": 1.621334771606778e-05, + "loss": 1.2618, + "step": 908 + }, + { + "epoch": 0.3583148558758315, + "grad_norm": 0.7753517670222771, + "learning_rate": 1.6202554890743754e-05, + "loss": 1.3007, + "step": 909 + }, + { + "epoch": 0.3587090416358709, + "grad_norm": 0.7259581040390859, + "learning_rate": 1.619175030997638e-05, + "loss": 1.2528, + "step": 910 + }, + { + "epoch": 0.35910322739591033, + "grad_norm": 0.7718789856797308, + "learning_rate": 1.6180933994243123e-05, + "loss": 1.3085, + "step": 911 + }, + { + "epoch": 0.35949741315594974, + "grad_norm": 0.7146087165544308, + "learning_rate": 1.6170105964043698e-05, + "loss": 1.2306, + "step": 912 + }, + { + "epoch": 0.35989159891598915, + "grad_norm": 0.7346445190650487, + "learning_rate": 1.6159266239900015e-05, + "loss": 1.2984, + "step": 913 + }, + { + "epoch": 0.36028578467602856, + "grad_norm": 0.6888116952571305, + "learning_rate": 1.614841484235616e-05, + "loss": 1.2657, + "step": 914 + }, + { + "epoch": 0.360679970436068, + "grad_norm": 0.6989568807671639, + "learning_rate": 1.6137551791978325e-05, + "loss": 1.2347, + "step": 915 + }, + { + "epoch": 0.36107415619610744, + "grad_norm": 0.6755063819703383, + "learning_rate": 1.61266771093548e-05, + "loss": 1.2551, + "step": 916 + }, + { + "epoch": 0.36146834195614685, + "grad_norm": 0.6534472286383475, + "learning_rate": 1.6115790815095914e-05, + "loss": 1.1829, + "step": 917 + }, + { + "epoch": 0.36186252771618627, + "grad_norm": 0.7262958248573816, + "learning_rate": 1.610489292983401e-05, + "loss": 1.31, + "step": 918 + }, + { + "epoch": 0.3622567134762257, + "grad_norm": 0.7648471804581862, + "learning_rate": 1.6093983474223392e-05, + "loss": 1.259, + "step": 919 + }, + { + "epoch": 0.3626508992362651, + "grad_norm": 0.7020781761512667, + "learning_rate": 1.6083062468940297e-05, + "loss": 1.3028, + "step": 920 + }, + { + "epoch": 0.3630450849963045, + "grad_norm": 0.6839393628121689, + "learning_rate": 1.6072129934682847e-05, + "loss": 1.2558, + "step": 921 + }, + { + "epoch": 0.3634392707563439, + "grad_norm": 0.7058988465923998, + "learning_rate": 1.606118589217102e-05, + "loss": 1.2582, + "step": 922 + }, + { + "epoch": 0.3638334565163833, + "grad_norm": 0.6791475273873648, + "learning_rate": 1.605023036214661e-05, + "loss": 1.2142, + "step": 923 + }, + { + "epoch": 0.3642276422764228, + "grad_norm": 0.6970350336814236, + "learning_rate": 1.6039263365373167e-05, + "loss": 1.2528, + "step": 924 + }, + { + "epoch": 0.3646218280364622, + "grad_norm": 0.6699695799738228, + "learning_rate": 1.602828492263598e-05, + "loss": 1.1959, + "step": 925 + }, + { + "epoch": 0.3650160137965016, + "grad_norm": 0.663408412743378, + "learning_rate": 1.6017295054742045e-05, + "loss": 1.288, + "step": 926 + }, + { + "epoch": 0.365410199556541, + "grad_norm": 0.7158290886170531, + "learning_rate": 1.6006293782519988e-05, + "loss": 1.2376, + "step": 927 + }, + { + "epoch": 0.36580438531658044, + "grad_norm": 0.7543773228580308, + "learning_rate": 1.5995281126820067e-05, + "loss": 1.2899, + "step": 928 + }, + { + "epoch": 0.36619857107661985, + "grad_norm": 0.744149002729838, + "learning_rate": 1.5984257108514107e-05, + "loss": 1.3389, + "step": 929 + }, + { + "epoch": 0.36659275683665926, + "grad_norm": 0.7182715748702388, + "learning_rate": 1.5973221748495472e-05, + "loss": 1.3381, + "step": 930 + }, + { + "epoch": 0.3669869425966987, + "grad_norm": 0.7001237272757365, + "learning_rate": 1.5962175067679013e-05, + "loss": 1.2702, + "step": 931 + }, + { + "epoch": 0.36738112835673814, + "grad_norm": 0.7077959320676287, + "learning_rate": 1.5951117087001048e-05, + "loss": 1.2647, + "step": 932 + }, + { + "epoch": 0.36777531411677755, + "grad_norm": 0.693416521429882, + "learning_rate": 1.5940047827419305e-05, + "loss": 1.307, + "step": 933 + }, + { + "epoch": 0.36816949987681696, + "grad_norm": 5.313983840642, + "learning_rate": 1.592896730991289e-05, + "loss": 1.3227, + "step": 934 + }, + { + "epoch": 0.3685636856368564, + "grad_norm": 0.7174526675424638, + "learning_rate": 1.591787555548225e-05, + "loss": 1.2003, + "step": 935 + }, + { + "epoch": 0.3689578713968958, + "grad_norm": 0.7620783078614348, + "learning_rate": 1.590677258514911e-05, + "loss": 1.2984, + "step": 936 + }, + { + "epoch": 0.3693520571569352, + "grad_norm": 0.7102280092234018, + "learning_rate": 1.5895658419956485e-05, + "loss": 1.1827, + "step": 937 + }, + { + "epoch": 0.3697462429169746, + "grad_norm": 0.7106880003780766, + "learning_rate": 1.588453308096857e-05, + "loss": 1.2557, + "step": 938 + }, + { + "epoch": 0.370140428677014, + "grad_norm": 0.7113617397724621, + "learning_rate": 1.587339658927077e-05, + "loss": 1.2874, + "step": 939 + }, + { + "epoch": 0.3705346144370535, + "grad_norm": 0.7043522365953943, + "learning_rate": 1.5862248965969604e-05, + "loss": 1.2596, + "step": 940 + }, + { + "epoch": 0.3709288001970929, + "grad_norm": 0.7433597815080879, + "learning_rate": 1.5851090232192704e-05, + "loss": 1.3157, + "step": 941 + }, + { + "epoch": 0.3713229859571323, + "grad_norm": 0.6920086062528787, + "learning_rate": 1.5839920409088743e-05, + "loss": 1.2526, + "step": 942 + }, + { + "epoch": 0.3717171717171717, + "grad_norm": 0.6806330894798819, + "learning_rate": 1.5828739517827426e-05, + "loss": 1.2665, + "step": 943 + }, + { + "epoch": 0.37211135747721114, + "grad_norm": 0.693773375915683, + "learning_rate": 1.5817547579599436e-05, + "loss": 1.2284, + "step": 944 + }, + { + "epoch": 0.37250554323725055, + "grad_norm": 0.679887610966136, + "learning_rate": 1.5806344615616375e-05, + "loss": 1.2231, + "step": 945 + }, + { + "epoch": 0.37289972899728996, + "grad_norm": 0.6898748206285744, + "learning_rate": 1.5795130647110755e-05, + "loss": 1.3302, + "step": 946 + }, + { + "epoch": 0.37329391475732937, + "grad_norm": 0.7348938348769922, + "learning_rate": 1.5783905695335947e-05, + "loss": 1.2388, + "step": 947 + }, + { + "epoch": 0.37368810051736884, + "grad_norm": 0.7160016591377841, + "learning_rate": 1.577266978156613e-05, + "loss": 1.2105, + "step": 948 + }, + { + "epoch": 0.37408228627740825, + "grad_norm": 0.840969755169091, + "learning_rate": 1.5761422927096268e-05, + "loss": 1.3243, + "step": 949 + }, + { + "epoch": 0.37447647203744766, + "grad_norm": 0.6987504047644173, + "learning_rate": 1.5750165153242048e-05, + "loss": 1.28, + "step": 950 + }, + { + "epoch": 0.3748706577974871, + "grad_norm": 0.6995543811490563, + "learning_rate": 1.5738896481339857e-05, + "loss": 1.2808, + "step": 951 + }, + { + "epoch": 0.3752648435575265, + "grad_norm": 0.7027815016727716, + "learning_rate": 1.5727616932746748e-05, + "loss": 1.348, + "step": 952 + }, + { + "epoch": 0.3756590293175659, + "grad_norm": 0.7080676371673893, + "learning_rate": 1.5716326528840374e-05, + "loss": 1.2808, + "step": 953 + }, + { + "epoch": 0.3760532150776053, + "grad_norm": 0.6906991486703912, + "learning_rate": 1.570502529101896e-05, + "loss": 1.2822, + "step": 954 + }, + { + "epoch": 0.3764474008376447, + "grad_norm": 0.667842860069977, + "learning_rate": 1.569371324070128e-05, + "loss": 1.3153, + "step": 955 + }, + { + "epoch": 0.3768415865976842, + "grad_norm": 0.6680351163338653, + "learning_rate": 1.5682390399326585e-05, + "loss": 1.2659, + "step": 956 + }, + { + "epoch": 0.3772357723577236, + "grad_norm": 0.6839204182409985, + "learning_rate": 1.5671056788354583e-05, + "loss": 1.2726, + "step": 957 + }, + { + "epoch": 0.377629958117763, + "grad_norm": 0.6663129665848542, + "learning_rate": 1.5659712429265403e-05, + "loss": 1.2778, + "step": 958 + }, + { + "epoch": 0.3780241438778024, + "grad_norm": 0.693810071056339, + "learning_rate": 1.5648357343559518e-05, + "loss": 1.313, + "step": 959 + }, + { + "epoch": 0.37841832963784183, + "grad_norm": 0.7242639411060869, + "learning_rate": 1.5636991552757762e-05, + "loss": 1.229, + "step": 960 + }, + { + "epoch": 0.37881251539788124, + "grad_norm": 0.6902168937478176, + "learning_rate": 1.5625615078401244e-05, + "loss": 1.2342, + "step": 961 + }, + { + "epoch": 0.37920670115792066, + "grad_norm": 0.6978251892798721, + "learning_rate": 1.561422794205131e-05, + "loss": 1.3456, + "step": 962 + }, + { + "epoch": 0.37960088691796007, + "grad_norm": 0.710891024016947, + "learning_rate": 1.5602830165289536e-05, + "loss": 1.2539, + "step": 963 + }, + { + "epoch": 0.37999507267799953, + "grad_norm": 0.6933794057288072, + "learning_rate": 1.5591421769717642e-05, + "loss": 1.2406, + "step": 964 + }, + { + "epoch": 0.38038925843803895, + "grad_norm": 0.6512417427643563, + "learning_rate": 1.5580002776957493e-05, + "loss": 1.2212, + "step": 965 + }, + { + "epoch": 0.38078344419807836, + "grad_norm": 0.6798711415370834, + "learning_rate": 1.5568573208651027e-05, + "loss": 1.2299, + "step": 966 + }, + { + "epoch": 0.38117762995811777, + "grad_norm": 0.7169966010210781, + "learning_rate": 1.555713308646022e-05, + "loss": 1.2823, + "step": 967 + }, + { + "epoch": 0.3815718157181572, + "grad_norm": 0.7176225879361188, + "learning_rate": 1.5545682432067068e-05, + "loss": 1.3277, + "step": 968 + }, + { + "epoch": 0.3819660014781966, + "grad_norm": 0.6634455855323579, + "learning_rate": 1.5534221267173513e-05, + "loss": 1.2707, + "step": 969 + }, + { + "epoch": 0.382360187238236, + "grad_norm": 0.6523220060774133, + "learning_rate": 1.5522749613501424e-05, + "loss": 1.2224, + "step": 970 + }, + { + "epoch": 0.3827543729982754, + "grad_norm": 0.697086935286512, + "learning_rate": 1.551126749279255e-05, + "loss": 1.2247, + "step": 971 + }, + { + "epoch": 0.3831485587583149, + "grad_norm": 0.6605814150970358, + "learning_rate": 1.5499774926808468e-05, + "loss": 1.2624, + "step": 972 + }, + { + "epoch": 0.3835427445183543, + "grad_norm": 0.7011947342499778, + "learning_rate": 1.5488271937330562e-05, + "loss": 1.2972, + "step": 973 + }, + { + "epoch": 0.3839369302783937, + "grad_norm": 0.693697524489148, + "learning_rate": 1.5476758546159966e-05, + "loss": 1.2054, + "step": 974 + }, + { + "epoch": 0.3843311160384331, + "grad_norm": 0.6700050469107739, + "learning_rate": 1.5465234775117538e-05, + "loss": 1.2642, + "step": 975 + }, + { + "epoch": 0.38472530179847253, + "grad_norm": 0.6977970028023794, + "learning_rate": 1.5453700646043793e-05, + "loss": 1.2929, + "step": 976 + }, + { + "epoch": 0.38511948755851194, + "grad_norm": 0.7256704791236026, + "learning_rate": 1.5442156180798883e-05, + "loss": 1.2111, + "step": 977 + }, + { + "epoch": 0.38551367331855135, + "grad_norm": 0.6833079658478705, + "learning_rate": 1.5430601401262554e-05, + "loss": 1.3011, + "step": 978 + }, + { + "epoch": 0.38590785907859076, + "grad_norm": 0.6451358367434681, + "learning_rate": 1.54190363293341e-05, + "loss": 1.1995, + "step": 979 + }, + { + "epoch": 0.38630204483863023, + "grad_norm": 0.6686548263294536, + "learning_rate": 1.540746098693231e-05, + "loss": 1.2538, + "step": 980 + }, + { + "epoch": 0.38669623059866964, + "grad_norm": 0.6858165127408108, + "learning_rate": 1.5395875395995456e-05, + "loss": 1.3015, + "step": 981 + }, + { + "epoch": 0.38709041635870906, + "grad_norm": 0.6603138490124963, + "learning_rate": 1.5384279578481223e-05, + "loss": 1.2443, + "step": 982 + }, + { + "epoch": 0.38748460211874847, + "grad_norm": 0.6594884559786018, + "learning_rate": 1.537267355636668e-05, + "loss": 1.2314, + "step": 983 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.6918016955048513, + "learning_rate": 1.536105735164823e-05, + "loss": 1.2714, + "step": 984 + }, + { + "epoch": 0.3882729736388273, + "grad_norm": 0.75800219367767, + "learning_rate": 1.5349430986341588e-05, + "loss": 1.2889, + "step": 985 + }, + { + "epoch": 0.3886671593988667, + "grad_norm": 0.7150281786878397, + "learning_rate": 1.5337794482481714e-05, + "loss": 1.2301, + "step": 986 + }, + { + "epoch": 0.3890613451589061, + "grad_norm": 0.6864306072281939, + "learning_rate": 1.5326147862122796e-05, + "loss": 1.2146, + "step": 987 + }, + { + "epoch": 0.3894555309189456, + "grad_norm": 0.7281857146660934, + "learning_rate": 1.531449114733818e-05, + "loss": 1.2998, + "step": 988 + }, + { + "epoch": 0.389849716678985, + "grad_norm": 0.7064026433919306, + "learning_rate": 1.5302824360220352e-05, + "loss": 1.213, + "step": 989 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.678827373077648, + "learning_rate": 1.5291147522880887e-05, + "loss": 1.2899, + "step": 990 + }, + { + "epoch": 0.3906380881990638, + "grad_norm": 0.6825912010344036, + "learning_rate": 1.5279460657450408e-05, + "loss": 1.2508, + "step": 991 + }, + { + "epoch": 0.3910322739591032, + "grad_norm": 0.6897275293582734, + "learning_rate": 1.5267763786078544e-05, + "loss": 1.3103, + "step": 992 + }, + { + "epoch": 0.39142645971914264, + "grad_norm": 0.6889677484856918, + "learning_rate": 1.5256056930933884e-05, + "loss": 1.2385, + "step": 993 + }, + { + "epoch": 0.39182064547918205, + "grad_norm": 0.6756715938128258, + "learning_rate": 1.5244340114203946e-05, + "loss": 1.2811, + "step": 994 + }, + { + "epoch": 0.39221483123922146, + "grad_norm": 0.6588263063642222, + "learning_rate": 1.5232613358095121e-05, + "loss": 1.2008, + "step": 995 + }, + { + "epoch": 0.39260901699926093, + "grad_norm": 0.6649629443766613, + "learning_rate": 1.522087668483264e-05, + "loss": 1.2887, + "step": 996 + }, + { + "epoch": 0.39300320275930034, + "grad_norm": 0.69537586560042, + "learning_rate": 1.5209130116660532e-05, + "loss": 1.2318, + "step": 997 + }, + { + "epoch": 0.39339738851933975, + "grad_norm": 0.6548532801163026, + "learning_rate": 1.5197373675841572e-05, + "loss": 1.2321, + "step": 998 + }, + { + "epoch": 0.39379157427937916, + "grad_norm": 0.6789611198366031, + "learning_rate": 1.5185607384657257e-05, + "loss": 1.2501, + "step": 999 + }, + { + "epoch": 0.3941857600394186, + "grad_norm": 0.669469647081716, + "learning_rate": 1.5173831265407749e-05, + "loss": 1.2316, + "step": 1000 + }, + { + "epoch": 0.394579945799458, + "grad_norm": 0.6441524856006325, + "learning_rate": 1.5162045340411826e-05, + "loss": 1.2215, + "step": 1001 + }, + { + "epoch": 0.3949741315594974, + "grad_norm": 0.6585151163796467, + "learning_rate": 1.5150249632006871e-05, + "loss": 1.2364, + "step": 1002 + }, + { + "epoch": 0.3953683173195368, + "grad_norm": 0.6590764235984096, + "learning_rate": 1.5138444162548791e-05, + "loss": 1.2507, + "step": 1003 + }, + { + "epoch": 0.3957625030795763, + "grad_norm": 0.6746142261487992, + "learning_rate": 1.5126628954412002e-05, + "loss": 1.3095, + "step": 1004 + }, + { + "epoch": 0.3961566888396157, + "grad_norm": 0.6425820917957424, + "learning_rate": 1.5114804029989372e-05, + "loss": 1.2455, + "step": 1005 + }, + { + "epoch": 0.3965508745996551, + "grad_norm": 0.6885768302093563, + "learning_rate": 1.5102969411692186e-05, + "loss": 1.2067, + "step": 1006 + }, + { + "epoch": 0.3969450603596945, + "grad_norm": 0.6715538405865114, + "learning_rate": 1.5091125121950105e-05, + "loss": 1.2723, + "step": 1007 + }, + { + "epoch": 0.3973392461197339, + "grad_norm": 0.6572204758977973, + "learning_rate": 1.5079271183211118e-05, + "loss": 1.2676, + "step": 1008 + }, + { + "epoch": 0.39773343187977334, + "grad_norm": 0.6913182919431603, + "learning_rate": 1.5067407617941499e-05, + "loss": 1.2723, + "step": 1009 + }, + { + "epoch": 0.39812761763981275, + "grad_norm": 0.6859364323759741, + "learning_rate": 1.5055534448625766e-05, + "loss": 1.2672, + "step": 1010 + }, + { + "epoch": 0.39852180339985216, + "grad_norm": 0.6924966624789022, + "learning_rate": 1.5043651697766642e-05, + "loss": 1.2032, + "step": 1011 + }, + { + "epoch": 0.3989159891598916, + "grad_norm": 0.696108235634334, + "learning_rate": 1.5031759387885008e-05, + "loss": 1.2286, + "step": 1012 + }, + { + "epoch": 0.39931017491993104, + "grad_norm": 0.683816830333667, + "learning_rate": 1.5019857541519866e-05, + "loss": 1.2596, + "step": 1013 + }, + { + "epoch": 0.39970436067997045, + "grad_norm": 0.6544409734476196, + "learning_rate": 1.5007946181228286e-05, + "loss": 1.1861, + "step": 1014 + }, + { + "epoch": 0.40009854644000986, + "grad_norm": 0.6828313055454289, + "learning_rate": 1.4996025329585368e-05, + "loss": 1.2627, + "step": 1015 + }, + { + "epoch": 0.4004927322000493, + "grad_norm": 0.7238896612698483, + "learning_rate": 1.4984095009184215e-05, + "loss": 1.2237, + "step": 1016 + }, + { + "epoch": 0.4008869179600887, + "grad_norm": 0.7255960311346755, + "learning_rate": 1.4972155242635853e-05, + "loss": 1.2553, + "step": 1017 + }, + { + "epoch": 0.4012811037201281, + "grad_norm": 0.6462351578732584, + "learning_rate": 1.496020605256923e-05, + "loss": 1.1924, + "step": 1018 + }, + { + "epoch": 0.4016752894801675, + "grad_norm": 0.6627446653808322, + "learning_rate": 1.4948247461631148e-05, + "loss": 1.237, + "step": 1019 + }, + { + "epoch": 0.402069475240207, + "grad_norm": 0.6825306611455508, + "learning_rate": 1.4936279492486222e-05, + "loss": 1.2397, + "step": 1020 + }, + { + "epoch": 0.4024636610002464, + "grad_norm": 0.7150438816039062, + "learning_rate": 1.4924302167816845e-05, + "loss": 1.2152, + "step": 1021 + }, + { + "epoch": 0.4028578467602858, + "grad_norm": 0.7093178992414255, + "learning_rate": 1.4912315510323138e-05, + "loss": 1.2576, + "step": 1022 + }, + { + "epoch": 0.4032520325203252, + "grad_norm": 0.6985543458898392, + "learning_rate": 1.4900319542722921e-05, + "loss": 1.2673, + "step": 1023 + }, + { + "epoch": 0.4036462182803646, + "grad_norm": 0.6831019226556653, + "learning_rate": 1.488831428775164e-05, + "loss": 1.2049, + "step": 1024 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 0.6567400662964415, + "learning_rate": 1.4876299768162361e-05, + "loss": 1.1799, + "step": 1025 + }, + { + "epoch": 0.40443458980044344, + "grad_norm": 0.6954618711419809, + "learning_rate": 1.48642760067257e-05, + "loss": 1.329, + "step": 1026 + }, + { + "epoch": 0.40482877556048286, + "grad_norm": 0.7107685604813471, + "learning_rate": 1.4852243026229787e-05, + "loss": 1.2487, + "step": 1027 + }, + { + "epoch": 0.4052229613205223, + "grad_norm": 0.674580720557361, + "learning_rate": 1.4840200849480226e-05, + "loss": 1.2157, + "step": 1028 + }, + { + "epoch": 0.40561714708056174, + "grad_norm": 0.6638304289674144, + "learning_rate": 1.4828149499300061e-05, + "loss": 1.314, + "step": 1029 + }, + { + "epoch": 0.40601133284060115, + "grad_norm": 0.6757193376832867, + "learning_rate": 1.4816088998529707e-05, + "loss": 1.1997, + "step": 1030 + }, + { + "epoch": 0.40640551860064056, + "grad_norm": 0.7111016241633684, + "learning_rate": 1.4804019370026927e-05, + "loss": 1.2307, + "step": 1031 + }, + { + "epoch": 0.40679970436067997, + "grad_norm": 0.6336887603576372, + "learning_rate": 1.4791940636666785e-05, + "loss": 1.2429, + "step": 1032 + }, + { + "epoch": 0.4071938901207194, + "grad_norm": 0.7121301295945476, + "learning_rate": 1.47798528213416e-05, + "loss": 1.2347, + "step": 1033 + }, + { + "epoch": 0.4075880758807588, + "grad_norm": 0.6798719496665275, + "learning_rate": 1.4767755946960902e-05, + "loss": 1.214, + "step": 1034 + }, + { + "epoch": 0.4079822616407982, + "grad_norm": 0.672163959841733, + "learning_rate": 1.4755650036451397e-05, + "loss": 1.2129, + "step": 1035 + }, + { + "epoch": 0.40837644740083767, + "grad_norm": 0.6580322284929199, + "learning_rate": 1.474353511275691e-05, + "loss": 1.233, + "step": 1036 + }, + { + "epoch": 0.4087706331608771, + "grad_norm": 0.8559124631644651, + "learning_rate": 1.4731411198838346e-05, + "loss": 1.3092, + "step": 1037 + }, + { + "epoch": 0.4091648189209165, + "grad_norm": 0.6612192406553391, + "learning_rate": 1.4719278317673655e-05, + "loss": 1.255, + "step": 1038 + }, + { + "epoch": 0.4095590046809559, + "grad_norm": 0.6480565858040689, + "learning_rate": 1.4707136492257783e-05, + "loss": 1.1938, + "step": 1039 + }, + { + "epoch": 0.4099531904409953, + "grad_norm": 0.6847017126697683, + "learning_rate": 1.4694985745602623e-05, + "loss": 1.2823, + "step": 1040 + }, + { + "epoch": 0.41034737620103473, + "grad_norm": 0.6625824656368514, + "learning_rate": 1.4682826100736973e-05, + "loss": 1.2196, + "step": 1041 + }, + { + "epoch": 0.41074156196107414, + "grad_norm": 0.6520046231301477, + "learning_rate": 1.4670657580706511e-05, + "loss": 1.2129, + "step": 1042 + }, + { + "epoch": 0.41113574772111355, + "grad_norm": 0.6568163192077175, + "learning_rate": 1.4658480208573717e-05, + "loss": 1.205, + "step": 1043 + }, + { + "epoch": 0.411529933481153, + "grad_norm": 0.7355354070775183, + "learning_rate": 1.4646294007417858e-05, + "loss": 1.2509, + "step": 1044 + }, + { + "epoch": 0.41192411924119243, + "grad_norm": 0.6584335682341751, + "learning_rate": 1.4634099000334932e-05, + "loss": 1.2131, + "step": 1045 + }, + { + "epoch": 0.41231830500123184, + "grad_norm": 0.6787385568676211, + "learning_rate": 1.4621895210437627e-05, + "loss": 1.2844, + "step": 1046 + }, + { + "epoch": 0.41271249076127126, + "grad_norm": 0.6534106417043676, + "learning_rate": 1.4609682660855277e-05, + "loss": 1.2036, + "step": 1047 + }, + { + "epoch": 0.41310667652131067, + "grad_norm": 0.6670476383359956, + "learning_rate": 1.4597461374733817e-05, + "loss": 1.2027, + "step": 1048 + }, + { + "epoch": 0.4135008622813501, + "grad_norm": 0.6869267202912966, + "learning_rate": 1.458523137523574e-05, + "loss": 1.2417, + "step": 1049 + }, + { + "epoch": 0.4138950480413895, + "grad_norm": 0.6825156046026267, + "learning_rate": 1.4572992685540057e-05, + "loss": 1.2732, + "step": 1050 + }, + { + "epoch": 0.4142892338014289, + "grad_norm": 0.6393859537214149, + "learning_rate": 1.4560745328842238e-05, + "loss": 1.2022, + "step": 1051 + }, + { + "epoch": 0.41468341956146837, + "grad_norm": 0.6783345452247255, + "learning_rate": 1.4548489328354197e-05, + "loss": 1.2039, + "step": 1052 + }, + { + "epoch": 0.4150776053215078, + "grad_norm": 0.6856742550565621, + "learning_rate": 1.4536224707304209e-05, + "loss": 1.2333, + "step": 1053 + }, + { + "epoch": 0.4154717910815472, + "grad_norm": 0.6797781228333333, + "learning_rate": 1.4523951488936905e-05, + "loss": 1.2458, + "step": 1054 + }, + { + "epoch": 0.4158659768415866, + "grad_norm": 0.6687542124726085, + "learning_rate": 1.4511669696513206e-05, + "loss": 1.2859, + "step": 1055 + }, + { + "epoch": 0.416260162601626, + "grad_norm": 0.654994598290333, + "learning_rate": 1.4499379353310275e-05, + "loss": 1.2514, + "step": 1056 + }, + { + "epoch": 0.4166543483616654, + "grad_norm": 0.6710277195302214, + "learning_rate": 1.4487080482621485e-05, + "loss": 1.1726, + "step": 1057 + }, + { + "epoch": 0.41704853412170484, + "grad_norm": 0.6975157864795727, + "learning_rate": 1.4474773107756379e-05, + "loss": 1.3039, + "step": 1058 + }, + { + "epoch": 0.41744271988174425, + "grad_norm": 0.6847631484475221, + "learning_rate": 1.4462457252040606e-05, + "loss": 1.2934, + "step": 1059 + }, + { + "epoch": 0.4178369056417837, + "grad_norm": 0.6569155149197007, + "learning_rate": 1.4450132938815896e-05, + "loss": 1.2399, + "step": 1060 + }, + { + "epoch": 0.41823109140182313, + "grad_norm": 0.6551116832105975, + "learning_rate": 1.443780019144e-05, + "loss": 1.2549, + "step": 1061 + }, + { + "epoch": 0.41862527716186254, + "grad_norm": 0.6908963315449874, + "learning_rate": 1.4425459033286664e-05, + "loss": 1.2723, + "step": 1062 + }, + { + "epoch": 0.41901946292190195, + "grad_norm": 0.669999734161243, + "learning_rate": 1.4413109487745571e-05, + "loss": 1.2034, + "step": 1063 + }, + { + "epoch": 0.41941364868194136, + "grad_norm": 0.6569047790921405, + "learning_rate": 1.4400751578222293e-05, + "loss": 1.2124, + "step": 1064 + }, + { + "epoch": 0.4198078344419808, + "grad_norm": 0.6641788447379324, + "learning_rate": 1.438838532813827e-05, + "loss": 1.2311, + "step": 1065 + }, + { + "epoch": 0.4202020202020202, + "grad_norm": 0.6421382945573415, + "learning_rate": 1.437601076093073e-05, + "loss": 1.2624, + "step": 1066 + }, + { + "epoch": 0.4205962059620596, + "grad_norm": 0.6987072260804941, + "learning_rate": 1.4363627900052676e-05, + "loss": 1.2533, + "step": 1067 + }, + { + "epoch": 0.42099039172209907, + "grad_norm": 0.7038543852283208, + "learning_rate": 1.435123676897283e-05, + "loss": 1.2362, + "step": 1068 + }, + { + "epoch": 0.4213845774821385, + "grad_norm": 0.6582422377441999, + "learning_rate": 1.4338837391175582e-05, + "loss": 1.2929, + "step": 1069 + }, + { + "epoch": 0.4217787632421779, + "grad_norm": 0.6549666509553242, + "learning_rate": 1.4326429790160958e-05, + "loss": 1.2912, + "step": 1070 + }, + { + "epoch": 0.4221729490022173, + "grad_norm": 0.6609389567208854, + "learning_rate": 1.4314013989444566e-05, + "loss": 1.2242, + "step": 1071 + }, + { + "epoch": 0.4225671347622567, + "grad_norm": 0.6742945321694513, + "learning_rate": 1.4301590012557553e-05, + "loss": 1.2606, + "step": 1072 + }, + { + "epoch": 0.4229613205222961, + "grad_norm": 0.6841196388200714, + "learning_rate": 1.4289157883046567e-05, + "loss": 1.1914, + "step": 1073 + }, + { + "epoch": 0.42335550628233554, + "grad_norm": 0.6781835047036432, + "learning_rate": 1.4276717624473697e-05, + "loss": 1.2149, + "step": 1074 + }, + { + "epoch": 0.42374969204237495, + "grad_norm": 0.6384771187611207, + "learning_rate": 1.4264269260416455e-05, + "loss": 1.194, + "step": 1075 + }, + { + "epoch": 0.4241438778024144, + "grad_norm": 0.6392205051998697, + "learning_rate": 1.4251812814467701e-05, + "loss": 1.2314, + "step": 1076 + }, + { + "epoch": 0.4245380635624538, + "grad_norm": 0.6789060040382907, + "learning_rate": 1.4239348310235613e-05, + "loss": 1.2207, + "step": 1077 + }, + { + "epoch": 0.42493224932249324, + "grad_norm": 0.6479589435408246, + "learning_rate": 1.4226875771343656e-05, + "loss": 1.2104, + "step": 1078 + }, + { + "epoch": 0.42532643508253265, + "grad_norm": 0.6575432784037729, + "learning_rate": 1.4214395221430501e-05, + "loss": 1.2749, + "step": 1079 + }, + { + "epoch": 0.42572062084257206, + "grad_norm": 0.701850378214208, + "learning_rate": 1.420190668415002e-05, + "loss": 1.2202, + "step": 1080 + }, + { + "epoch": 0.4261148066026115, + "grad_norm": 2.0536053216353896, + "learning_rate": 1.4189410183171214e-05, + "loss": 1.1963, + "step": 1081 + }, + { + "epoch": 0.4265089923626509, + "grad_norm": 0.6609999350419868, + "learning_rate": 1.417690574217818e-05, + "loss": 1.2504, + "step": 1082 + }, + { + "epoch": 0.4269031781226903, + "grad_norm": 0.6612267333571307, + "learning_rate": 1.4164393384870065e-05, + "loss": 1.2665, + "step": 1083 + }, + { + "epoch": 0.42729736388272976, + "grad_norm": 0.6757638887255789, + "learning_rate": 1.4151873134961014e-05, + "loss": 1.1514, + "step": 1084 + }, + { + "epoch": 0.4276915496427692, + "grad_norm": 0.683456163531099, + "learning_rate": 1.4139345016180135e-05, + "loss": 1.3079, + "step": 1085 + }, + { + "epoch": 0.4280857354028086, + "grad_norm": 0.8513875836873347, + "learning_rate": 1.4126809052271453e-05, + "loss": 1.2724, + "step": 1086 + }, + { + "epoch": 0.428479921162848, + "grad_norm": 0.6442638283664752, + "learning_rate": 1.4114265266993847e-05, + "loss": 1.2173, + "step": 1087 + }, + { + "epoch": 0.4288741069228874, + "grad_norm": 0.6509895157275494, + "learning_rate": 1.4101713684121042e-05, + "loss": 1.2479, + "step": 1088 + }, + { + "epoch": 0.4292682926829268, + "grad_norm": 0.6474693228576278, + "learning_rate": 1.408915432744152e-05, + "loss": 1.2125, + "step": 1089 + }, + { + "epoch": 0.42966247844296623, + "grad_norm": 0.6735783131189829, + "learning_rate": 1.407658722075851e-05, + "loss": 1.2068, + "step": 1090 + }, + { + "epoch": 0.43005666420300565, + "grad_norm": 0.6537663595057571, + "learning_rate": 1.406401238788992e-05, + "loss": 1.2156, + "step": 1091 + }, + { + "epoch": 0.4304508499630451, + "grad_norm": 0.6544657627047221, + "learning_rate": 1.4051429852668312e-05, + "loss": 1.2576, + "step": 1092 + }, + { + "epoch": 0.4308450357230845, + "grad_norm": 0.6301328044253675, + "learning_rate": 1.4038839638940835e-05, + "loss": 1.1426, + "step": 1093 + }, + { + "epoch": 0.43123922148312394, + "grad_norm": 0.6847962737010194, + "learning_rate": 1.4026241770569198e-05, + "loss": 1.1885, + "step": 1094 + }, + { + "epoch": 0.43163340724316335, + "grad_norm": 0.6471962172332811, + "learning_rate": 1.4013636271429612e-05, + "loss": 1.2111, + "step": 1095 + }, + { + "epoch": 0.43202759300320276, + "grad_norm": 0.6655421827524571, + "learning_rate": 1.4001023165412754e-05, + "loss": 1.2754, + "step": 1096 + }, + { + "epoch": 0.43242177876324217, + "grad_norm": 0.6748073371066969, + "learning_rate": 1.3988402476423722e-05, + "loss": 1.254, + "step": 1097 + }, + { + "epoch": 0.4328159645232816, + "grad_norm": 0.6557610559912413, + "learning_rate": 1.3975774228381975e-05, + "loss": 1.2439, + "step": 1098 + }, + { + "epoch": 0.433210150283321, + "grad_norm": 0.6632658788983514, + "learning_rate": 1.3963138445221311e-05, + "loss": 1.2516, + "step": 1099 + }, + { + "epoch": 0.43360433604336046, + "grad_norm": 0.6491486867598589, + "learning_rate": 1.3950495150889793e-05, + "loss": 1.2335, + "step": 1100 + }, + { + "epoch": 0.4339985218033999, + "grad_norm": 0.6517729673881756, + "learning_rate": 1.3937844369349736e-05, + "loss": 1.2167, + "step": 1101 + }, + { + "epoch": 0.4343927075634393, + "grad_norm": 0.6782382384926667, + "learning_rate": 1.3925186124577639e-05, + "loss": 1.2425, + "step": 1102 + }, + { + "epoch": 0.4347868933234787, + "grad_norm": 0.6591309286023143, + "learning_rate": 1.3912520440564139e-05, + "loss": 1.2043, + "step": 1103 + }, + { + "epoch": 0.4351810790835181, + "grad_norm": 0.6546464680178252, + "learning_rate": 1.3899847341313982e-05, + "loss": 1.1904, + "step": 1104 + }, + { + "epoch": 0.4355752648435575, + "grad_norm": 0.6446542186074286, + "learning_rate": 1.3887166850845963e-05, + "loss": 1.1976, + "step": 1105 + }, + { + "epoch": 0.43596945060359693, + "grad_norm": 0.6591279097552126, + "learning_rate": 1.3874478993192886e-05, + "loss": 1.2711, + "step": 1106 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.6877144132235246, + "learning_rate": 1.386178379240152e-05, + "loss": 1.2061, + "step": 1107 + }, + { + "epoch": 0.4367578221236758, + "grad_norm": 0.6207199280492006, + "learning_rate": 1.3849081272532545e-05, + "loss": 1.1999, + "step": 1108 + }, + { + "epoch": 0.4371520078837152, + "grad_norm": 0.6863520493826831, + "learning_rate": 1.383637145766052e-05, + "loss": 1.2781, + "step": 1109 + }, + { + "epoch": 0.43754619364375463, + "grad_norm": 0.6329597392455102, + "learning_rate": 1.3823654371873827e-05, + "loss": 1.2203, + "step": 1110 + }, + { + "epoch": 0.43794037940379404, + "grad_norm": 0.6453430853174527, + "learning_rate": 1.3810930039274626e-05, + "loss": 1.2341, + "step": 1111 + }, + { + "epoch": 0.43833456516383346, + "grad_norm": 0.7008614015575915, + "learning_rate": 1.3798198483978816e-05, + "loss": 1.3045, + "step": 1112 + }, + { + "epoch": 0.43872875092387287, + "grad_norm": 0.6526995169723234, + "learning_rate": 1.3785459730115975e-05, + "loss": 1.2444, + "step": 1113 + }, + { + "epoch": 0.4391229366839123, + "grad_norm": 0.6648665882412224, + "learning_rate": 1.3772713801829338e-05, + "loss": 1.2346, + "step": 1114 + }, + { + "epoch": 0.4395171224439517, + "grad_norm": 0.6521080562166568, + "learning_rate": 1.375996072327573e-05, + "loss": 1.2473, + "step": 1115 + }, + { + "epoch": 0.43991130820399116, + "grad_norm": 0.6354275169637564, + "learning_rate": 1.374720051862553e-05, + "loss": 1.2316, + "step": 1116 + }, + { + "epoch": 0.44030549396403057, + "grad_norm": 0.6614840460671958, + "learning_rate": 1.3734433212062617e-05, + "loss": 1.2004, + "step": 1117 + }, + { + "epoch": 0.44069967972407, + "grad_norm": 0.6662537159779596, + "learning_rate": 1.3721658827784335e-05, + "loss": 1.2901, + "step": 1118 + }, + { + "epoch": 0.4410938654841094, + "grad_norm": 0.6687056517988047, + "learning_rate": 1.3708877390001442e-05, + "loss": 1.2539, + "step": 1119 + }, + { + "epoch": 0.4414880512441488, + "grad_norm": 0.6733214755511964, + "learning_rate": 1.3696088922938065e-05, + "loss": 1.2515, + "step": 1120 + }, + { + "epoch": 0.4418822370041882, + "grad_norm": 0.6535655596127494, + "learning_rate": 1.3683293450831649e-05, + "loss": 1.2305, + "step": 1121 + }, + { + "epoch": 0.44227642276422763, + "grad_norm": 0.710139241305188, + "learning_rate": 1.3670490997932922e-05, + "loss": 1.3349, + "step": 1122 + }, + { + "epoch": 0.44267060852426704, + "grad_norm": 0.6301043045063337, + "learning_rate": 1.3657681588505835e-05, + "loss": 1.1704, + "step": 1123 + }, + { + "epoch": 0.4430647942843065, + "grad_norm": 0.6659655009342225, + "learning_rate": 1.3644865246827528e-05, + "loss": 1.2175, + "step": 1124 + }, + { + "epoch": 0.4434589800443459, + "grad_norm": 0.6562665211091786, + "learning_rate": 1.3632041997188278e-05, + "loss": 1.298, + "step": 1125 + }, + { + "epoch": 0.44385316580438533, + "grad_norm": 0.6649159181775033, + "learning_rate": 1.3619211863891458e-05, + "loss": 1.2194, + "step": 1126 + }, + { + "epoch": 0.44424735156442474, + "grad_norm": 0.6563076400799585, + "learning_rate": 1.3606374871253474e-05, + "loss": 1.2257, + "step": 1127 + }, + { + "epoch": 0.44464153732446415, + "grad_norm": 0.6289604646597672, + "learning_rate": 1.3593531043603756e-05, + "loss": 1.2144, + "step": 1128 + }, + { + "epoch": 0.44503572308450356, + "grad_norm": 1.1206270057176397, + "learning_rate": 1.3580680405284666e-05, + "loss": 1.1742, + "step": 1129 + }, + { + "epoch": 0.445429908844543, + "grad_norm": 0.7010573881465098, + "learning_rate": 1.3567822980651481e-05, + "loss": 1.2557, + "step": 1130 + }, + { + "epoch": 0.4458240946045824, + "grad_norm": 0.6819687881969332, + "learning_rate": 1.3554958794072346e-05, + "loss": 1.2628, + "step": 1131 + }, + { + "epoch": 0.44621828036462186, + "grad_norm": 0.6631424239254387, + "learning_rate": 1.3542087869928215e-05, + "loss": 1.2664, + "step": 1132 + }, + { + "epoch": 0.44661246612466127, + "grad_norm": 0.6884792830902806, + "learning_rate": 1.3529210232612815e-05, + "loss": 1.2151, + "step": 1133 + }, + { + "epoch": 0.4470066518847007, + "grad_norm": 0.6743020797905825, + "learning_rate": 1.3516325906532592e-05, + "loss": 1.2173, + "step": 1134 + }, + { + "epoch": 0.4474008376447401, + "grad_norm": 0.6748726425122616, + "learning_rate": 1.350343491610667e-05, + "loss": 1.2951, + "step": 1135 + }, + { + "epoch": 0.4477950234047795, + "grad_norm": 0.6790188323448472, + "learning_rate": 1.3490537285766809e-05, + "loss": 1.2548, + "step": 1136 + }, + { + "epoch": 0.4481892091648189, + "grad_norm": 0.7188066208980596, + "learning_rate": 1.3477633039957346e-05, + "loss": 1.3093, + "step": 1137 + }, + { + "epoch": 0.4485833949248583, + "grad_norm": 0.6778429503766523, + "learning_rate": 1.3464722203135164e-05, + "loss": 1.253, + "step": 1138 + }, + { + "epoch": 0.44897758068489774, + "grad_norm": 0.6610758959536769, + "learning_rate": 1.3451804799769625e-05, + "loss": 1.1997, + "step": 1139 + }, + { + "epoch": 0.4493717664449372, + "grad_norm": 0.6661694419731813, + "learning_rate": 1.3438880854342552e-05, + "loss": 1.2346, + "step": 1140 + }, + { + "epoch": 0.4497659522049766, + "grad_norm": 0.6668706103840563, + "learning_rate": 1.3425950391348154e-05, + "loss": 1.2652, + "step": 1141 + }, + { + "epoch": 0.450160137965016, + "grad_norm": 0.653413813618824, + "learning_rate": 1.3413013435293004e-05, + "loss": 1.1574, + "step": 1142 + }, + { + "epoch": 0.45055432372505544, + "grad_norm": 0.6626392658566362, + "learning_rate": 1.3400070010695966e-05, + "loss": 1.2584, + "step": 1143 + }, + { + "epoch": 0.45094850948509485, + "grad_norm": 0.6612645982158664, + "learning_rate": 1.3387120142088182e-05, + "loss": 1.3095, + "step": 1144 + }, + { + "epoch": 0.45134269524513426, + "grad_norm": 0.6343193781713191, + "learning_rate": 1.3374163854012987e-05, + "loss": 1.1738, + "step": 1145 + }, + { + "epoch": 0.4517368810051737, + "grad_norm": 0.6914178485118841, + "learning_rate": 1.33612011710259e-05, + "loss": 1.2289, + "step": 1146 + }, + { + "epoch": 0.4521310667652131, + "grad_norm": 0.6349842783208113, + "learning_rate": 1.3348232117694555e-05, + "loss": 1.1942, + "step": 1147 + }, + { + "epoch": 0.45252525252525255, + "grad_norm": 0.6878005677404854, + "learning_rate": 1.333525671859865e-05, + "loss": 1.2197, + "step": 1148 + }, + { + "epoch": 0.45291943828529196, + "grad_norm": 0.708515154245003, + "learning_rate": 1.3322274998329925e-05, + "loss": 1.217, + "step": 1149 + }, + { + "epoch": 0.4533136240453314, + "grad_norm": 0.6654307895746174, + "learning_rate": 1.3309286981492084e-05, + "loss": 1.2182, + "step": 1150 + }, + { + "epoch": 0.4537078098053708, + "grad_norm": 0.6849958565571799, + "learning_rate": 1.3296292692700781e-05, + "loss": 1.262, + "step": 1151 + }, + { + "epoch": 0.4541019955654102, + "grad_norm": 0.661458414456228, + "learning_rate": 1.3283292156583542e-05, + "loss": 1.2237, + "step": 1152 + }, + { + "epoch": 0.4544961813254496, + "grad_norm": 0.6445694725984406, + "learning_rate": 1.3270285397779743e-05, + "loss": 1.2046, + "step": 1153 + }, + { + "epoch": 0.454890367085489, + "grad_norm": 0.6880572438702209, + "learning_rate": 1.3257272440940559e-05, + "loss": 1.2517, + "step": 1154 + }, + { + "epoch": 0.45528455284552843, + "grad_norm": 0.6462853469948439, + "learning_rate": 1.324425331072889e-05, + "loss": 1.1937, + "step": 1155 + }, + { + "epoch": 0.4556787386055679, + "grad_norm": 0.6937504964864099, + "learning_rate": 1.3231228031819358e-05, + "loss": 1.2315, + "step": 1156 + }, + { + "epoch": 0.4560729243656073, + "grad_norm": 0.6935002768528703, + "learning_rate": 1.3218196628898232e-05, + "loss": 1.2941, + "step": 1157 + }, + { + "epoch": 0.4564671101256467, + "grad_norm": 0.6646155460144206, + "learning_rate": 1.320515912666338e-05, + "loss": 1.1961, + "step": 1158 + }, + { + "epoch": 0.45686129588568614, + "grad_norm": 0.675642433429094, + "learning_rate": 1.319211554982424e-05, + "loss": 1.1793, + "step": 1159 + }, + { + "epoch": 0.45725548164572555, + "grad_norm": 0.6626358544782226, + "learning_rate": 1.3179065923101759e-05, + "loss": 1.2279, + "step": 1160 + }, + { + "epoch": 0.45764966740576496, + "grad_norm": 0.6633366399850951, + "learning_rate": 1.3166010271228347e-05, + "loss": 1.2472, + "step": 1161 + }, + { + "epoch": 0.45804385316580437, + "grad_norm": 0.6572172161629819, + "learning_rate": 1.3152948618947839e-05, + "loss": 1.2959, + "step": 1162 + }, + { + "epoch": 0.4584380389258438, + "grad_norm": 0.6234010246471685, + "learning_rate": 1.3139880991015432e-05, + "loss": 1.1878, + "step": 1163 + }, + { + "epoch": 0.45883222468588325, + "grad_norm": 0.6445399860459299, + "learning_rate": 1.3126807412197666e-05, + "loss": 1.2468, + "step": 1164 + }, + { + "epoch": 0.45922641044592266, + "grad_norm": 0.6746604279800079, + "learning_rate": 1.3113727907272341e-05, + "loss": 1.2452, + "step": 1165 + }, + { + "epoch": 0.4596205962059621, + "grad_norm": 0.6634669603961608, + "learning_rate": 1.3100642501028502e-05, + "loss": 1.2124, + "step": 1166 + }, + { + "epoch": 0.4600147819660015, + "grad_norm": 0.6589031509633928, + "learning_rate": 1.3087551218266373e-05, + "loss": 1.2681, + "step": 1167 + }, + { + "epoch": 0.4604089677260409, + "grad_norm": 0.6488880528092997, + "learning_rate": 1.307445408379731e-05, + "loss": 1.2313, + "step": 1168 + }, + { + "epoch": 0.4608031534860803, + "grad_norm": 0.6461518831877928, + "learning_rate": 1.3061351122443774e-05, + "loss": 1.173, + "step": 1169 + }, + { + "epoch": 0.4611973392461197, + "grad_norm": 0.6719867860616543, + "learning_rate": 1.304824235903925e-05, + "loss": 1.2363, + "step": 1170 + }, + { + "epoch": 0.46159152500615913, + "grad_norm": 0.6720218506435118, + "learning_rate": 1.3035127818428239e-05, + "loss": 1.2999, + "step": 1171 + }, + { + "epoch": 0.4619857107661986, + "grad_norm": 0.6216405882359431, + "learning_rate": 1.302200752546618e-05, + "loss": 1.1873, + "step": 1172 + }, + { + "epoch": 0.462379896526238, + "grad_norm": 0.6615993873842473, + "learning_rate": 1.3008881505019413e-05, + "loss": 1.2329, + "step": 1173 + }, + { + "epoch": 0.4627740822862774, + "grad_norm": 0.6332451929136712, + "learning_rate": 1.2995749781965139e-05, + "loss": 1.1945, + "step": 1174 + }, + { + "epoch": 0.46316826804631683, + "grad_norm": 0.6600204388313866, + "learning_rate": 1.2982612381191368e-05, + "loss": 1.1736, + "step": 1175 + }, + { + "epoch": 0.46356245380635624, + "grad_norm": 0.6700748596784245, + "learning_rate": 1.296946932759686e-05, + "loss": 1.2847, + "step": 1176 + }, + { + "epoch": 0.46395663956639566, + "grad_norm": 0.6650184197669182, + "learning_rate": 1.2956320646091106e-05, + "loss": 1.2097, + "step": 1177 + }, + { + "epoch": 0.46435082532643507, + "grad_norm": 0.6626476795340289, + "learning_rate": 1.2943166361594242e-05, + "loss": 1.2041, + "step": 1178 + }, + { + "epoch": 0.4647450110864745, + "grad_norm": 0.6475300925870908, + "learning_rate": 1.293000649903704e-05, + "loss": 1.2847, + "step": 1179 + }, + { + "epoch": 0.46513919684651395, + "grad_norm": 0.6563755699385965, + "learning_rate": 1.2916841083360836e-05, + "loss": 1.2188, + "step": 1180 + }, + { + "epoch": 0.46553338260655336, + "grad_norm": 0.6558206126815487, + "learning_rate": 1.2903670139517495e-05, + "loss": 1.2171, + "step": 1181 + }, + { + "epoch": 0.46592756836659277, + "grad_norm": 0.6366861432284558, + "learning_rate": 1.2890493692469357e-05, + "loss": 1.2451, + "step": 1182 + }, + { + "epoch": 0.4663217541266322, + "grad_norm": 0.6759773243408979, + "learning_rate": 1.2877311767189192e-05, + "loss": 1.2673, + "step": 1183 + }, + { + "epoch": 0.4667159398866716, + "grad_norm": 0.6419744413255126, + "learning_rate": 1.2864124388660148e-05, + "loss": 1.1927, + "step": 1184 + }, + { + "epoch": 0.467110125646711, + "grad_norm": 0.6665800678685042, + "learning_rate": 1.2850931581875723e-05, + "loss": 1.241, + "step": 1185 + }, + { + "epoch": 0.4675043114067504, + "grad_norm": 0.647473022755396, + "learning_rate": 1.283773337183968e-05, + "loss": 1.2654, + "step": 1186 + }, + { + "epoch": 0.46789849716678983, + "grad_norm": 0.6627384520276431, + "learning_rate": 1.2824529783566044e-05, + "loss": 1.2103, + "step": 1187 + }, + { + "epoch": 0.4682926829268293, + "grad_norm": 0.6984420515522787, + "learning_rate": 1.2811320842079026e-05, + "loss": 1.2189, + "step": 1188 + }, + { + "epoch": 0.4686868686868687, + "grad_norm": 0.6838425822588616, + "learning_rate": 1.2798106572412973e-05, + "loss": 1.2817, + "step": 1189 + }, + { + "epoch": 0.4690810544469081, + "grad_norm": 0.6918032431384864, + "learning_rate": 1.278488699961235e-05, + "loss": 1.2529, + "step": 1190 + }, + { + "epoch": 0.46947524020694753, + "grad_norm": 0.6948726963202924, + "learning_rate": 1.2771662148731653e-05, + "loss": 1.2411, + "step": 1191 + }, + { + "epoch": 0.46986942596698694, + "grad_norm": 0.6429092095036071, + "learning_rate": 1.275843204483539e-05, + "loss": 1.2295, + "step": 1192 + }, + { + "epoch": 0.47026361172702635, + "grad_norm": 0.6351964026733381, + "learning_rate": 1.2745196712998032e-05, + "loss": 1.2073, + "step": 1193 + }, + { + "epoch": 0.47065779748706577, + "grad_norm": 0.6921674003382929, + "learning_rate": 1.2731956178303941e-05, + "loss": 1.2549, + "step": 1194 + }, + { + "epoch": 0.4710519832471052, + "grad_norm": 0.6322772440878668, + "learning_rate": 1.2718710465847355e-05, + "loss": 1.2263, + "step": 1195 + }, + { + "epoch": 0.47144616900714464, + "grad_norm": 0.6452486149856621, + "learning_rate": 1.2705459600732319e-05, + "loss": 1.2562, + "step": 1196 + }, + { + "epoch": 0.47184035476718406, + "grad_norm": 0.6629534381246308, + "learning_rate": 1.2692203608072646e-05, + "loss": 1.2418, + "step": 1197 + }, + { + "epoch": 0.47223454052722347, + "grad_norm": 0.6619087288650083, + "learning_rate": 1.2678942512991865e-05, + "loss": 1.1517, + "step": 1198 + }, + { + "epoch": 0.4726287262872629, + "grad_norm": 0.6639361742877278, + "learning_rate": 1.2665676340623172e-05, + "loss": 1.1919, + "step": 1199 + }, + { + "epoch": 0.4730229120473023, + "grad_norm": 0.6771450309425207, + "learning_rate": 1.2652405116109394e-05, + "loss": 1.2983, + "step": 1200 + }, + { + "epoch": 0.4734170978073417, + "grad_norm": 0.6592820641641075, + "learning_rate": 1.2639128864602932e-05, + "loss": 1.2035, + "step": 1201 + }, + { + "epoch": 0.4738112835673811, + "grad_norm": 0.6754237204338704, + "learning_rate": 1.2625847611265703e-05, + "loss": 1.2545, + "step": 1202 + }, + { + "epoch": 0.4742054693274205, + "grad_norm": 0.6746663309712343, + "learning_rate": 1.2612561381269113e-05, + "loss": 1.167, + "step": 1203 + }, + { + "epoch": 0.47459965508746, + "grad_norm": 0.6499219261911088, + "learning_rate": 1.2599270199794008e-05, + "loss": 1.2697, + "step": 1204 + }, + { + "epoch": 0.4749938408474994, + "grad_norm": 0.6496215506080194, + "learning_rate": 1.2585974092030597e-05, + "loss": 1.2177, + "step": 1205 + }, + { + "epoch": 0.4753880266075388, + "grad_norm": 0.6507804232904032, + "learning_rate": 1.2572673083178448e-05, + "loss": 1.2166, + "step": 1206 + }, + { + "epoch": 0.47578221236757823, + "grad_norm": 0.6350993220502519, + "learning_rate": 1.2559367198446401e-05, + "loss": 1.1809, + "step": 1207 + }, + { + "epoch": 0.47617639812761764, + "grad_norm": 0.6638184807925088, + "learning_rate": 1.254605646305255e-05, + "loss": 1.3182, + "step": 1208 + }, + { + "epoch": 0.47657058388765705, + "grad_norm": 0.638690190001186, + "learning_rate": 1.2532740902224171e-05, + "loss": 1.219, + "step": 1209 + }, + { + "epoch": 0.47696476964769646, + "grad_norm": 0.6431222064327176, + "learning_rate": 1.2519420541197696e-05, + "loss": 1.2105, + "step": 1210 + }, + { + "epoch": 0.4773589554077359, + "grad_norm": 0.6385515617572074, + "learning_rate": 1.2506095405218646e-05, + "loss": 1.2066, + "step": 1211 + }, + { + "epoch": 0.47775314116777534, + "grad_norm": 0.6625298662888042, + "learning_rate": 1.249276551954159e-05, + "loss": 1.2048, + "step": 1212 + }, + { + "epoch": 0.47814732692781475, + "grad_norm": 0.6511188776236311, + "learning_rate": 1.2479430909430109e-05, + "loss": 1.2683, + "step": 1213 + }, + { + "epoch": 0.47854151268785416, + "grad_norm": 0.6431132536314119, + "learning_rate": 1.2466091600156736e-05, + "loss": 1.2451, + "step": 1214 + }, + { + "epoch": 0.4789356984478936, + "grad_norm": 0.6639747730945537, + "learning_rate": 1.2452747617002902e-05, + "loss": 1.2442, + "step": 1215 + }, + { + "epoch": 0.479329884207933, + "grad_norm": 0.6533976794673589, + "learning_rate": 1.24393989852589e-05, + "loss": 1.2325, + "step": 1216 + }, + { + "epoch": 0.4797240699679724, + "grad_norm": 0.6457330805526268, + "learning_rate": 1.2426045730223842e-05, + "loss": 1.2082, + "step": 1217 + }, + { + "epoch": 0.4801182557280118, + "grad_norm": 0.6610877473382107, + "learning_rate": 1.2412687877205587e-05, + "loss": 1.2377, + "step": 1218 + }, + { + "epoch": 0.4805124414880512, + "grad_norm": 0.6592577931155573, + "learning_rate": 1.2399325451520718e-05, + "loss": 1.2529, + "step": 1219 + }, + { + "epoch": 0.4809066272480907, + "grad_norm": 0.6661159851544838, + "learning_rate": 1.2385958478494487e-05, + "loss": 1.3026, + "step": 1220 + }, + { + "epoch": 0.4813008130081301, + "grad_norm": 0.6643157743331228, + "learning_rate": 1.2372586983460755e-05, + "loss": 1.1742, + "step": 1221 + }, + { + "epoch": 0.4816949987681695, + "grad_norm": 0.6520829662785887, + "learning_rate": 1.2359210991761958e-05, + "loss": 1.2212, + "step": 1222 + }, + { + "epoch": 0.4820891845282089, + "grad_norm": 0.6421284812980386, + "learning_rate": 1.2345830528749059e-05, + "loss": 1.2352, + "step": 1223 + }, + { + "epoch": 0.48248337028824834, + "grad_norm": 0.6474967726372801, + "learning_rate": 1.233244561978149e-05, + "loss": 1.1619, + "step": 1224 + }, + { + "epoch": 0.48287755604828775, + "grad_norm": 0.6621910058206888, + "learning_rate": 1.2319056290227106e-05, + "loss": 1.2398, + "step": 1225 + }, + { + "epoch": 0.48327174180832716, + "grad_norm": 0.5884735021292232, + "learning_rate": 1.2305662565462146e-05, + "loss": 1.2038, + "step": 1226 + }, + { + "epoch": 0.48366592756836657, + "grad_norm": 0.641700494355378, + "learning_rate": 1.2292264470871183e-05, + "loss": 1.2872, + "step": 1227 + }, + { + "epoch": 0.48406011332840604, + "grad_norm": 0.6360792810507947, + "learning_rate": 1.2278862031847061e-05, + "loss": 1.237, + "step": 1228 + }, + { + "epoch": 0.48445429908844545, + "grad_norm": 0.6242051518141506, + "learning_rate": 1.226545527379086e-05, + "loss": 1.1896, + "step": 1229 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.6506990087447501, + "learning_rate": 1.2252044222111859e-05, + "loss": 1.1949, + "step": 1230 + }, + { + "epoch": 0.4852426706085243, + "grad_norm": 0.6592019538150893, + "learning_rate": 1.2238628902227454e-05, + "loss": 1.1833, + "step": 1231 + }, + { + "epoch": 0.4856368563685637, + "grad_norm": 0.6880800573570197, + "learning_rate": 1.2225209339563144e-05, + "loss": 1.2481, + "step": 1232 + }, + { + "epoch": 0.4860310421286031, + "grad_norm": 0.6700259002004992, + "learning_rate": 1.2211785559552472e-05, + "loss": 1.27, + "step": 1233 + }, + { + "epoch": 0.4864252278886425, + "grad_norm": 0.6679202540830845, + "learning_rate": 1.2198357587636958e-05, + "loss": 1.182, + "step": 1234 + }, + { + "epoch": 0.4868194136486819, + "grad_norm": 0.6583277626537555, + "learning_rate": 1.2184925449266083e-05, + "loss": 1.2575, + "step": 1235 + }, + { + "epoch": 0.4872135994087214, + "grad_norm": 0.6510891521467633, + "learning_rate": 1.2171489169897217e-05, + "loss": 1.216, + "step": 1236 + }, + { + "epoch": 0.4876077851687608, + "grad_norm": 0.697605524032823, + "learning_rate": 1.215804877499558e-05, + "loss": 1.2935, + "step": 1237 + }, + { + "epoch": 0.4880019709288002, + "grad_norm": 0.6752644934446952, + "learning_rate": 1.2144604290034193e-05, + "loss": 1.1875, + "step": 1238 + }, + { + "epoch": 0.4883961566888396, + "grad_norm": 0.6290688021299883, + "learning_rate": 1.2131155740493816e-05, + "loss": 1.1835, + "step": 1239 + }, + { + "epoch": 0.48879034244887903, + "grad_norm": 0.6341260406172561, + "learning_rate": 1.211770315186294e-05, + "loss": 1.2685, + "step": 1240 + }, + { + "epoch": 0.48918452820891845, + "grad_norm": 0.6299349925825592, + "learning_rate": 1.2104246549637683e-05, + "loss": 1.2167, + "step": 1241 + }, + { + "epoch": 0.48957871396895786, + "grad_norm": 0.6372753688281468, + "learning_rate": 1.2090785959321783e-05, + "loss": 1.2302, + "step": 1242 + }, + { + "epoch": 0.48997289972899727, + "grad_norm": 0.6420141409041106, + "learning_rate": 1.2077321406426542e-05, + "loss": 1.1826, + "step": 1243 + }, + { + "epoch": 0.49036708548903674, + "grad_norm": 0.6693778503790639, + "learning_rate": 1.2063852916470755e-05, + "loss": 1.2352, + "step": 1244 + }, + { + "epoch": 0.49076127124907615, + "grad_norm": 0.6667762505796914, + "learning_rate": 1.2050380514980697e-05, + "loss": 1.2304, + "step": 1245 + }, + { + "epoch": 0.49115545700911556, + "grad_norm": 0.6574623314489658, + "learning_rate": 1.2036904227490043e-05, + "loss": 1.2036, + "step": 1246 + }, + { + "epoch": 0.49154964276915497, + "grad_norm": 0.6576866899161838, + "learning_rate": 1.2023424079539841e-05, + "loss": 1.2693, + "step": 1247 + }, + { + "epoch": 0.4919438285291944, + "grad_norm": 0.6854866850287104, + "learning_rate": 1.2009940096678451e-05, + "loss": 1.2331, + "step": 1248 + }, + { + "epoch": 0.4923380142892338, + "grad_norm": 0.6591589410360849, + "learning_rate": 1.1996452304461502e-05, + "loss": 1.1481, + "step": 1249 + }, + { + "epoch": 0.4927322000492732, + "grad_norm": 0.657166055362852, + "learning_rate": 1.1982960728451847e-05, + "loss": 1.2066, + "step": 1250 + }, + { + "epoch": 0.4931263858093126, + "grad_norm": 0.6500616754839462, + "learning_rate": 1.1969465394219503e-05, + "loss": 1.2311, + "step": 1251 + }, + { + "epoch": 0.4935205715693521, + "grad_norm": 0.7215977353713153, + "learning_rate": 1.1955966327341614e-05, + "loss": 1.2991, + "step": 1252 + }, + { + "epoch": 0.4939147573293915, + "grad_norm": 0.6380629207396062, + "learning_rate": 1.1942463553402407e-05, + "loss": 1.1492, + "step": 1253 + }, + { + "epoch": 0.4943089430894309, + "grad_norm": 0.6438522141604093, + "learning_rate": 1.192895709799311e-05, + "loss": 1.2256, + "step": 1254 + }, + { + "epoch": 0.4947031288494703, + "grad_norm": 0.6829774495136759, + "learning_rate": 1.1915446986711953e-05, + "loss": 1.2092, + "step": 1255 + }, + { + "epoch": 0.49509731460950973, + "grad_norm": 0.6414485475773434, + "learning_rate": 1.1901933245164085e-05, + "loss": 1.1672, + "step": 1256 + }, + { + "epoch": 0.49549150036954914, + "grad_norm": 0.6353044864393161, + "learning_rate": 1.1888415898961538e-05, + "loss": 1.2124, + "step": 1257 + }, + { + "epoch": 0.49588568612958855, + "grad_norm": 0.6459942965869777, + "learning_rate": 1.1874894973723173e-05, + "loss": 1.2434, + "step": 1258 + }, + { + "epoch": 0.49627987188962797, + "grad_norm": 0.6455190632225122, + "learning_rate": 1.1861370495074631e-05, + "loss": 1.1948, + "step": 1259 + }, + { + "epoch": 0.49667405764966743, + "grad_norm": 0.6611317837642312, + "learning_rate": 1.1847842488648296e-05, + "loss": 1.2226, + "step": 1260 + }, + { + "epoch": 0.49706824340970684, + "grad_norm": 0.6438093407353985, + "learning_rate": 1.1834310980083234e-05, + "loss": 1.1885, + "step": 1261 + }, + { + "epoch": 0.49746242916974626, + "grad_norm": 0.6724323601652606, + "learning_rate": 1.1820775995025147e-05, + "loss": 1.2409, + "step": 1262 + }, + { + "epoch": 0.49785661492978567, + "grad_norm": 0.6748553238124116, + "learning_rate": 1.1807237559126325e-05, + "loss": 1.2272, + "step": 1263 + }, + { + "epoch": 0.4982508006898251, + "grad_norm": 0.6139036537344899, + "learning_rate": 1.1793695698045606e-05, + "loss": 1.2306, + "step": 1264 + }, + { + "epoch": 0.4986449864498645, + "grad_norm": 0.6274786131500468, + "learning_rate": 1.1780150437448308e-05, + "loss": 1.2436, + "step": 1265 + }, + { + "epoch": 0.4990391722099039, + "grad_norm": 0.6947108304184417, + "learning_rate": 1.1766601803006204e-05, + "loss": 1.2404, + "step": 1266 + }, + { + "epoch": 0.4994333579699433, + "grad_norm": 0.6330610294257072, + "learning_rate": 1.1753049820397449e-05, + "loss": 1.2661, + "step": 1267 + }, + { + "epoch": 0.4998275437299828, + "grad_norm": 0.6526188172174275, + "learning_rate": 1.1739494515306553e-05, + "loss": 1.2404, + "step": 1268 + }, + { + "epoch": 0.5002217294900222, + "grad_norm": 0.6669476058696817, + "learning_rate": 1.172593591342432e-05, + "loss": 1.2259, + "step": 1269 + }, + { + "epoch": 0.5006159152500615, + "grad_norm": 0.6632364458454981, + "learning_rate": 1.1712374040447802e-05, + "loss": 1.2059, + "step": 1270 + }, + { + "epoch": 0.501010101010101, + "grad_norm": 0.6580075066736768, + "learning_rate": 1.1698808922080248e-05, + "loss": 1.2125, + "step": 1271 + }, + { + "epoch": 0.5014042867701405, + "grad_norm": 0.6477489624350686, + "learning_rate": 1.1685240584031068e-05, + "loss": 1.2346, + "step": 1272 + }, + { + "epoch": 0.5017984725301798, + "grad_norm": 0.6536067797543117, + "learning_rate": 1.1671669052015757e-05, + "loss": 1.2087, + "step": 1273 + }, + { + "epoch": 0.5021926582902193, + "grad_norm": 0.6652544869437115, + "learning_rate": 1.1658094351755883e-05, + "loss": 1.2333, + "step": 1274 + }, + { + "epoch": 0.5025868440502587, + "grad_norm": 0.6600451654966094, + "learning_rate": 1.1644516508978998e-05, + "loss": 1.213, + "step": 1275 + }, + { + "epoch": 0.5029810298102981, + "grad_norm": 0.6590398336514781, + "learning_rate": 1.1630935549418627e-05, + "loss": 1.2184, + "step": 1276 + }, + { + "epoch": 0.5033752155703375, + "grad_norm": 0.660891374872714, + "learning_rate": 1.1617351498814199e-05, + "loss": 1.2451, + "step": 1277 + }, + { + "epoch": 0.503769401330377, + "grad_norm": 0.6091765102262902, + "learning_rate": 1.1603764382910989e-05, + "loss": 1.1412, + "step": 1278 + }, + { + "epoch": 0.5041635870904163, + "grad_norm": 0.6735824808082984, + "learning_rate": 1.1590174227460098e-05, + "loss": 1.1786, + "step": 1279 + }, + { + "epoch": 0.5045577728504558, + "grad_norm": 0.6532363704591942, + "learning_rate": 1.1576581058218375e-05, + "loss": 1.1864, + "step": 1280 + }, + { + "epoch": 0.5049519586104952, + "grad_norm": 0.6606502828456684, + "learning_rate": 1.156298490094839e-05, + "loss": 1.1888, + "step": 1281 + }, + { + "epoch": 0.5053461443705346, + "grad_norm": 0.6342921397541668, + "learning_rate": 1.1549385781418372e-05, + "loss": 1.2213, + "step": 1282 + }, + { + "epoch": 0.5057403301305741, + "grad_norm": 0.6689825246282982, + "learning_rate": 1.1535783725402163e-05, + "loss": 1.2618, + "step": 1283 + }, + { + "epoch": 0.5061345158906134, + "grad_norm": 0.640115147587615, + "learning_rate": 1.1522178758679172e-05, + "loss": 1.222, + "step": 1284 + }, + { + "epoch": 0.5065287016506529, + "grad_norm": 0.6676485619547307, + "learning_rate": 1.1508570907034325e-05, + "loss": 1.2239, + "step": 1285 + }, + { + "epoch": 0.5069228874106922, + "grad_norm": 0.6584471811582958, + "learning_rate": 1.1494960196258016e-05, + "loss": 1.2261, + "step": 1286 + }, + { + "epoch": 0.5073170731707317, + "grad_norm": 0.6313871712156794, + "learning_rate": 1.1481346652146057e-05, + "loss": 1.2352, + "step": 1287 + }, + { + "epoch": 0.5077112589307712, + "grad_norm": 0.6192657373849317, + "learning_rate": 1.1467730300499626e-05, + "loss": 1.2161, + "step": 1288 + }, + { + "epoch": 0.5081054446908105, + "grad_norm": 0.661823259158885, + "learning_rate": 1.1454111167125231e-05, + "loss": 1.1869, + "step": 1289 + }, + { + "epoch": 0.50849963045085, + "grad_norm": 0.6581281171795876, + "learning_rate": 1.1440489277834645e-05, + "loss": 1.2408, + "step": 1290 + }, + { + "epoch": 0.5088938162108894, + "grad_norm": 0.673672216319801, + "learning_rate": 1.1426864658444865e-05, + "loss": 1.2423, + "step": 1291 + }, + { + "epoch": 0.5092880019709288, + "grad_norm": 0.6709234458079614, + "learning_rate": 1.1413237334778064e-05, + "loss": 1.2092, + "step": 1292 + }, + { + "epoch": 0.5096821877309682, + "grad_norm": 0.6704668753810613, + "learning_rate": 1.139960733266154e-05, + "loss": 1.2005, + "step": 1293 + }, + { + "epoch": 0.5100763734910077, + "grad_norm": 0.6665476817077829, + "learning_rate": 1.1385974677927667e-05, + "loss": 1.2879, + "step": 1294 + }, + { + "epoch": 0.510470559251047, + "grad_norm": 0.6491129692417508, + "learning_rate": 1.1372339396413845e-05, + "loss": 1.2029, + "step": 1295 + }, + { + "epoch": 0.5108647450110865, + "grad_norm": 0.6370912475464865, + "learning_rate": 1.1358701513962457e-05, + "loss": 1.2327, + "step": 1296 + }, + { + "epoch": 0.5112589307711259, + "grad_norm": 0.648157038901389, + "learning_rate": 1.134506105642081e-05, + "loss": 1.2124, + "step": 1297 + }, + { + "epoch": 0.5116531165311653, + "grad_norm": 0.6461266035285687, + "learning_rate": 1.1331418049641091e-05, + "loss": 1.1982, + "step": 1298 + }, + { + "epoch": 0.5120473022912048, + "grad_norm": 0.6281200807330076, + "learning_rate": 1.1317772519480328e-05, + "loss": 1.2601, + "step": 1299 + }, + { + "epoch": 0.5124414880512441, + "grad_norm": 0.6422476551253151, + "learning_rate": 1.130412449180032e-05, + "loss": 1.1964, + "step": 1300 + }, + { + "epoch": 0.5128356738112836, + "grad_norm": 0.63650842337126, + "learning_rate": 1.1290473992467607e-05, + "loss": 1.2076, + "step": 1301 + }, + { + "epoch": 0.5132298595713229, + "grad_norm": 0.6773389045891938, + "learning_rate": 1.1276821047353403e-05, + "loss": 1.2352, + "step": 1302 + }, + { + "epoch": 0.5136240453313624, + "grad_norm": 0.6309296879156464, + "learning_rate": 1.1263165682333577e-05, + "loss": 1.1772, + "step": 1303 + }, + { + "epoch": 0.5140182310914019, + "grad_norm": 0.6765478799067353, + "learning_rate": 1.1249507923288563e-05, + "loss": 1.2115, + "step": 1304 + }, + { + "epoch": 0.5144124168514412, + "grad_norm": 0.6831067353554151, + "learning_rate": 1.1235847796103345e-05, + "loss": 1.2322, + "step": 1305 + }, + { + "epoch": 0.5148066026114807, + "grad_norm": 0.6680880986848273, + "learning_rate": 1.122218532666739e-05, + "loss": 1.2728, + "step": 1306 + }, + { + "epoch": 0.5152007883715201, + "grad_norm": 0.645405977896472, + "learning_rate": 1.1208520540874607e-05, + "loss": 1.2003, + "step": 1307 + }, + { + "epoch": 0.5155949741315595, + "grad_norm": 0.6696823139879742, + "learning_rate": 1.1194853464623294e-05, + "loss": 1.1981, + "step": 1308 + }, + { + "epoch": 0.5159891598915989, + "grad_norm": 0.6530439594705855, + "learning_rate": 1.1181184123816092e-05, + "loss": 1.1805, + "step": 1309 + }, + { + "epoch": 0.5163833456516383, + "grad_norm": 0.662122019391009, + "learning_rate": 1.1167512544359929e-05, + "loss": 1.2935, + "step": 1310 + }, + { + "epoch": 0.5167775314116777, + "grad_norm": 0.6515187138374906, + "learning_rate": 1.115383875216598e-05, + "loss": 1.236, + "step": 1311 + }, + { + "epoch": 0.5171717171717172, + "grad_norm": 0.6514508648345718, + "learning_rate": 1.1140162773149612e-05, + "loss": 1.1743, + "step": 1312 + }, + { + "epoch": 0.5175659029317566, + "grad_norm": 0.6440703774811735, + "learning_rate": 1.112648463323034e-05, + "loss": 1.2221, + "step": 1313 + }, + { + "epoch": 0.517960088691796, + "grad_norm": 0.6644581716811222, + "learning_rate": 1.1112804358331766e-05, + "loss": 1.1723, + "step": 1314 + }, + { + "epoch": 0.5183542744518355, + "grad_norm": 0.647476681026034, + "learning_rate": 1.1099121974381546e-05, + "loss": 1.2043, + "step": 1315 + }, + { + "epoch": 0.5187484602118748, + "grad_norm": 0.6615768891463015, + "learning_rate": 1.108543750731134e-05, + "loss": 1.1933, + "step": 1316 + }, + { + "epoch": 0.5191426459719143, + "grad_norm": 0.6352447330049817, + "learning_rate": 1.1071750983056733e-05, + "loss": 1.1965, + "step": 1317 + }, + { + "epoch": 0.5195368317319536, + "grad_norm": 0.6515803618281081, + "learning_rate": 1.105806242755723e-05, + "loss": 1.2412, + "step": 1318 + }, + { + "epoch": 0.5199310174919931, + "grad_norm": 0.6408728168852139, + "learning_rate": 1.1044371866756178e-05, + "loss": 1.2595, + "step": 1319 + }, + { + "epoch": 0.5203252032520326, + "grad_norm": 0.6136018250584243, + "learning_rate": 1.1030679326600726e-05, + "loss": 1.1597, + "step": 1320 + }, + { + "epoch": 0.5207193890120719, + "grad_norm": 0.6341434671207334, + "learning_rate": 1.1016984833041773e-05, + "loss": 1.1992, + "step": 1321 + }, + { + "epoch": 0.5211135747721114, + "grad_norm": 0.6539064660047773, + "learning_rate": 1.1003288412033923e-05, + "loss": 1.1332, + "step": 1322 + }, + { + "epoch": 0.5215077605321508, + "grad_norm": 0.6232171122795831, + "learning_rate": 1.0989590089535426e-05, + "loss": 1.2388, + "step": 1323 + }, + { + "epoch": 0.5219019462921902, + "grad_norm": 0.6877295201168714, + "learning_rate": 1.097588989150815e-05, + "loss": 1.2525, + "step": 1324 + }, + { + "epoch": 0.5222961320522296, + "grad_norm": 0.7115352113501258, + "learning_rate": 1.0962187843917498e-05, + "loss": 1.2115, + "step": 1325 + }, + { + "epoch": 0.522690317812269, + "grad_norm": 0.642946361400015, + "learning_rate": 1.0948483972732395e-05, + "loss": 1.2129, + "step": 1326 + }, + { + "epoch": 0.5230845035723084, + "grad_norm": 0.634552641474732, + "learning_rate": 1.0934778303925214e-05, + "loss": 1.1845, + "step": 1327 + }, + { + "epoch": 0.5234786893323479, + "grad_norm": 0.6716816812404441, + "learning_rate": 1.0921070863471732e-05, + "loss": 1.2202, + "step": 1328 + }, + { + "epoch": 0.5238728750923873, + "grad_norm": 0.6403984245235527, + "learning_rate": 1.09073616773511e-05, + "loss": 1.2436, + "step": 1329 + }, + { + "epoch": 0.5242670608524267, + "grad_norm": 0.6426802290331379, + "learning_rate": 1.089365077154576e-05, + "loss": 1.1759, + "step": 1330 + }, + { + "epoch": 0.5246612466124662, + "grad_norm": 0.6528320428327657, + "learning_rate": 1.0879938172041415e-05, + "loss": 1.234, + "step": 1331 + }, + { + "epoch": 0.5250554323725055, + "grad_norm": 0.6343235957872947, + "learning_rate": 1.0866223904826992e-05, + "loss": 1.1482, + "step": 1332 + }, + { + "epoch": 0.525449618132545, + "grad_norm": 0.635182058088562, + "learning_rate": 1.0852507995894558e-05, + "loss": 1.2054, + "step": 1333 + }, + { + "epoch": 0.5258438038925843, + "grad_norm": 0.6367031967484378, + "learning_rate": 1.0838790471239314e-05, + "loss": 1.1575, + "step": 1334 + }, + { + "epoch": 0.5262379896526238, + "grad_norm": 0.6402983704212438, + "learning_rate": 1.0825071356859502e-05, + "loss": 1.1966, + "step": 1335 + }, + { + "epoch": 0.5266321754126633, + "grad_norm": 0.6558137431376323, + "learning_rate": 1.0811350678756392e-05, + "loss": 1.2003, + "step": 1336 + }, + { + "epoch": 0.5270263611727026, + "grad_norm": 0.6387053585661903, + "learning_rate": 1.0797628462934214e-05, + "loss": 1.2108, + "step": 1337 + }, + { + "epoch": 0.5274205469327421, + "grad_norm": 0.6086598757639083, + "learning_rate": 1.0783904735400103e-05, + "loss": 1.1663, + "step": 1338 + }, + { + "epoch": 0.5278147326927815, + "grad_norm": 0.6399532215520667, + "learning_rate": 1.0770179522164079e-05, + "loss": 1.2112, + "step": 1339 + }, + { + "epoch": 0.5282089184528209, + "grad_norm": 0.6676098681703231, + "learning_rate": 1.0756452849238955e-05, + "loss": 1.2461, + "step": 1340 + }, + { + "epoch": 0.5286031042128603, + "grad_norm": 0.6540029616620948, + "learning_rate": 1.0742724742640323e-05, + "loss": 1.2397, + "step": 1341 + }, + { + "epoch": 0.5289972899728997, + "grad_norm": 0.6538972674770378, + "learning_rate": 1.0728995228386496e-05, + "loss": 1.2309, + "step": 1342 + }, + { + "epoch": 0.5293914757329391, + "grad_norm": 0.6772694870371185, + "learning_rate": 1.0715264332498445e-05, + "loss": 1.258, + "step": 1343 + }, + { + "epoch": 0.5297856614929786, + "grad_norm": 0.6376355859195808, + "learning_rate": 1.0701532080999762e-05, + "loss": 1.2376, + "step": 1344 + }, + { + "epoch": 0.530179847253018, + "grad_norm": 0.663394682115222, + "learning_rate": 1.0687798499916613e-05, + "loss": 1.2073, + "step": 1345 + }, + { + "epoch": 0.5305740330130574, + "grad_norm": 0.6701564343777298, + "learning_rate": 1.0674063615277681e-05, + "loss": 1.2365, + "step": 1346 + }, + { + "epoch": 0.5309682187730969, + "grad_norm": 0.6464607961695173, + "learning_rate": 1.0660327453114118e-05, + "loss": 1.1761, + "step": 1347 + }, + { + "epoch": 0.5313624045331362, + "grad_norm": 0.6383382398982943, + "learning_rate": 1.0646590039459499e-05, + "loss": 1.2069, + "step": 1348 + }, + { + "epoch": 0.5317565902931757, + "grad_norm": 0.7250328811363568, + "learning_rate": 1.063285140034977e-05, + "loss": 1.2748, + "step": 1349 + }, + { + "epoch": 0.532150776053215, + "grad_norm": 0.6218566182573235, + "learning_rate": 1.0619111561823208e-05, + "loss": 1.1792, + "step": 1350 + }, + { + "epoch": 0.5325449618132545, + "grad_norm": 0.6491294616401706, + "learning_rate": 1.060537054992034e-05, + "loss": 1.214, + "step": 1351 + }, + { + "epoch": 0.532939147573294, + "grad_norm": 0.6218758954772929, + "learning_rate": 1.0591628390683945e-05, + "loss": 1.1642, + "step": 1352 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.6423851142416096, + "learning_rate": 1.0577885110158959e-05, + "loss": 1.2269, + "step": 1353 + }, + { + "epoch": 0.5337275190933728, + "grad_norm": 0.6619276692624474, + "learning_rate": 1.0564140734392445e-05, + "loss": 1.2517, + "step": 1354 + }, + { + "epoch": 0.5341217048534121, + "grad_norm": 0.6486156036656686, + "learning_rate": 1.0550395289433553e-05, + "loss": 1.2318, + "step": 1355 + }, + { + "epoch": 0.5345158906134516, + "grad_norm": 0.6207033641119062, + "learning_rate": 1.0536648801333443e-05, + "loss": 1.22, + "step": 1356 + }, + { + "epoch": 0.534910076373491, + "grad_norm": 0.6286210196563511, + "learning_rate": 1.0522901296145263e-05, + "loss": 1.2087, + "step": 1357 + }, + { + "epoch": 0.5353042621335304, + "grad_norm": 0.6425274380062405, + "learning_rate": 1.0509152799924085e-05, + "loss": 1.2117, + "step": 1358 + }, + { + "epoch": 0.5356984478935698, + "grad_norm": 0.6192586936021032, + "learning_rate": 1.0495403338726862e-05, + "loss": 1.1948, + "step": 1359 + }, + { + "epoch": 0.5360926336536093, + "grad_norm": 0.6377697560605069, + "learning_rate": 1.0481652938612374e-05, + "loss": 1.2518, + "step": 1360 + }, + { + "epoch": 0.5364868194136487, + "grad_norm": 0.6359977533800316, + "learning_rate": 1.0467901625641174e-05, + "loss": 1.1883, + "step": 1361 + }, + { + "epoch": 0.5368810051736881, + "grad_norm": 0.6266522995098218, + "learning_rate": 1.045414942587556e-05, + "loss": 1.1223, + "step": 1362 + }, + { + "epoch": 0.5372751909337276, + "grad_norm": 0.6358734881969099, + "learning_rate": 1.0440396365379496e-05, + "loss": 1.2248, + "step": 1363 + }, + { + "epoch": 0.5376693766937669, + "grad_norm": 0.6182266673498269, + "learning_rate": 1.0426642470218587e-05, + "loss": 1.205, + "step": 1364 + }, + { + "epoch": 0.5380635624538064, + "grad_norm": 0.6485743617703122, + "learning_rate": 1.0412887766460017e-05, + "loss": 1.1979, + "step": 1365 + }, + { + "epoch": 0.5384577482138457, + "grad_norm": 0.6392709807479522, + "learning_rate": 1.0399132280172494e-05, + "loss": 1.2084, + "step": 1366 + }, + { + "epoch": 0.5388519339738852, + "grad_norm": 0.6545405852048852, + "learning_rate": 1.0385376037426227e-05, + "loss": 1.265, + "step": 1367 + }, + { + "epoch": 0.5392461197339247, + "grad_norm": 0.6496693130292205, + "learning_rate": 1.0371619064292844e-05, + "loss": 1.2467, + "step": 1368 + }, + { + "epoch": 0.539640305493964, + "grad_norm": 0.6835306554548173, + "learning_rate": 1.035786138684536e-05, + "loss": 1.2406, + "step": 1369 + }, + { + "epoch": 0.5400344912540035, + "grad_norm": 0.6433918833824575, + "learning_rate": 1.034410303115813e-05, + "loss": 1.2708, + "step": 1370 + }, + { + "epoch": 0.5404286770140428, + "grad_norm": 0.6391881556502016, + "learning_rate": 1.0330344023306791e-05, + "loss": 1.229, + "step": 1371 + }, + { + "epoch": 0.5408228627740823, + "grad_norm": 0.6778620828218745, + "learning_rate": 1.0316584389368213e-05, + "loss": 1.2611, + "step": 1372 + }, + { + "epoch": 0.5412170485341217, + "grad_norm": 0.6574985715883013, + "learning_rate": 1.0302824155420464e-05, + "loss": 1.2234, + "step": 1373 + }, + { + "epoch": 0.5416112342941611, + "grad_norm": 0.6714841683370039, + "learning_rate": 1.0289063347542727e-05, + "loss": 1.2057, + "step": 1374 + }, + { + "epoch": 0.5420054200542005, + "grad_norm": 0.646623331729815, + "learning_rate": 1.0275301991815299e-05, + "loss": 1.2366, + "step": 1375 + }, + { + "epoch": 0.54239960581424, + "grad_norm": 0.6267893952077622, + "learning_rate": 1.02615401143195e-05, + "loss": 1.2157, + "step": 1376 + }, + { + "epoch": 0.5427937915742794, + "grad_norm": 0.6430429787610838, + "learning_rate": 1.0247777741137636e-05, + "loss": 1.2459, + "step": 1377 + }, + { + "epoch": 0.5431879773343188, + "grad_norm": 0.6315063466990641, + "learning_rate": 1.0234014898352966e-05, + "loss": 1.2342, + "step": 1378 + }, + { + "epoch": 0.5435821630943583, + "grad_norm": 0.7220865603750691, + "learning_rate": 1.022025161204963e-05, + "loss": 1.2154, + "step": 1379 + }, + { + "epoch": 0.5439763488543976, + "grad_norm": 0.6377801583000084, + "learning_rate": 1.0206487908312607e-05, + "loss": 1.206, + "step": 1380 + }, + { + "epoch": 0.5443705346144371, + "grad_norm": 0.6319172744640024, + "learning_rate": 1.0192723813227672e-05, + "loss": 1.1919, + "step": 1381 + }, + { + "epoch": 0.5447647203744764, + "grad_norm": 0.6364897393407957, + "learning_rate": 1.0178959352881337e-05, + "loss": 1.2146, + "step": 1382 + }, + { + "epoch": 0.5451589061345159, + "grad_norm": 0.6688375716623369, + "learning_rate": 1.0165194553360813e-05, + "loss": 1.2469, + "step": 1383 + }, + { + "epoch": 0.5455530918945554, + "grad_norm": 0.662719310669721, + "learning_rate": 1.0151429440753948e-05, + "loss": 1.3032, + "step": 1384 + }, + { + "epoch": 0.5459472776545947, + "grad_norm": 0.6431824004552453, + "learning_rate": 1.0137664041149187e-05, + "loss": 1.2224, + "step": 1385 + }, + { + "epoch": 0.5463414634146342, + "grad_norm": 0.6397813243923787, + "learning_rate": 1.0123898380635515e-05, + "loss": 1.1647, + "step": 1386 + }, + { + "epoch": 0.5467356491746735, + "grad_norm": 0.6349500431531321, + "learning_rate": 1.011013248530241e-05, + "loss": 1.2286, + "step": 1387 + }, + { + "epoch": 0.547129834934713, + "grad_norm": 0.6355731398653511, + "learning_rate": 1.0096366381239808e-05, + "loss": 1.1548, + "step": 1388 + }, + { + "epoch": 0.5475240206947524, + "grad_norm": 0.6272297906309461, + "learning_rate": 1.0082600094538029e-05, + "loss": 1.2372, + "step": 1389 + }, + { + "epoch": 0.5479182064547918, + "grad_norm": 0.6514286635524038, + "learning_rate": 1.0068833651287736e-05, + "loss": 1.1854, + "step": 1390 + }, + { + "epoch": 0.5483123922148312, + "grad_norm": 0.6434159221463395, + "learning_rate": 1.0055067077579894e-05, + "loss": 1.1649, + "step": 1391 + }, + { + "epoch": 0.5487065779748707, + "grad_norm": 0.6534616096140339, + "learning_rate": 1.0041300399505724e-05, + "loss": 1.2058, + "step": 1392 + }, + { + "epoch": 0.5491007637349101, + "grad_norm": 0.6385843361048341, + "learning_rate": 1.0027533643156629e-05, + "loss": 1.206, + "step": 1393 + }, + { + "epoch": 0.5494949494949495, + "grad_norm": 0.654135497386305, + "learning_rate": 1.0013766834624168e-05, + "loss": 1.2947, + "step": 1394 + }, + { + "epoch": 0.549889135254989, + "grad_norm": 0.6527260856281124, + "learning_rate": 1e-05, + "loss": 1.2067, + "step": 1395 + }, + { + "epoch": 0.5502833210150283, + "grad_norm": 0.6456506343549768, + "learning_rate": 9.986233165375837e-06, + "loss": 1.2799, + "step": 1396 + }, + { + "epoch": 0.5506775067750678, + "grad_norm": 0.7246957748680044, + "learning_rate": 9.972466356843375e-06, + "loss": 1.3271, + "step": 1397 + }, + { + "epoch": 0.5510716925351071, + "grad_norm": 0.6399327077783894, + "learning_rate": 9.95869960049428e-06, + "loss": 1.2443, + "step": 1398 + }, + { + "epoch": 0.5514658782951466, + "grad_norm": 0.6241508398727628, + "learning_rate": 9.944932922420109e-06, + "loss": 1.2007, + "step": 1399 + }, + { + "epoch": 0.5518600640551861, + "grad_norm": 0.614559476153416, + "learning_rate": 9.931166348712268e-06, + "loss": 1.1704, + "step": 1400 + }, + { + "epoch": 0.5522542498152254, + "grad_norm": 0.6304080966033335, + "learning_rate": 9.917399905461974e-06, + "loss": 1.1869, + "step": 1401 + }, + { + "epoch": 0.5526484355752649, + "grad_norm": 0.6412439956786309, + "learning_rate": 9.903633618760195e-06, + "loss": 1.1782, + "step": 1402 + }, + { + "epoch": 0.5530426213353042, + "grad_norm": 0.6557358908407644, + "learning_rate": 9.889867514697591e-06, + "loss": 1.225, + "step": 1403 + }, + { + "epoch": 0.5534368070953437, + "grad_norm": 0.6212875821927828, + "learning_rate": 9.876101619364487e-06, + "loss": 1.196, + "step": 1404 + }, + { + "epoch": 0.5538309928553831, + "grad_norm": 0.613555231324674, + "learning_rate": 9.862335958850816e-06, + "loss": 1.1592, + "step": 1405 + }, + { + "epoch": 0.5542251786154225, + "grad_norm": 0.6745935115478964, + "learning_rate": 9.848570559246055e-06, + "loss": 1.1877, + "step": 1406 + }, + { + "epoch": 0.5546193643754619, + "grad_norm": 0.6410977347319441, + "learning_rate": 9.834805446639187e-06, + "loss": 1.1612, + "step": 1407 + }, + { + "epoch": 0.5550135501355014, + "grad_norm": 0.6309144641717204, + "learning_rate": 9.821040647118666e-06, + "loss": 1.1425, + "step": 1408 + }, + { + "epoch": 0.5554077358955408, + "grad_norm": 0.6299676272735365, + "learning_rate": 9.807276186772335e-06, + "loss": 1.208, + "step": 1409 + }, + { + "epoch": 0.5558019216555802, + "grad_norm": 0.6178102722375627, + "learning_rate": 9.793512091687396e-06, + "loss": 1.1846, + "step": 1410 + }, + { + "epoch": 0.5561961074156196, + "grad_norm": 0.622166600700565, + "learning_rate": 9.779748387950372e-06, + "loss": 1.1662, + "step": 1411 + }, + { + "epoch": 0.556590293175659, + "grad_norm": 0.6600214723637224, + "learning_rate": 9.765985101647037e-06, + "loss": 1.2892, + "step": 1412 + }, + { + "epoch": 0.5569844789356985, + "grad_norm": 0.6176714958995365, + "learning_rate": 9.752222258862364e-06, + "loss": 1.1706, + "step": 1413 + }, + { + "epoch": 0.5573786646957378, + "grad_norm": 0.5939231448625044, + "learning_rate": 9.738459885680502e-06, + "loss": 1.1488, + "step": 1414 + }, + { + "epoch": 0.5577728504557773, + "grad_norm": 0.6352717829639574, + "learning_rate": 9.724698008184705e-06, + "loss": 1.2017, + "step": 1415 + }, + { + "epoch": 0.5581670362158168, + "grad_norm": 0.6167223796720016, + "learning_rate": 9.710936652457276e-06, + "loss": 1.1228, + "step": 1416 + }, + { + "epoch": 0.5585612219758561, + "grad_norm": 0.6213254460946624, + "learning_rate": 9.69717584457954e-06, + "loss": 1.184, + "step": 1417 + }, + { + "epoch": 0.5589554077358956, + "grad_norm": 0.6131341167960235, + "learning_rate": 9.683415610631788e-06, + "loss": 1.161, + "step": 1418 + }, + { + "epoch": 0.5593495934959349, + "grad_norm": 0.6296617155093078, + "learning_rate": 9.669655976693214e-06, + "loss": 1.1642, + "step": 1419 + }, + { + "epoch": 0.5597437792559744, + "grad_norm": 0.6153554191014486, + "learning_rate": 9.655896968841873e-06, + "loss": 1.2156, + "step": 1420 + }, + { + "epoch": 0.5601379650160138, + "grad_norm": 0.6392439227341541, + "learning_rate": 9.642138613154643e-06, + "loss": 1.1957, + "step": 1421 + }, + { + "epoch": 0.5605321507760532, + "grad_norm": 0.6260052735651341, + "learning_rate": 9.62838093570716e-06, + "loss": 1.1974, + "step": 1422 + }, + { + "epoch": 0.5609263365360926, + "grad_norm": 0.6334362558009554, + "learning_rate": 9.614623962573776e-06, + "loss": 1.1965, + "step": 1423 + }, + { + "epoch": 0.561320522296132, + "grad_norm": 0.6179635946785395, + "learning_rate": 9.600867719827507e-06, + "loss": 1.1606, + "step": 1424 + }, + { + "epoch": 0.5617147080561715, + "grad_norm": 0.675892965228182, + "learning_rate": 9.587112233539988e-06, + "loss": 1.2698, + "step": 1425 + }, + { + "epoch": 0.5621088938162109, + "grad_norm": 0.6269199497256357, + "learning_rate": 9.573357529781413e-06, + "loss": 1.1738, + "step": 1426 + }, + { + "epoch": 0.5625030795762503, + "grad_norm": 0.6206668162899066, + "learning_rate": 9.559603634620505e-06, + "loss": 1.1545, + "step": 1427 + }, + { + "epoch": 0.5628972653362897, + "grad_norm": 0.6392518680745602, + "learning_rate": 9.545850574124444e-06, + "loss": 1.2394, + "step": 1428 + }, + { + "epoch": 0.5632914510963292, + "grad_norm": 0.6554357478989767, + "learning_rate": 9.532098374358828e-06, + "loss": 1.2056, + "step": 1429 + }, + { + "epoch": 0.5636856368563685, + "grad_norm": 0.6321993644191258, + "learning_rate": 9.518347061387629e-06, + "loss": 1.2424, + "step": 1430 + }, + { + "epoch": 0.564079822616408, + "grad_norm": 0.6342077276536365, + "learning_rate": 9.504596661273141e-06, + "loss": 1.216, + "step": 1431 + }, + { + "epoch": 0.5644740083764475, + "grad_norm": 0.655567194868911, + "learning_rate": 9.490847200075919e-06, + "loss": 1.2236, + "step": 1432 + }, + { + "epoch": 0.5648681941364868, + "grad_norm": 0.6452206424611665, + "learning_rate": 9.47709870385474e-06, + "loss": 1.1493, + "step": 1433 + }, + { + "epoch": 0.5652623798965263, + "grad_norm": 0.6551732071227462, + "learning_rate": 9.46335119866656e-06, + "loss": 1.2243, + "step": 1434 + }, + { + "epoch": 0.5656565656565656, + "grad_norm": 0.638292981830309, + "learning_rate": 9.449604710566452e-06, + "loss": 1.2154, + "step": 1435 + }, + { + "epoch": 0.5660507514166051, + "grad_norm": 0.6434536189993397, + "learning_rate": 9.435859265607555e-06, + "loss": 1.2622, + "step": 1436 + }, + { + "epoch": 0.5664449371766445, + "grad_norm": 0.6235727133771496, + "learning_rate": 9.422114889841045e-06, + "loss": 1.2097, + "step": 1437 + }, + { + "epoch": 0.5668391229366839, + "grad_norm": 0.6380544846865114, + "learning_rate": 9.40837160931606e-06, + "loss": 1.1931, + "step": 1438 + }, + { + "epoch": 0.5672333086967233, + "grad_norm": 0.6070307134735536, + "learning_rate": 9.394629450079661e-06, + "loss": 1.1728, + "step": 1439 + }, + { + "epoch": 0.5676274944567627, + "grad_norm": 0.6261762404486911, + "learning_rate": 9.380888438176797e-06, + "loss": 1.2047, + "step": 1440 + }, + { + "epoch": 0.5680216802168022, + "grad_norm": 0.6148402557876401, + "learning_rate": 9.367148599650231e-06, + "loss": 1.1782, + "step": 1441 + }, + { + "epoch": 0.5684158659768416, + "grad_norm": 0.6153367707877275, + "learning_rate": 9.353409960540506e-06, + "loss": 1.1333, + "step": 1442 + }, + { + "epoch": 0.568810051736881, + "grad_norm": 0.6401365387127351, + "learning_rate": 9.339672546885885e-06, + "loss": 1.2479, + "step": 1443 + }, + { + "epoch": 0.5692042374969204, + "grad_norm": 0.6301673949669812, + "learning_rate": 9.325936384722322e-06, + "loss": 1.2015, + "step": 1444 + }, + { + "epoch": 0.5695984232569599, + "grad_norm": 0.6286144736358145, + "learning_rate": 9.312201500083392e-06, + "loss": 1.2487, + "step": 1445 + }, + { + "epoch": 0.5699926090169992, + "grad_norm": 0.6171822342295599, + "learning_rate": 9.29846791900024e-06, + "loss": 1.1904, + "step": 1446 + }, + { + "epoch": 0.5703867947770387, + "grad_norm": 0.6428565759737676, + "learning_rate": 9.284735667501558e-06, + "loss": 1.1679, + "step": 1447 + }, + { + "epoch": 0.5707809805370782, + "grad_norm": 0.6151703289847316, + "learning_rate": 9.271004771613509e-06, + "loss": 1.1246, + "step": 1448 + }, + { + "epoch": 0.5711751662971175, + "grad_norm": 0.6398686829564575, + "learning_rate": 9.257275257359679e-06, + "loss": 1.1657, + "step": 1449 + }, + { + "epoch": 0.571569352057157, + "grad_norm": 0.6243382952424049, + "learning_rate": 9.243547150761047e-06, + "loss": 1.1966, + "step": 1450 + }, + { + "epoch": 0.5719635378171963, + "grad_norm": 0.6408741873334287, + "learning_rate": 9.229820477835926e-06, + "loss": 1.2205, + "step": 1451 + }, + { + "epoch": 0.5723577235772358, + "grad_norm": 0.633552764994025, + "learning_rate": 9.216095264599895e-06, + "loss": 1.2252, + "step": 1452 + }, + { + "epoch": 0.5727519093372752, + "grad_norm": 0.6511108996685305, + "learning_rate": 9.202371537065788e-06, + "loss": 1.2656, + "step": 1453 + }, + { + "epoch": 0.5731460950973146, + "grad_norm": 0.6529280803122515, + "learning_rate": 9.18864932124361e-06, + "loss": 1.2239, + "step": 1454 + }, + { + "epoch": 0.573540280857354, + "grad_norm": 0.647401441010935, + "learning_rate": 9.1749286431405e-06, + "loss": 1.2716, + "step": 1455 + }, + { + "epoch": 0.5739344666173934, + "grad_norm": 0.642622817859945, + "learning_rate": 9.161209528760691e-06, + "loss": 1.2222, + "step": 1456 + }, + { + "epoch": 0.5743286523774329, + "grad_norm": 0.6320811079325271, + "learning_rate": 9.147492004105443e-06, + "loss": 1.2481, + "step": 1457 + }, + { + "epoch": 0.5747228381374723, + "grad_norm": 0.6326782165239981, + "learning_rate": 9.133776095173015e-06, + "loss": 1.2739, + "step": 1458 + }, + { + "epoch": 0.5751170238975117, + "grad_norm": 0.6625216988220546, + "learning_rate": 9.120061827958586e-06, + "loss": 1.2355, + "step": 1459 + }, + { + "epoch": 0.5755112096575511, + "grad_norm": 0.6213952483408215, + "learning_rate": 9.106349228454242e-06, + "loss": 1.1701, + "step": 1460 + }, + { + "epoch": 0.5759053954175906, + "grad_norm": 0.6158204977575528, + "learning_rate": 9.092638322648904e-06, + "loss": 1.2463, + "step": 1461 + }, + { + "epoch": 0.5762995811776299, + "grad_norm": 0.6128069866736511, + "learning_rate": 9.078929136528267e-06, + "loss": 1.1581, + "step": 1462 + }, + { + "epoch": 0.5766937669376694, + "grad_norm": 0.6618087745723823, + "learning_rate": 9.06522169607479e-06, + "loss": 1.1823, + "step": 1463 + }, + { + "epoch": 0.5770879526977089, + "grad_norm": 0.6783150244501504, + "learning_rate": 9.05151602726761e-06, + "loss": 1.2302, + "step": 1464 + }, + { + "epoch": 0.5774821384577482, + "grad_norm": 0.6503369713306525, + "learning_rate": 9.037812156082503e-06, + "loss": 1.2407, + "step": 1465 + }, + { + "epoch": 0.5778763242177877, + "grad_norm": 0.6456712064826, + "learning_rate": 9.024110108491855e-06, + "loss": 1.1609, + "step": 1466 + }, + { + "epoch": 0.578270509977827, + "grad_norm": 0.6486197805925519, + "learning_rate": 9.010409910464575e-06, + "loss": 1.2222, + "step": 1467 + }, + { + "epoch": 0.5786646957378665, + "grad_norm": 0.7436596366499776, + "learning_rate": 8.996711587966079e-06, + "loss": 1.2581, + "step": 1468 + }, + { + "epoch": 0.5790588814979059, + "grad_norm": 0.6261635281880413, + "learning_rate": 8.983015166958228e-06, + "loss": 1.2161, + "step": 1469 + }, + { + "epoch": 0.5794530672579453, + "grad_norm": 0.6443605688870468, + "learning_rate": 8.969320673399276e-06, + "loss": 1.1791, + "step": 1470 + }, + { + "epoch": 0.5798472530179847, + "grad_norm": 0.671825587927519, + "learning_rate": 8.955628133243828e-06, + "loss": 1.218, + "step": 1471 + }, + { + "epoch": 0.5802414387780241, + "grad_norm": 0.6434248476334178, + "learning_rate": 8.941937572442773e-06, + "loss": 1.1846, + "step": 1472 + }, + { + "epoch": 0.5806356245380636, + "grad_norm": 0.6254667200582976, + "learning_rate": 8.92824901694327e-06, + "loss": 1.2353, + "step": 1473 + }, + { + "epoch": 0.581029810298103, + "grad_norm": 0.6232654021330023, + "learning_rate": 8.914562492688667e-06, + "loss": 1.114, + "step": 1474 + }, + { + "epoch": 0.5814239960581424, + "grad_norm": 0.6299635353186261, + "learning_rate": 8.900878025618453e-06, + "loss": 1.2504, + "step": 1475 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.6833411898307228, + "learning_rate": 8.887195641668235e-06, + "loss": 1.2404, + "step": 1476 + }, + { + "epoch": 0.5822123675782213, + "grad_norm": 0.6669528413277209, + "learning_rate": 8.873515366769666e-06, + "loss": 1.1557, + "step": 1477 + }, + { + "epoch": 0.5826065533382606, + "grad_norm": 0.6340389941502457, + "learning_rate": 8.85983722685039e-06, + "loss": 1.1978, + "step": 1478 + }, + { + "epoch": 0.5830007390983001, + "grad_norm": 0.6504266413875779, + "learning_rate": 8.846161247834024e-06, + "loss": 1.2026, + "step": 1479 + }, + { + "epoch": 0.5833949248583395, + "grad_norm": 0.623448080239467, + "learning_rate": 8.832487455640074e-06, + "loss": 1.1968, + "step": 1480 + }, + { + "epoch": 0.5837891106183789, + "grad_norm": 0.6377332989581492, + "learning_rate": 8.81881587618391e-06, + "loss": 1.1794, + "step": 1481 + }, + { + "epoch": 0.5841832963784184, + "grad_norm": 0.6487050264881453, + "learning_rate": 8.805146535376709e-06, + "loss": 1.2329, + "step": 1482 + }, + { + "epoch": 0.5845774821384577, + "grad_norm": 0.6866850553685105, + "learning_rate": 8.791479459125396e-06, + "loss": 1.2786, + "step": 1483 + }, + { + "epoch": 0.5849716678984972, + "grad_norm": 0.6241541462965179, + "learning_rate": 8.777814673332615e-06, + "loss": 1.1997, + "step": 1484 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 0.6488269216574984, + "learning_rate": 8.764152203896658e-06, + "loss": 1.1873, + "step": 1485 + }, + { + "epoch": 0.585760039418576, + "grad_norm": 0.6518659909159534, + "learning_rate": 8.750492076711439e-06, + "loss": 1.1964, + "step": 1486 + }, + { + "epoch": 0.5861542251786154, + "grad_norm": 0.6379498327658182, + "learning_rate": 8.736834317666428e-06, + "loss": 1.19, + "step": 1487 + }, + { + "epoch": 0.5865484109386548, + "grad_norm": 0.6073862610155873, + "learning_rate": 8.723178952646597e-06, + "loss": 1.1497, + "step": 1488 + }, + { + "epoch": 0.5869425966986943, + "grad_norm": 0.6335121996922907, + "learning_rate": 8.709526007532396e-06, + "loss": 1.1905, + "step": 1489 + }, + { + "epoch": 0.5873367824587337, + "grad_norm": 0.6478757542846147, + "learning_rate": 8.695875508199683e-06, + "loss": 1.1726, + "step": 1490 + }, + { + "epoch": 0.5877309682187731, + "grad_norm": 0.6801004693955225, + "learning_rate": 8.682227480519672e-06, + "loss": 1.1956, + "step": 1491 + }, + { + "epoch": 0.5881251539788125, + "grad_norm": 0.6869506155016226, + "learning_rate": 8.66858195035891e-06, + "loss": 1.2158, + "step": 1492 + }, + { + "epoch": 0.588519339738852, + "grad_norm": 0.6328860065449554, + "learning_rate": 8.654938943579194e-06, + "loss": 1.1986, + "step": 1493 + }, + { + "epoch": 0.5889135254988913, + "grad_norm": 0.6966371382556359, + "learning_rate": 8.641298486037543e-06, + "loss": 1.2219, + "step": 1494 + }, + { + "epoch": 0.5893077112589308, + "grad_norm": 0.6706456600510302, + "learning_rate": 8.627660603586157e-06, + "loss": 1.2992, + "step": 1495 + }, + { + "epoch": 0.5897018970189702, + "grad_norm": 0.6634528939701451, + "learning_rate": 8.614025322072338e-06, + "loss": 1.2412, + "step": 1496 + }, + { + "epoch": 0.5900960827790096, + "grad_norm": 0.6101971245071337, + "learning_rate": 8.600392667338465e-06, + "loss": 1.1347, + "step": 1497 + }, + { + "epoch": 0.5904902685390491, + "grad_norm": 0.640682969790413, + "learning_rate": 8.58676266522194e-06, + "loss": 1.2015, + "step": 1498 + }, + { + "epoch": 0.5908844542990884, + "grad_norm": 0.648892739773898, + "learning_rate": 8.573135341555138e-06, + "loss": 1.1751, + "step": 1499 + }, + { + "epoch": 0.5912786400591279, + "grad_norm": 0.6497240357012373, + "learning_rate": 8.55951072216536e-06, + "loss": 1.2231, + "step": 1500 + }, + { + "epoch": 0.5916728258191672, + "grad_norm": 0.653343396545042, + "learning_rate": 8.54588883287477e-06, + "loss": 1.1746, + "step": 1501 + }, + { + "epoch": 0.5920670115792067, + "grad_norm": 0.6432488267867399, + "learning_rate": 8.532269699500377e-06, + "loss": 1.1574, + "step": 1502 + }, + { + "epoch": 0.5924611973392461, + "grad_norm": 0.6545865486299587, + "learning_rate": 8.518653347853948e-06, + "loss": 1.2443, + "step": 1503 + }, + { + "epoch": 0.5928553830992855, + "grad_norm": 0.7869569426495164, + "learning_rate": 8.505039803741985e-06, + "loss": 1.2115, + "step": 1504 + }, + { + "epoch": 0.593249568859325, + "grad_norm": 0.61279157223736, + "learning_rate": 8.491429092965677e-06, + "loss": 1.1301, + "step": 1505 + }, + { + "epoch": 0.5936437546193644, + "grad_norm": 0.6584615054581199, + "learning_rate": 8.477821241320831e-06, + "loss": 1.1872, + "step": 1506 + }, + { + "epoch": 0.5940379403794038, + "grad_norm": 0.6651037222509211, + "learning_rate": 8.464216274597839e-06, + "loss": 1.1699, + "step": 1507 + }, + { + "epoch": 0.5944321261394432, + "grad_norm": 0.6192362295929023, + "learning_rate": 8.450614218581631e-06, + "loss": 1.2301, + "step": 1508 + }, + { + "epoch": 0.5948263118994827, + "grad_norm": 0.6063957302686086, + "learning_rate": 8.437015099051613e-06, + "loss": 1.1558, + "step": 1509 + }, + { + "epoch": 0.595220497659522, + "grad_norm": 0.6463493132821347, + "learning_rate": 8.42341894178163e-06, + "loss": 1.2595, + "step": 1510 + }, + { + "epoch": 0.5956146834195615, + "grad_norm": 0.6177688405321609, + "learning_rate": 8.409825772539905e-06, + "loss": 1.174, + "step": 1511 + }, + { + "epoch": 0.5960088691796009, + "grad_norm": 0.6181575708603189, + "learning_rate": 8.396235617089013e-06, + "loss": 1.1953, + "step": 1512 + }, + { + "epoch": 0.5964030549396403, + "grad_norm": 0.6232523590903218, + "learning_rate": 8.382648501185806e-06, + "loss": 1.2131, + "step": 1513 + }, + { + "epoch": 0.5967972406996798, + "grad_norm": 0.6853964780387746, + "learning_rate": 8.369064450581374e-06, + "loss": 1.2397, + "step": 1514 + }, + { + "epoch": 0.5971914264597191, + "grad_norm": 0.638261822593998, + "learning_rate": 8.355483491021007e-06, + "loss": 1.1697, + "step": 1515 + }, + { + "epoch": 0.5975856122197586, + "grad_norm": 0.6345858720982844, + "learning_rate": 8.341905648244122e-06, + "loss": 1.198, + "step": 1516 + }, + { + "epoch": 0.597979797979798, + "grad_norm": 0.6205371649965156, + "learning_rate": 8.328330947984243e-06, + "loss": 1.1509, + "step": 1517 + }, + { + "epoch": 0.5983739837398374, + "grad_norm": 0.6780688159415363, + "learning_rate": 8.314759415968936e-06, + "loss": 1.2359, + "step": 1518 + }, + { + "epoch": 0.5987681694998768, + "grad_norm": 0.6375070575615467, + "learning_rate": 8.301191077919753e-06, + "loss": 1.2035, + "step": 1519 + }, + { + "epoch": 0.5991623552599162, + "grad_norm": 0.622909906771207, + "learning_rate": 8.2876259595522e-06, + "loss": 1.2104, + "step": 1520 + }, + { + "epoch": 0.5995565410199557, + "grad_norm": 0.6094392519833095, + "learning_rate": 8.274064086575682e-06, + "loss": 1.1475, + "step": 1521 + }, + { + "epoch": 0.5999507267799951, + "grad_norm": 0.621252910798821, + "learning_rate": 8.260505484693449e-06, + "loss": 1.1864, + "step": 1522 + }, + { + "epoch": 0.6003449125400345, + "grad_norm": 0.6698438223208214, + "learning_rate": 8.246950179602554e-06, + "loss": 1.1991, + "step": 1523 + }, + { + "epoch": 0.6007390983000739, + "grad_norm": 0.6520795365380274, + "learning_rate": 8.2333981969938e-06, + "loss": 1.1769, + "step": 1524 + }, + { + "epoch": 0.6011332840601133, + "grad_norm": 0.6522360114294746, + "learning_rate": 8.219849562551695e-06, + "loss": 1.2025, + "step": 1525 + }, + { + "epoch": 0.6015274698201527, + "grad_norm": 0.6295823752577447, + "learning_rate": 8.206304301954397e-06, + "loss": 1.1339, + "step": 1526 + }, + { + "epoch": 0.6019216555801922, + "grad_norm": 0.6483586741712484, + "learning_rate": 8.192762440873675e-06, + "loss": 1.1893, + "step": 1527 + }, + { + "epoch": 0.6023158413402316, + "grad_norm": 0.6574976200875523, + "learning_rate": 8.179224004974857e-06, + "loss": 1.1948, + "step": 1528 + }, + { + "epoch": 0.602710027100271, + "grad_norm": 0.6592927070571326, + "learning_rate": 8.165689019916769e-06, + "loss": 1.1865, + "step": 1529 + }, + { + "epoch": 0.6031042128603105, + "grad_norm": 0.6602088196871608, + "learning_rate": 8.152157511351704e-06, + "loss": 1.2788, + "step": 1530 + }, + { + "epoch": 0.6034983986203498, + "grad_norm": 0.5966682622148229, + "learning_rate": 8.138629504925372e-06, + "loss": 1.1035, + "step": 1531 + }, + { + "epoch": 0.6038925843803893, + "grad_norm": 0.6472735298836796, + "learning_rate": 8.125105026276832e-06, + "loss": 1.2211, + "step": 1532 + }, + { + "epoch": 0.6042867701404286, + "grad_norm": 0.647741738867434, + "learning_rate": 8.111584101038462e-06, + "loss": 1.2187, + "step": 1533 + }, + { + "epoch": 0.6046809559004681, + "grad_norm": 0.6404826084219543, + "learning_rate": 8.098066754835916e-06, + "loss": 1.1788, + "step": 1534 + }, + { + "epoch": 0.6050751416605075, + "grad_norm": 0.6124100298486728, + "learning_rate": 8.084553013288048e-06, + "loss": 1.1426, + "step": 1535 + }, + { + "epoch": 0.6054693274205469, + "grad_norm": 0.6344901181171149, + "learning_rate": 8.071042902006896e-06, + "loss": 1.2431, + "step": 1536 + }, + { + "epoch": 0.6058635131805864, + "grad_norm": 0.6328920930143503, + "learning_rate": 8.057536446597598e-06, + "loss": 1.2025, + "step": 1537 + }, + { + "epoch": 0.6062576989406258, + "grad_norm": 0.6519280491300705, + "learning_rate": 8.044033672658387e-06, + "loss": 1.2351, + "step": 1538 + }, + { + "epoch": 0.6066518847006652, + "grad_norm": 0.6725946251767152, + "learning_rate": 8.0305346057805e-06, + "loss": 1.2485, + "step": 1539 + }, + { + "epoch": 0.6070460704607046, + "grad_norm": 0.657229000221368, + "learning_rate": 8.017039271548154e-06, + "loss": 1.1958, + "step": 1540 + }, + { + "epoch": 0.607440256220744, + "grad_norm": 0.63930798917721, + "learning_rate": 8.0035476955385e-06, + "loss": 1.2539, + "step": 1541 + }, + { + "epoch": 0.6078344419807834, + "grad_norm": 0.6356269105691521, + "learning_rate": 7.990059903321554e-06, + "loss": 1.174, + "step": 1542 + }, + { + "epoch": 0.6082286277408229, + "grad_norm": 0.6421402197109457, + "learning_rate": 7.97657592046016e-06, + "loss": 1.2085, + "step": 1543 + }, + { + "epoch": 0.6086228135008623, + "grad_norm": 0.6489422328975518, + "learning_rate": 7.96309577250996e-06, + "loss": 1.2387, + "step": 1544 + }, + { + "epoch": 0.6090169992609017, + "grad_norm": 0.6530006388057895, + "learning_rate": 7.949619485019307e-06, + "loss": 1.2009, + "step": 1545 + }, + { + "epoch": 0.6094111850209412, + "grad_norm": 0.6416958127168939, + "learning_rate": 7.936147083529245e-06, + "loss": 1.2154, + "step": 1546 + }, + { + "epoch": 0.6098053707809805, + "grad_norm": 0.6337303333525649, + "learning_rate": 7.922678593573462e-06, + "loss": 1.1974, + "step": 1547 + }, + { + "epoch": 0.61019955654102, + "grad_norm": 0.6637031259257837, + "learning_rate": 7.90921404067822e-06, + "loss": 1.2052, + "step": 1548 + }, + { + "epoch": 0.6105937423010593, + "grad_norm": 0.6473009660413165, + "learning_rate": 7.89575345036232e-06, + "loss": 1.2473, + "step": 1549 + }, + { + "epoch": 0.6109879280610988, + "grad_norm": 0.6261555671205469, + "learning_rate": 7.882296848137063e-06, + "loss": 1.2066, + "step": 1550 + }, + { + "epoch": 0.6113821138211382, + "grad_norm": 0.6177349103271258, + "learning_rate": 7.868844259506186e-06, + "loss": 1.1547, + "step": 1551 + }, + { + "epoch": 0.6117762995811776, + "grad_norm": 0.6264274304099752, + "learning_rate": 7.855395709965814e-06, + "loss": 1.2039, + "step": 1552 + }, + { + "epoch": 0.6121704853412171, + "grad_norm": 0.6208965372231373, + "learning_rate": 7.84195122500442e-06, + "loss": 1.1659, + "step": 1553 + }, + { + "epoch": 0.6125646711012565, + "grad_norm": 0.6182902432180839, + "learning_rate": 7.828510830102785e-06, + "loss": 1.1802, + "step": 1554 + }, + { + "epoch": 0.6129588568612959, + "grad_norm": 0.6010062493402437, + "learning_rate": 7.815074550733919e-06, + "loss": 1.1624, + "step": 1555 + }, + { + "epoch": 0.6133530426213353, + "grad_norm": 0.6100632398399762, + "learning_rate": 7.801642412363042e-06, + "loss": 1.1588, + "step": 1556 + }, + { + "epoch": 0.6137472283813747, + "grad_norm": 0.6244968785224004, + "learning_rate": 7.788214440447532e-06, + "loss": 1.16, + "step": 1557 + }, + { + "epoch": 0.6141414141414141, + "grad_norm": 0.6262394381187797, + "learning_rate": 7.774790660436857e-06, + "loss": 1.1379, + "step": 1558 + }, + { + "epoch": 0.6145355999014536, + "grad_norm": 0.6268360201286511, + "learning_rate": 7.761371097772548e-06, + "loss": 1.1632, + "step": 1559 + }, + { + "epoch": 0.614929785661493, + "grad_norm": 0.6450865669879012, + "learning_rate": 7.747955777888145e-06, + "loss": 1.1762, + "step": 1560 + }, + { + "epoch": 0.6153239714215324, + "grad_norm": 0.6424738031868468, + "learning_rate": 7.734544726209143e-06, + "loss": 1.1559, + "step": 1561 + }, + { + "epoch": 0.6157181571815719, + "grad_norm": 0.637950698301497, + "learning_rate": 7.721137968152944e-06, + "loss": 1.1831, + "step": 1562 + }, + { + "epoch": 0.6161123429416112, + "grad_norm": 0.6186538417807995, + "learning_rate": 7.707735529128819e-06, + "loss": 1.1962, + "step": 1563 + }, + { + "epoch": 0.6165065287016507, + "grad_norm": 0.6181805636977189, + "learning_rate": 7.694337434537856e-06, + "loss": 1.1768, + "step": 1564 + }, + { + "epoch": 0.61690071446169, + "grad_norm": 0.6254768111350152, + "learning_rate": 7.680943709772899e-06, + "loss": 1.1604, + "step": 1565 + }, + { + "epoch": 0.6172949002217295, + "grad_norm": 0.644104659671372, + "learning_rate": 7.667554380218513e-06, + "loss": 1.2107, + "step": 1566 + }, + { + "epoch": 0.6176890859817689, + "grad_norm": 0.6537180884599917, + "learning_rate": 7.654169471250945e-06, + "loss": 1.2834, + "step": 1567 + }, + { + "epoch": 0.6180832717418083, + "grad_norm": 0.6361808370235917, + "learning_rate": 7.640789008238044e-06, + "loss": 1.1062, + "step": 1568 + }, + { + "epoch": 0.6184774575018478, + "grad_norm": 0.6523288827402758, + "learning_rate": 7.627413016539247e-06, + "loss": 1.1986, + "step": 1569 + }, + { + "epoch": 0.6188716432618871, + "grad_norm": 0.6285054549406514, + "learning_rate": 7.614041521505517e-06, + "loss": 1.1758, + "step": 1570 + }, + { + "epoch": 0.6192658290219266, + "grad_norm": 0.6272952169331758, + "learning_rate": 7.6006745484792855e-06, + "loss": 1.1788, + "step": 1571 + }, + { + "epoch": 0.619660014781966, + "grad_norm": 0.6500656109205114, + "learning_rate": 7.587312122794414e-06, + "loss": 1.2231, + "step": 1572 + }, + { + "epoch": 0.6200542005420054, + "grad_norm": 0.6954118875061881, + "learning_rate": 7.5739542697761615e-06, + "loss": 1.2549, + "step": 1573 + }, + { + "epoch": 0.6204483863020448, + "grad_norm": 0.6226893727767379, + "learning_rate": 7.560601014741103e-06, + "loss": 1.1388, + "step": 1574 + }, + { + "epoch": 0.6208425720620843, + "grad_norm": 0.6505634755873115, + "learning_rate": 7.547252382997101e-06, + "loss": 1.2098, + "step": 1575 + }, + { + "epoch": 0.6212367578221237, + "grad_norm": 0.6498328807173522, + "learning_rate": 7.533908399843266e-06, + "loss": 1.1734, + "step": 1576 + }, + { + "epoch": 0.6216309435821631, + "grad_norm": 0.6761129099478455, + "learning_rate": 7.520569090569894e-06, + "loss": 1.1757, + "step": 1577 + }, + { + "epoch": 0.6220251293422026, + "grad_norm": 0.6971630762485974, + "learning_rate": 7.507234480458414e-06, + "loss": 1.2566, + "step": 1578 + }, + { + "epoch": 0.6224193151022419, + "grad_norm": 0.6237942794960373, + "learning_rate": 7.493904594781358e-06, + "loss": 1.1296, + "step": 1579 + }, + { + "epoch": 0.6228135008622814, + "grad_norm": 0.6295586177215396, + "learning_rate": 7.4805794588023086e-06, + "loss": 1.1169, + "step": 1580 + }, + { + "epoch": 0.6232076866223207, + "grad_norm": 0.6408732189903159, + "learning_rate": 7.4672590977758295e-06, + "loss": 1.1301, + "step": 1581 + }, + { + "epoch": 0.6236018723823602, + "grad_norm": 0.6771354689742808, + "learning_rate": 7.45394353694745e-06, + "loss": 1.2348, + "step": 1582 + }, + { + "epoch": 0.6239960581423996, + "grad_norm": 0.640613127950835, + "learning_rate": 7.4406328015536e-06, + "loss": 1.196, + "step": 1583 + }, + { + "epoch": 0.624390243902439, + "grad_norm": 0.650879151108994, + "learning_rate": 7.427326916821557e-06, + "loss": 1.1784, + "step": 1584 + }, + { + "epoch": 0.6247844296624785, + "grad_norm": 0.6596072847031024, + "learning_rate": 7.414025907969404e-06, + "loss": 1.2214, + "step": 1585 + }, + { + "epoch": 0.6251786154225178, + "grad_norm": 0.6278635059421687, + "learning_rate": 7.4007298002059965e-06, + "loss": 1.1567, + "step": 1586 + }, + { + "epoch": 0.6255728011825573, + "grad_norm": 0.6225891858209661, + "learning_rate": 7.387438618730891e-06, + "loss": 1.1644, + "step": 1587 + }, + { + "epoch": 0.6259669869425967, + "grad_norm": 0.6387712671736495, + "learning_rate": 7.3741523887343015e-06, + "loss": 1.1932, + "step": 1588 + }, + { + "epoch": 0.6263611727026361, + "grad_norm": 0.6731157388955487, + "learning_rate": 7.360871135397072e-06, + "loss": 1.2878, + "step": 1589 + }, + { + "epoch": 0.6267553584626755, + "grad_norm": 0.6067881423807671, + "learning_rate": 7.347594883890608e-06, + "loss": 1.1341, + "step": 1590 + }, + { + "epoch": 0.627149544222715, + "grad_norm": 0.6315807367438574, + "learning_rate": 7.3343236593768295e-06, + "loss": 1.15, + "step": 1591 + }, + { + "epoch": 0.6275437299827544, + "grad_norm": 0.6828787333827238, + "learning_rate": 7.321057487008136e-06, + "loss": 1.2797, + "step": 1592 + }, + { + "epoch": 0.6279379157427938, + "grad_norm": 0.636378285588495, + "learning_rate": 7.307796391927356e-06, + "loss": 1.2114, + "step": 1593 + }, + { + "epoch": 0.6283321015028333, + "grad_norm": 0.6227706869499603, + "learning_rate": 7.294540399267682e-06, + "loss": 1.2107, + "step": 1594 + }, + { + "epoch": 0.6287262872628726, + "grad_norm": 0.6542527940502086, + "learning_rate": 7.281289534152644e-06, + "loss": 1.1301, + "step": 1595 + }, + { + "epoch": 0.6291204730229121, + "grad_norm": 0.6481496871980028, + "learning_rate": 7.268043821696062e-06, + "loss": 1.2319, + "step": 1596 + }, + { + "epoch": 0.6295146587829514, + "grad_norm": 0.6445223927771241, + "learning_rate": 7.254803287001975e-06, + "loss": 1.2334, + "step": 1597 + }, + { + "epoch": 0.6299088445429909, + "grad_norm": 0.6329838727914758, + "learning_rate": 7.24156795516461e-06, + "loss": 1.1496, + "step": 1598 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.6299335180741068, + "learning_rate": 7.22833785126835e-06, + "loss": 1.184, + "step": 1599 + }, + { + "epoch": 0.6306972160630697, + "grad_norm": 0.6284096678702693, + "learning_rate": 7.215113000387654e-06, + "loss": 1.254, + "step": 1600 + }, + { + "epoch": 0.6310914018231092, + "grad_norm": 0.6324689716112708, + "learning_rate": 7.201893427587026e-06, + "loss": 1.1721, + "step": 1601 + }, + { + "epoch": 0.6314855875831485, + "grad_norm": 0.6858753419716495, + "learning_rate": 7.188679157920977e-06, + "loss": 1.1898, + "step": 1602 + }, + { + "epoch": 0.631879773343188, + "grad_norm": 0.6556988105872994, + "learning_rate": 7.1754702164339575e-06, + "loss": 1.2545, + "step": 1603 + }, + { + "epoch": 0.6322739591032274, + "grad_norm": 0.6195080831875678, + "learning_rate": 7.1622666281603235e-06, + "loss": 1.2272, + "step": 1604 + }, + { + "epoch": 0.6326681448632668, + "grad_norm": 0.6586395858980946, + "learning_rate": 7.149068418124281e-06, + "loss": 1.2194, + "step": 1605 + }, + { + "epoch": 0.6330623306233062, + "grad_norm": 0.6447888871223056, + "learning_rate": 7.1358756113398545e-06, + "loss": 1.2575, + "step": 1606 + }, + { + "epoch": 0.6334565163833457, + "grad_norm": 0.60959438103777, + "learning_rate": 7.122688232810815e-06, + "loss": 1.2215, + "step": 1607 + }, + { + "epoch": 0.6338507021433851, + "grad_norm": 0.6336168777241095, + "learning_rate": 7.109506307530646e-06, + "loss": 1.2274, + "step": 1608 + }, + { + "epoch": 0.6342448879034245, + "grad_norm": 0.6166032302997211, + "learning_rate": 7.096329860482507e-06, + "loss": 1.2061, + "step": 1609 + }, + { + "epoch": 0.634639073663464, + "grad_norm": 0.6674971360893448, + "learning_rate": 7.083158916639169e-06, + "loss": 1.3014, + "step": 1610 + }, + { + "epoch": 0.6350332594235033, + "grad_norm": 0.6542997563204203, + "learning_rate": 7.069993500962964e-06, + "loss": 1.139, + "step": 1611 + }, + { + "epoch": 0.6354274451835428, + "grad_norm": 0.6233870945052585, + "learning_rate": 7.056833638405762e-06, + "loss": 1.1705, + "step": 1612 + }, + { + "epoch": 0.6358216309435821, + "grad_norm": 0.6532480222627909, + "learning_rate": 7.043679353908901e-06, + "loss": 1.2109, + "step": 1613 + }, + { + "epoch": 0.6362158167036216, + "grad_norm": 0.6249185015676082, + "learning_rate": 7.0305306724031396e-06, + "loss": 1.1821, + "step": 1614 + }, + { + "epoch": 0.636610002463661, + "grad_norm": 0.6218410031542252, + "learning_rate": 7.017387618808634e-06, + "loss": 1.1483, + "step": 1615 + }, + { + "epoch": 0.6370041882237004, + "grad_norm": 0.6490684142962722, + "learning_rate": 7.0042502180348635e-06, + "loss": 1.2157, + "step": 1616 + }, + { + "epoch": 0.6373983739837399, + "grad_norm": 0.6034827634471542, + "learning_rate": 6.991118494980591e-06, + "loss": 1.1842, + "step": 1617 + }, + { + "epoch": 0.6377925597437792, + "grad_norm": 0.6274462711346118, + "learning_rate": 6.977992474533823e-06, + "loss": 1.2361, + "step": 1618 + }, + { + "epoch": 0.6381867455038187, + "grad_norm": 0.6760850255550227, + "learning_rate": 6.964872181571765e-06, + "loss": 1.1862, + "step": 1619 + }, + { + "epoch": 0.6385809312638581, + "grad_norm": 0.6396402151072694, + "learning_rate": 6.9517576409607545e-06, + "loss": 1.2231, + "step": 1620 + }, + { + "epoch": 0.6389751170238975, + "grad_norm": 0.6338829150069218, + "learning_rate": 6.938648877556231e-06, + "loss": 1.2246, + "step": 1621 + }, + { + "epoch": 0.6393693027839369, + "grad_norm": 0.6473593135129597, + "learning_rate": 6.925545916202692e-06, + "loss": 1.2431, + "step": 1622 + }, + { + "epoch": 0.6397634885439764, + "grad_norm": 0.6401312934763702, + "learning_rate": 6.912448781733633e-06, + "loss": 1.2157, + "step": 1623 + }, + { + "epoch": 0.6401576743040158, + "grad_norm": 0.6399148681302655, + "learning_rate": 6.8993574989714995e-06, + "loss": 1.1838, + "step": 1624 + }, + { + "epoch": 0.6405518600640552, + "grad_norm": 0.5966358662573188, + "learning_rate": 6.88627209272766e-06, + "loss": 1.1593, + "step": 1625 + }, + { + "epoch": 0.6409460458240946, + "grad_norm": 0.6516019968106155, + "learning_rate": 6.87319258780234e-06, + "loss": 1.1743, + "step": 1626 + }, + { + "epoch": 0.641340231584134, + "grad_norm": 0.623888477031532, + "learning_rate": 6.860119008984569e-06, + "loss": 1.2352, + "step": 1627 + }, + { + "epoch": 0.6417344173441735, + "grad_norm": 0.6462585435255515, + "learning_rate": 6.847051381052165e-06, + "loss": 1.1955, + "step": 1628 + }, + { + "epoch": 0.6421286031042128, + "grad_norm": 0.6285337684977241, + "learning_rate": 6.833989728771657e-06, + "loss": 1.2102, + "step": 1629 + }, + { + "epoch": 0.6425227888642523, + "grad_norm": 0.6313390139589669, + "learning_rate": 6.820934076898247e-06, + "loss": 1.209, + "step": 1630 + }, + { + "epoch": 0.6429169746242916, + "grad_norm": 0.6219389731857671, + "learning_rate": 6.8078844501757625e-06, + "loss": 1.1647, + "step": 1631 + }, + { + "epoch": 0.6433111603843311, + "grad_norm": 0.6255385020113866, + "learning_rate": 6.794840873336622e-06, + "loss": 1.2185, + "step": 1632 + }, + { + "epoch": 0.6437053461443706, + "grad_norm": 0.6214536562298445, + "learning_rate": 6.781803371101774e-06, + "loss": 1.2235, + "step": 1633 + }, + { + "epoch": 0.6440995319044099, + "grad_norm": 0.6520907124359351, + "learning_rate": 6.768771968180643e-06, + "loss": 1.2638, + "step": 1634 + }, + { + "epoch": 0.6444937176644494, + "grad_norm": 0.6349696744735929, + "learning_rate": 6.755746689271112e-06, + "loss": 1.2064, + "step": 1635 + }, + { + "epoch": 0.6448879034244888, + "grad_norm": 0.6202351218573725, + "learning_rate": 6.742727559059448e-06, + "loss": 1.2017, + "step": 1636 + }, + { + "epoch": 0.6452820891845282, + "grad_norm": 0.6114039580216786, + "learning_rate": 6.729714602220256e-06, + "loss": 1.1862, + "step": 1637 + }, + { + "epoch": 0.6456762749445676, + "grad_norm": 0.6747317843915315, + "learning_rate": 6.71670784341646e-06, + "loss": 1.2687, + "step": 1638 + }, + { + "epoch": 0.646070460704607, + "grad_norm": 0.6221379676750881, + "learning_rate": 6.703707307299224e-06, + "loss": 1.1739, + "step": 1639 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.6067484985660325, + "learning_rate": 6.690713018507917e-06, + "loss": 1.1716, + "step": 1640 + }, + { + "epoch": 0.6468588322246859, + "grad_norm": 0.6646806120765326, + "learning_rate": 6.677725001670078e-06, + "loss": 1.2563, + "step": 1641 + }, + { + "epoch": 0.6472530179847253, + "grad_norm": 0.6381676236429237, + "learning_rate": 6.664743281401351e-06, + "loss": 1.2079, + "step": 1642 + }, + { + "epoch": 0.6476472037447647, + "grad_norm": 0.6325821061959688, + "learning_rate": 6.651767882305447e-06, + "loss": 1.1695, + "step": 1643 + }, + { + "epoch": 0.6480413895048042, + "grad_norm": 0.6475669717517898, + "learning_rate": 6.6387988289741e-06, + "loss": 1.2316, + "step": 1644 + }, + { + "epoch": 0.6484355752648435, + "grad_norm": 0.6328642670845832, + "learning_rate": 6.625836145987015e-06, + "loss": 1.187, + "step": 1645 + }, + { + "epoch": 0.648829761024883, + "grad_norm": 0.6356937277383269, + "learning_rate": 6.612879857911825e-06, + "loss": 1.1713, + "step": 1646 + }, + { + "epoch": 0.6492239467849223, + "grad_norm": 0.6286143776886958, + "learning_rate": 6.599929989304034e-06, + "loss": 1.1949, + "step": 1647 + }, + { + "epoch": 0.6496181325449618, + "grad_norm": 0.6251531191060387, + "learning_rate": 6.5869865647069995e-06, + "loss": 1.1918, + "step": 1648 + }, + { + "epoch": 0.6500123183050013, + "grad_norm": 0.6111849191258597, + "learning_rate": 6.574049608651849e-06, + "loss": 1.1922, + "step": 1649 + }, + { + "epoch": 0.6504065040650406, + "grad_norm": 0.6172328892977227, + "learning_rate": 6.561119145657451e-06, + "loss": 1.2013, + "step": 1650 + }, + { + "epoch": 0.6508006898250801, + "grad_norm": 0.6563068727145971, + "learning_rate": 6.548195200230376e-06, + "loss": 1.1936, + "step": 1651 + }, + { + "epoch": 0.6511948755851195, + "grad_norm": 0.6451511184566149, + "learning_rate": 6.535277796864842e-06, + "loss": 1.1765, + "step": 1652 + }, + { + "epoch": 0.6515890613451589, + "grad_norm": 0.6148495858039739, + "learning_rate": 6.522366960042654e-06, + "loss": 1.1506, + "step": 1653 + }, + { + "epoch": 0.6519832471051983, + "grad_norm": 0.6125300863917666, + "learning_rate": 6.509462714233194e-06, + "loss": 1.1669, + "step": 1654 + }, + { + "epoch": 0.6523774328652377, + "grad_norm": 0.630309988193399, + "learning_rate": 6.496565083893333e-06, + "loss": 1.1889, + "step": 1655 + }, + { + "epoch": 0.6527716186252772, + "grad_norm": 0.6634157824387188, + "learning_rate": 6.483674093467409e-06, + "loss": 1.2278, + "step": 1656 + }, + { + "epoch": 0.6531658043853166, + "grad_norm": 0.631045534805432, + "learning_rate": 6.470789767387188e-06, + "loss": 1.1569, + "step": 1657 + }, + { + "epoch": 0.653559990145356, + "grad_norm": 0.6445024253655253, + "learning_rate": 6.457912130071786e-06, + "loss": 1.2291, + "step": 1658 + }, + { + "epoch": 0.6539541759053954, + "grad_norm": 0.6295685120939664, + "learning_rate": 6.445041205927658e-06, + "loss": 1.1953, + "step": 1659 + }, + { + "epoch": 0.6543483616654349, + "grad_norm": 0.6095510411838025, + "learning_rate": 6.432177019348521e-06, + "loss": 1.2001, + "step": 1660 + }, + { + "epoch": 0.6547425474254742, + "grad_norm": 0.6444146297988372, + "learning_rate": 6.419319594715338e-06, + "loss": 1.244, + "step": 1661 + }, + { + "epoch": 0.6551367331855137, + "grad_norm": 0.6104207832263667, + "learning_rate": 6.4064689563962505e-06, + "loss": 1.1556, + "step": 1662 + }, + { + "epoch": 0.655530918945553, + "grad_norm": 0.6326952360287978, + "learning_rate": 6.393625128746527e-06, + "loss": 1.1521, + "step": 1663 + }, + { + "epoch": 0.6559251047055925, + "grad_norm": 0.640334858610275, + "learning_rate": 6.3807881361085465e-06, + "loss": 1.181, + "step": 1664 + }, + { + "epoch": 0.656319290465632, + "grad_norm": 0.6504217808929613, + "learning_rate": 6.367958002811726e-06, + "loss": 1.1974, + "step": 1665 + }, + { + "epoch": 0.6567134762256713, + "grad_norm": 0.6529534715347126, + "learning_rate": 6.355134753172474e-06, + "loss": 1.1889, + "step": 1666 + }, + { + "epoch": 0.6571076619857108, + "grad_norm": 0.6654769765183821, + "learning_rate": 6.3423184114941686e-06, + "loss": 1.1865, + "step": 1667 + }, + { + "epoch": 0.6575018477457502, + "grad_norm": 0.6436155169730803, + "learning_rate": 6.32950900206708e-06, + "loss": 1.1647, + "step": 1668 + }, + { + "epoch": 0.6578960335057896, + "grad_norm": 0.6503660356165931, + "learning_rate": 6.31670654916835e-06, + "loss": 1.1674, + "step": 1669 + }, + { + "epoch": 0.658290219265829, + "grad_norm": 0.6608765081904892, + "learning_rate": 6.303911077061937e-06, + "loss": 1.2069, + "step": 1670 + }, + { + "epoch": 0.6586844050258684, + "grad_norm": 0.6417814536413016, + "learning_rate": 6.291122609998559e-06, + "loss": 1.2464, + "step": 1671 + }, + { + "epoch": 0.6590785907859079, + "grad_norm": 0.6676289218023853, + "learning_rate": 6.278341172215669e-06, + "loss": 1.2228, + "step": 1672 + }, + { + "epoch": 0.6594727765459473, + "grad_norm": 0.6280886790009287, + "learning_rate": 6.265566787937386e-06, + "loss": 1.1968, + "step": 1673 + }, + { + "epoch": 0.6598669623059867, + "grad_norm": 0.6483564238116941, + "learning_rate": 6.252799481374472e-06, + "loss": 1.2109, + "step": 1674 + }, + { + "epoch": 0.6602611480660261, + "grad_norm": 0.6189215649081374, + "learning_rate": 6.240039276724273e-06, + "loss": 1.196, + "step": 1675 + }, + { + "epoch": 0.6606553338260656, + "grad_norm": 0.6496483405660746, + "learning_rate": 6.227286198170663e-06, + "loss": 1.2246, + "step": 1676 + }, + { + "epoch": 0.6610495195861049, + "grad_norm": 0.6436584140179482, + "learning_rate": 6.214540269884026e-06, + "loss": 1.2284, + "step": 1677 + }, + { + "epoch": 0.6614437053461444, + "grad_norm": 0.6076777270904066, + "learning_rate": 6.20180151602119e-06, + "loss": 1.1942, + "step": 1678 + }, + { + "epoch": 0.6618378911061837, + "grad_norm": 0.636033416189757, + "learning_rate": 6.189069960725375e-06, + "loss": 1.1675, + "step": 1679 + }, + { + "epoch": 0.6622320768662232, + "grad_norm": 0.6396164730580286, + "learning_rate": 6.176345628126176e-06, + "loss": 1.1487, + "step": 1680 + }, + { + "epoch": 0.6626262626262627, + "grad_norm": 0.6015028228353986, + "learning_rate": 6.163628542339482e-06, + "loss": 1.1619, + "step": 1681 + }, + { + "epoch": 0.663020448386302, + "grad_norm": 0.6749292049019211, + "learning_rate": 6.150918727467455e-06, + "loss": 1.254, + "step": 1682 + }, + { + "epoch": 0.6634146341463415, + "grad_norm": 0.6328636162023467, + "learning_rate": 6.138216207598484e-06, + "loss": 1.2299, + "step": 1683 + }, + { + "epoch": 0.6638088199063809, + "grad_norm": 0.6214587756005278, + "learning_rate": 6.125521006807116e-06, + "loss": 1.2219, + "step": 1684 + }, + { + "epoch": 0.6642030056664203, + "grad_norm": 0.6537286104808447, + "learning_rate": 6.112833149154042e-06, + "loss": 1.2113, + "step": 1685 + }, + { + "epoch": 0.6645971914264597, + "grad_norm": 0.609872538457475, + "learning_rate": 6.10015265868602e-06, + "loss": 1.1715, + "step": 1686 + }, + { + "epoch": 0.6649913771864991, + "grad_norm": 0.6494731629680189, + "learning_rate": 6.0874795594358635e-06, + "loss": 1.2314, + "step": 1687 + }, + { + "epoch": 0.6653855629465386, + "grad_norm": 0.632923311793017, + "learning_rate": 6.0748138754223665e-06, + "loss": 1.1768, + "step": 1688 + }, + { + "epoch": 0.665779748706578, + "grad_norm": 0.6247202140755514, + "learning_rate": 6.062155630650265e-06, + "loss": 1.1812, + "step": 1689 + }, + { + "epoch": 0.6661739344666174, + "grad_norm": 0.631382377815529, + "learning_rate": 6.04950484911021e-06, + "loss": 1.1885, + "step": 1690 + }, + { + "epoch": 0.6665681202266568, + "grad_norm": 0.6138459038575285, + "learning_rate": 6.036861554778695e-06, + "loss": 1.1024, + "step": 1691 + }, + { + "epoch": 0.6669623059866963, + "grad_norm": 0.6265529929087996, + "learning_rate": 6.024225771618024e-06, + "loss": 1.1803, + "step": 1692 + }, + { + "epoch": 0.6673564917467356, + "grad_norm": 0.6227616940366973, + "learning_rate": 6.01159752357628e-06, + "loss": 1.2006, + "step": 1693 + }, + { + "epoch": 0.6677506775067751, + "grad_norm": 0.6558790947502295, + "learning_rate": 5.998976834587246e-06, + "loss": 1.2862, + "step": 1694 + }, + { + "epoch": 0.6681448632668144, + "grad_norm": 0.6304744900349945, + "learning_rate": 5.98636372857039e-06, + "loss": 1.1633, + "step": 1695 + }, + { + "epoch": 0.6685390490268539, + "grad_norm": 0.6318297859034908, + "learning_rate": 5.973758229430806e-06, + "loss": 1.2295, + "step": 1696 + }, + { + "epoch": 0.6689332347868934, + "grad_norm": 0.5988437549278761, + "learning_rate": 5.961160361059168e-06, + "loss": 1.1157, + "step": 1697 + }, + { + "epoch": 0.6693274205469327, + "grad_norm": 0.6137920151619946, + "learning_rate": 5.9485701473316925e-06, + "loss": 1.1448, + "step": 1698 + }, + { + "epoch": 0.6697216063069722, + "grad_norm": 0.6329970134758367, + "learning_rate": 5.935987612110081e-06, + "loss": 1.1792, + "step": 1699 + }, + { + "epoch": 0.6701157920670116, + "grad_norm": 0.6102586025760833, + "learning_rate": 5.923412779241493e-06, + "loss": 1.1214, + "step": 1700 + }, + { + "epoch": 0.670509977827051, + "grad_norm": 0.6016261422928656, + "learning_rate": 5.910845672558483e-06, + "loss": 1.1718, + "step": 1701 + }, + { + "epoch": 0.6709041635870904, + "grad_norm": 0.6144263728280865, + "learning_rate": 5.8982863158789605e-06, + "loss": 1.1613, + "step": 1702 + }, + { + "epoch": 0.6712983493471298, + "grad_norm": 0.621741539871381, + "learning_rate": 5.8857347330061545e-06, + "loss": 1.2034, + "step": 1703 + }, + { + "epoch": 0.6716925351071693, + "grad_norm": 0.6395204468391608, + "learning_rate": 5.873190947728552e-06, + "loss": 1.2198, + "step": 1704 + }, + { + "epoch": 0.6720867208672087, + "grad_norm": 0.606550147222352, + "learning_rate": 5.860654983819865e-06, + "loss": 1.1776, + "step": 1705 + }, + { + "epoch": 0.6724809066272481, + "grad_norm": 0.61755989526117, + "learning_rate": 5.84812686503899e-06, + "loss": 1.2269, + "step": 1706 + }, + { + "epoch": 0.6728750923872875, + "grad_norm": 0.7087998957119107, + "learning_rate": 5.83560661512994e-06, + "loss": 1.2204, + "step": 1707 + }, + { + "epoch": 0.673269278147327, + "grad_norm": 0.6413367764373633, + "learning_rate": 5.823094257821822e-06, + "loss": 1.1834, + "step": 1708 + }, + { + "epoch": 0.6736634639073663, + "grad_norm": 0.6157486461013707, + "learning_rate": 5.810589816828786e-06, + "loss": 1.1602, + "step": 1709 + }, + { + "epoch": 0.6740576496674058, + "grad_norm": 0.6342496529809019, + "learning_rate": 5.798093315849984e-06, + "loss": 1.2135, + "step": 1710 + }, + { + "epoch": 0.6744518354274451, + "grad_norm": 0.6117339478605194, + "learning_rate": 5.785604778569505e-06, + "loss": 1.177, + "step": 1711 + }, + { + "epoch": 0.6748460211874846, + "grad_norm": 0.6360723349056584, + "learning_rate": 5.773124228656348e-06, + "loss": 1.2873, + "step": 1712 + }, + { + "epoch": 0.6752402069475241, + "grad_norm": 0.6302819005649393, + "learning_rate": 5.76065168976439e-06, + "loss": 1.1972, + "step": 1713 + }, + { + "epoch": 0.6756343927075634, + "grad_norm": 0.6224162266525995, + "learning_rate": 5.748187185532306e-06, + "loss": 1.1855, + "step": 1714 + }, + { + "epoch": 0.6760285784676029, + "grad_norm": 0.6281722704464516, + "learning_rate": 5.73573073958355e-06, + "loss": 1.1815, + "step": 1715 + }, + { + "epoch": 0.6764227642276422, + "grad_norm": 0.6081887852352087, + "learning_rate": 5.723282375526302e-06, + "loss": 1.1804, + "step": 1716 + }, + { + "epoch": 0.6768169499876817, + "grad_norm": 0.6352236721472015, + "learning_rate": 5.7108421169534376e-06, + "loss": 1.1534, + "step": 1717 + }, + { + "epoch": 0.6772111357477211, + "grad_norm": 0.5979382590678716, + "learning_rate": 5.698409987442448e-06, + "loss": 1.1452, + "step": 1718 + }, + { + "epoch": 0.6776053215077605, + "grad_norm": 0.6036448112025448, + "learning_rate": 5.685986010555437e-06, + "loss": 1.1876, + "step": 1719 + }, + { + "epoch": 0.6779995072678, + "grad_norm": 0.6219506058018258, + "learning_rate": 5.6735702098390454e-06, + "loss": 1.2324, + "step": 1720 + }, + { + "epoch": 0.6783936930278394, + "grad_norm": 0.6263654931652052, + "learning_rate": 5.66116260882442e-06, + "loss": 1.1572, + "step": 1721 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.6278411193914041, + "learning_rate": 5.648763231027171e-06, + "loss": 1.1307, + "step": 1722 + }, + { + "epoch": 0.6791820645479182, + "grad_norm": 0.6294069087185388, + "learning_rate": 5.636372099947327e-06, + "loss": 1.2278, + "step": 1723 + }, + { + "epoch": 0.6795762503079577, + "grad_norm": 0.6296558801771532, + "learning_rate": 5.623989239069275e-06, + "loss": 1.1627, + "step": 1724 + }, + { + "epoch": 0.679970436067997, + "grad_norm": 0.6385637803835064, + "learning_rate": 5.611614671861733e-06, + "loss": 1.1481, + "step": 1725 + }, + { + "epoch": 0.6803646218280365, + "grad_norm": 0.6307923826155407, + "learning_rate": 5.5992484217777074e-06, + "loss": 1.2114, + "step": 1726 + }, + { + "epoch": 0.6807588075880758, + "grad_norm": 0.6040246463542289, + "learning_rate": 5.5868905122544344e-06, + "loss": 1.2137, + "step": 1727 + }, + { + "epoch": 0.6811529933481153, + "grad_norm": 0.6139446753066389, + "learning_rate": 5.574540966713338e-06, + "loss": 1.1472, + "step": 1728 + }, + { + "epoch": 0.6815471791081548, + "grad_norm": 0.6430020863098516, + "learning_rate": 5.562199808560001e-06, + "loss": 1.2109, + "step": 1729 + }, + { + "epoch": 0.6819413648681941, + "grad_norm": 0.6061201727927807, + "learning_rate": 5.549867061184108e-06, + "loss": 1.1718, + "step": 1730 + }, + { + "epoch": 0.6823355506282336, + "grad_norm": 0.6422178072097416, + "learning_rate": 5.5375427479593945e-06, + "loss": 1.1794, + "step": 1731 + }, + { + "epoch": 0.682729736388273, + "grad_norm": 0.6458731861630423, + "learning_rate": 5.525226892243623e-06, + "loss": 1.2502, + "step": 1732 + }, + { + "epoch": 0.6831239221483124, + "grad_norm": 0.631975611730984, + "learning_rate": 5.5129195173785184e-06, + "loss": 1.224, + "step": 1733 + }, + { + "epoch": 0.6835181079083518, + "grad_norm": 0.639062643993908, + "learning_rate": 5.50062064668973e-06, + "loss": 1.2374, + "step": 1734 + }, + { + "epoch": 0.6839122936683912, + "grad_norm": 0.6153286588995233, + "learning_rate": 5.488330303486795e-06, + "loss": 1.1532, + "step": 1735 + }, + { + "epoch": 0.6843064794284307, + "grad_norm": 0.6095750520956184, + "learning_rate": 5.4760485110630956e-06, + "loss": 1.1539, + "step": 1736 + }, + { + "epoch": 0.6847006651884701, + "grad_norm": 0.6242095926386367, + "learning_rate": 5.46377529269579e-06, + "loss": 1.1842, + "step": 1737 + }, + { + "epoch": 0.6850948509485095, + "grad_norm": 0.6373500217851757, + "learning_rate": 5.451510671645806e-06, + "loss": 1.2564, + "step": 1738 + }, + { + "epoch": 0.6854890367085489, + "grad_norm": 0.6528326441972604, + "learning_rate": 5.439254671157764e-06, + "loss": 1.2031, + "step": 1739 + }, + { + "epoch": 0.6858832224685883, + "grad_norm": 0.6265646534423697, + "learning_rate": 5.427007314459949e-06, + "loss": 1.2276, + "step": 1740 + }, + { + "epoch": 0.6862774082286277, + "grad_norm": 0.6155975267249686, + "learning_rate": 5.414768624764262e-06, + "loss": 1.168, + "step": 1741 + }, + { + "epoch": 0.6866715939886672, + "grad_norm": 0.6407827075088298, + "learning_rate": 5.402538625266184e-06, + "loss": 1.2118, + "step": 1742 + }, + { + "epoch": 0.6870657797487065, + "grad_norm": 0.6203929435962302, + "learning_rate": 5.390317339144726e-06, + "loss": 1.1711, + "step": 1743 + }, + { + "epoch": 0.687459965508746, + "grad_norm": 0.6296758413992221, + "learning_rate": 5.378104789562373e-06, + "loss": 1.1671, + "step": 1744 + }, + { + "epoch": 0.6878541512687855, + "grad_norm": 0.6402560327012314, + "learning_rate": 5.3659009996650704e-06, + "loss": 1.2331, + "step": 1745 + }, + { + "epoch": 0.6882483370288248, + "grad_norm": 0.6352813958888808, + "learning_rate": 5.353705992582147e-06, + "loss": 1.171, + "step": 1746 + }, + { + "epoch": 0.6886425227888643, + "grad_norm": 0.6173013307650468, + "learning_rate": 5.341519791426285e-06, + "loss": 1.1872, + "step": 1747 + }, + { + "epoch": 0.6890367085489036, + "grad_norm": 0.6300579221159313, + "learning_rate": 5.329342419293488e-06, + "loss": 1.1538, + "step": 1748 + }, + { + "epoch": 0.6894308943089431, + "grad_norm": 0.6452484286067051, + "learning_rate": 5.3171738992630266e-06, + "loss": 1.1983, + "step": 1749 + }, + { + "epoch": 0.6898250800689825, + "grad_norm": 0.6351697766210709, + "learning_rate": 5.305014254397378e-06, + "loss": 1.2099, + "step": 1750 + }, + { + "epoch": 0.6902192658290219, + "grad_norm": 0.6059437488402356, + "learning_rate": 5.292863507742218e-06, + "loss": 1.1429, + "step": 1751 + }, + { + "epoch": 0.6906134515890614, + "grad_norm": 0.6375500404238919, + "learning_rate": 5.280721682326349e-06, + "loss": 1.195, + "step": 1752 + }, + { + "epoch": 0.6910076373491008, + "grad_norm": 0.6214302914583397, + "learning_rate": 5.268588801161661e-06, + "loss": 1.1562, + "step": 1753 + }, + { + "epoch": 0.6914018231091402, + "grad_norm": 0.6233573649742591, + "learning_rate": 5.256464887243095e-06, + "loss": 1.1784, + "step": 1754 + }, + { + "epoch": 0.6917960088691796, + "grad_norm": 0.6057486309866048, + "learning_rate": 5.244349963548603e-06, + "loss": 1.1841, + "step": 1755 + }, + { + "epoch": 0.692190194629219, + "grad_norm": 0.6262495769486762, + "learning_rate": 5.232244053039099e-06, + "loss": 1.2069, + "step": 1756 + }, + { + "epoch": 0.6925843803892584, + "grad_norm": 0.6244256499974958, + "learning_rate": 5.220147178658401e-06, + "loss": 1.2099, + "step": 1757 + }, + { + "epoch": 0.6929785661492979, + "grad_norm": 0.5987132658245882, + "learning_rate": 5.208059363333218e-06, + "loss": 1.1172, + "step": 1758 + }, + { + "epoch": 0.6933727519093372, + "grad_norm": 0.6204462023553633, + "learning_rate": 5.195980629973077e-06, + "loss": 1.1287, + "step": 1759 + }, + { + "epoch": 0.6937669376693767, + "grad_norm": 0.616887618107624, + "learning_rate": 5.183911001470296e-06, + "loss": 1.1707, + "step": 1760 + }, + { + "epoch": 0.6941611234294162, + "grad_norm": 0.6131588350689924, + "learning_rate": 5.171850500699942e-06, + "loss": 1.1913, + "step": 1761 + }, + { + "epoch": 0.6945553091894555, + "grad_norm": 0.6220240105240659, + "learning_rate": 5.159799150519773e-06, + "loss": 1.1752, + "step": 1762 + }, + { + "epoch": 0.694949494949495, + "grad_norm": 0.6474411617934912, + "learning_rate": 5.147756973770215e-06, + "loss": 1.1685, + "step": 1763 + }, + { + "epoch": 0.6953436807095343, + "grad_norm": 0.6074241395347293, + "learning_rate": 5.135723993274304e-06, + "loss": 1.1274, + "step": 1764 + }, + { + "epoch": 0.6957378664695738, + "grad_norm": 0.6257258438943853, + "learning_rate": 5.123700231837643e-06, + "loss": 1.1876, + "step": 1765 + }, + { + "epoch": 0.6961320522296132, + "grad_norm": 0.6240327119384406, + "learning_rate": 5.111685712248364e-06, + "loss": 1.1356, + "step": 1766 + }, + { + "epoch": 0.6965262379896526, + "grad_norm": 0.6058794807211466, + "learning_rate": 5.099680457277083e-06, + "loss": 1.1859, + "step": 1767 + }, + { + "epoch": 0.6969204237496921, + "grad_norm": 0.6130830438069458, + "learning_rate": 5.087684489676862e-06, + "loss": 1.1917, + "step": 1768 + }, + { + "epoch": 0.6973146095097315, + "grad_norm": 0.6307417343281665, + "learning_rate": 5.07569783218316e-06, + "loss": 1.2297, + "step": 1769 + }, + { + "epoch": 0.6977087952697709, + "grad_norm": 0.6127737313603762, + "learning_rate": 5.063720507513781e-06, + "loss": 1.1673, + "step": 1770 + }, + { + "epoch": 0.6981029810298103, + "grad_norm": 0.624666994089622, + "learning_rate": 5.051752538368855e-06, + "loss": 1.2133, + "step": 1771 + }, + { + "epoch": 0.6984971667898497, + "grad_norm": 0.612192851855714, + "learning_rate": 5.039793947430774e-06, + "loss": 1.1894, + "step": 1772 + }, + { + "epoch": 0.6988913525498891, + "grad_norm": 0.6163484499307348, + "learning_rate": 5.02784475736415e-06, + "loss": 1.1901, + "step": 1773 + }, + { + "epoch": 0.6992855383099286, + "grad_norm": 0.6189253804729046, + "learning_rate": 5.015904990815792e-06, + "loss": 1.1852, + "step": 1774 + }, + { + "epoch": 0.6996797240699679, + "grad_norm": 0.6315133839229915, + "learning_rate": 5.003974670414633e-06, + "loss": 1.2218, + "step": 1775 + }, + { + "epoch": 0.7000739098300074, + "grad_norm": 0.6143569728327692, + "learning_rate": 4.992053818771715e-06, + "loss": 1.1698, + "step": 1776 + }, + { + "epoch": 0.7004680955900469, + "grad_norm": 0.6023568254933535, + "learning_rate": 4.980142458480136e-06, + "loss": 1.1618, + "step": 1777 + }, + { + "epoch": 0.7008622813500862, + "grad_norm": 0.620427287297367, + "learning_rate": 4.968240612114995e-06, + "loss": 1.1812, + "step": 1778 + }, + { + "epoch": 0.7012564671101257, + "grad_norm": 0.6169377500547716, + "learning_rate": 4.956348302233364e-06, + "loss": 1.1729, + "step": 1779 + }, + { + "epoch": 0.701650652870165, + "grad_norm": 0.6119581164148135, + "learning_rate": 4.944465551374238e-06, + "loss": 1.1942, + "step": 1780 + }, + { + "epoch": 0.7020448386302045, + "grad_norm": 0.6207029111041957, + "learning_rate": 4.932592382058503e-06, + "loss": 1.1841, + "step": 1781 + }, + { + "epoch": 0.7024390243902439, + "grad_norm": 0.6274557767427725, + "learning_rate": 4.920728816788885e-06, + "loss": 1.2241, + "step": 1782 + }, + { + "epoch": 0.7028332101502833, + "grad_norm": 0.6251490097972446, + "learning_rate": 4.908874878049894e-06, + "loss": 1.1746, + "step": 1783 + }, + { + "epoch": 0.7032273959103228, + "grad_norm": 0.6421558996903795, + "learning_rate": 4.897030588307816e-06, + "loss": 1.1599, + "step": 1784 + }, + { + "epoch": 0.7036215816703622, + "grad_norm": 0.6580529776636076, + "learning_rate": 4.885195970010634e-06, + "loss": 1.1876, + "step": 1785 + }, + { + "epoch": 0.7040157674304016, + "grad_norm": 0.7799716182595261, + "learning_rate": 4.873371045588002e-06, + "loss": 1.1619, + "step": 1786 + }, + { + "epoch": 0.704409953190441, + "grad_norm": 0.6034015555793384, + "learning_rate": 4.861555837451213e-06, + "loss": 1.1339, + "step": 1787 + }, + { + "epoch": 0.7048041389504804, + "grad_norm": 0.6354298706812905, + "learning_rate": 4.84975036799313e-06, + "loss": 1.1904, + "step": 1788 + }, + { + "epoch": 0.7051983247105198, + "grad_norm": 0.656808882761667, + "learning_rate": 4.837954659588172e-06, + "loss": 1.2118, + "step": 1789 + }, + { + "epoch": 0.7055925104705593, + "grad_norm": 0.6354068123945864, + "learning_rate": 4.826168734592254e-06, + "loss": 1.2657, + "step": 1790 + }, + { + "epoch": 0.7059866962305986, + "grad_norm": 0.6135559463093657, + "learning_rate": 4.814392615342746e-06, + "loss": 1.218, + "step": 1791 + }, + { + "epoch": 0.7063808819906381, + "grad_norm": 0.6190332303953764, + "learning_rate": 4.802626324158432e-06, + "loss": 1.1298, + "step": 1792 + }, + { + "epoch": 0.7067750677506776, + "grad_norm": 0.6261895312898496, + "learning_rate": 4.790869883339473e-06, + "loss": 1.2229, + "step": 1793 + }, + { + "epoch": 0.7071692535107169, + "grad_norm": 0.6499346687616555, + "learning_rate": 4.779123315167362e-06, + "loss": 1.2436, + "step": 1794 + }, + { + "epoch": 0.7075634392707564, + "grad_norm": 0.7112549120650247, + "learning_rate": 4.767386641904883e-06, + "loss": 1.1948, + "step": 1795 + }, + { + "epoch": 0.7079576250307957, + "grad_norm": 0.6187195781334022, + "learning_rate": 4.755659885796054e-06, + "loss": 1.2253, + "step": 1796 + }, + { + "epoch": 0.7083518107908352, + "grad_norm": 0.616576163504054, + "learning_rate": 4.743943069066118e-06, + "loss": 1.1448, + "step": 1797 + }, + { + "epoch": 0.7087459965508746, + "grad_norm": 0.614300702515973, + "learning_rate": 4.73223621392146e-06, + "loss": 1.181, + "step": 1798 + }, + { + "epoch": 0.709140182310914, + "grad_norm": 0.6141034301455051, + "learning_rate": 4.720539342549594e-06, + "loss": 1.1788, + "step": 1799 + }, + { + "epoch": 0.7095343680709535, + "grad_norm": 0.6073756603898747, + "learning_rate": 4.708852477119117e-06, + "loss": 1.1848, + "step": 1800 + }, + { + "epoch": 0.7099285538309928, + "grad_norm": 0.6344185849187683, + "learning_rate": 4.6971756397796506e-06, + "loss": 1.1721, + "step": 1801 + }, + { + "epoch": 0.7103227395910323, + "grad_norm": 0.6248360198993864, + "learning_rate": 4.6855088526618205e-06, + "loss": 1.1565, + "step": 1802 + }, + { + "epoch": 0.7107169253510717, + "grad_norm": 0.6152420860002373, + "learning_rate": 4.6738521378772066e-06, + "loss": 1.1702, + "step": 1803 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.6168160579182377, + "learning_rate": 4.662205517518286e-06, + "loss": 1.1988, + "step": 1804 + }, + { + "epoch": 0.7115052968711505, + "grad_norm": 0.6199790217466414, + "learning_rate": 4.650569013658417e-06, + "loss": 1.2058, + "step": 1805 + }, + { + "epoch": 0.71189948263119, + "grad_norm": 0.6176228890841313, + "learning_rate": 4.638942648351774e-06, + "loss": 1.1612, + "step": 1806 + }, + { + "epoch": 0.7122936683912293, + "grad_norm": 0.5959975381441662, + "learning_rate": 4.627326443633327e-06, + "loss": 1.1628, + "step": 1807 + }, + { + "epoch": 0.7126878541512688, + "grad_norm": 0.6189398958365385, + "learning_rate": 4.61572042151878e-06, + "loss": 1.1928, + "step": 1808 + }, + { + "epoch": 0.7130820399113083, + "grad_norm": 0.6271163010563219, + "learning_rate": 4.604124604004544e-06, + "loss": 1.2124, + "step": 1809 + }, + { + "epoch": 0.7134762256713476, + "grad_norm": 0.6000046568229123, + "learning_rate": 4.592539013067692e-06, + "loss": 1.153, + "step": 1810 + }, + { + "epoch": 0.7138704114313871, + "grad_norm": 0.5989067172216591, + "learning_rate": 4.580963670665906e-06, + "loss": 1.1537, + "step": 1811 + }, + { + "epoch": 0.7142645971914264, + "grad_norm": 0.65003150237445, + "learning_rate": 4.569398598737448e-06, + "loss": 1.2302, + "step": 1812 + }, + { + "epoch": 0.7146587829514659, + "grad_norm": 0.6224236372159876, + "learning_rate": 4.557843819201121e-06, + "loss": 1.2191, + "step": 1813 + }, + { + "epoch": 0.7150529687115053, + "grad_norm": 0.6360681967059407, + "learning_rate": 4.546299353956211e-06, + "loss": 1.1782, + "step": 1814 + }, + { + "epoch": 0.7154471544715447, + "grad_norm": 0.6134230197484926, + "learning_rate": 4.534765224882463e-06, + "loss": 1.2106, + "step": 1815 + }, + { + "epoch": 0.7158413402315842, + "grad_norm": 0.6176737002203802, + "learning_rate": 4.5232414538400336e-06, + "loss": 1.2175, + "step": 1816 + }, + { + "epoch": 0.7162355259916235, + "grad_norm": 0.6202906864487361, + "learning_rate": 4.511728062669443e-06, + "loss": 1.1807, + "step": 1817 + }, + { + "epoch": 0.716629711751663, + "grad_norm": 0.6212585444516489, + "learning_rate": 4.50022507319154e-06, + "loss": 1.1958, + "step": 1818 + }, + { + "epoch": 0.7170238975117024, + "grad_norm": 0.6142126146314887, + "learning_rate": 4.488732507207457e-06, + "loss": 1.189, + "step": 1819 + }, + { + "epoch": 0.7174180832717418, + "grad_norm": 0.6301160963451029, + "learning_rate": 4.477250386498582e-06, + "loss": 1.2383, + "step": 1820 + }, + { + "epoch": 0.7178122690317812, + "grad_norm": 0.6238993246895916, + "learning_rate": 4.46577873282649e-06, + "loss": 1.1642, + "step": 1821 + }, + { + "epoch": 0.7182064547918207, + "grad_norm": 0.5954902888936976, + "learning_rate": 4.4543175679329345e-06, + "loss": 1.1319, + "step": 1822 + }, + { + "epoch": 0.71860064055186, + "grad_norm": 0.5975113333384684, + "learning_rate": 4.442866913539783e-06, + "loss": 1.1692, + "step": 1823 + }, + { + "epoch": 0.7189948263118995, + "grad_norm": 0.6361387072646193, + "learning_rate": 4.431426791348981e-06, + "loss": 1.2058, + "step": 1824 + }, + { + "epoch": 0.719389012071939, + "grad_norm": 0.6206879841575946, + "learning_rate": 4.419997223042509e-06, + "loss": 1.1892, + "step": 1825 + }, + { + "epoch": 0.7197831978319783, + "grad_norm": 0.6187188924722868, + "learning_rate": 4.408578230282361e-06, + "loss": 1.2343, + "step": 1826 + }, + { + "epoch": 0.7201773835920178, + "grad_norm": 0.6099133549608606, + "learning_rate": 4.397169834710467e-06, + "loss": 1.1874, + "step": 1827 + }, + { + "epoch": 0.7205715693520571, + "grad_norm": 0.6218762750404337, + "learning_rate": 4.38577205794869e-06, + "loss": 1.2522, + "step": 1828 + }, + { + "epoch": 0.7209657551120966, + "grad_norm": 0.6122795104171647, + "learning_rate": 4.37438492159876e-06, + "loss": 1.1989, + "step": 1829 + }, + { + "epoch": 0.721359940872136, + "grad_norm": 0.6015290594639533, + "learning_rate": 4.36300844724224e-06, + "loss": 1.1714, + "step": 1830 + }, + { + "epoch": 0.7217541266321754, + "grad_norm": 0.6252355128162509, + "learning_rate": 4.351642656440482e-06, + "loss": 1.1703, + "step": 1831 + }, + { + "epoch": 0.7221483123922149, + "grad_norm": 0.6111637339804932, + "learning_rate": 4.340287570734604e-06, + "loss": 1.152, + "step": 1832 + }, + { + "epoch": 0.7225424981522542, + "grad_norm": 0.6101267108124663, + "learning_rate": 4.32894321164542e-06, + "loss": 1.184, + "step": 1833 + }, + { + "epoch": 0.7229366839122937, + "grad_norm": 0.6424270287758459, + "learning_rate": 4.317609600673418e-06, + "loss": 1.1703, + "step": 1834 + }, + { + "epoch": 0.7233308696723331, + "grad_norm": 0.6224326912866733, + "learning_rate": 4.306286759298721e-06, + "loss": 1.1925, + "step": 1835 + }, + { + "epoch": 0.7237250554323725, + "grad_norm": 0.5990540447824775, + "learning_rate": 4.294974708981041e-06, + "loss": 1.1549, + "step": 1836 + }, + { + "epoch": 0.7241192411924119, + "grad_norm": 0.6304187409365657, + "learning_rate": 4.283673471159632e-06, + "loss": 1.1974, + "step": 1837 + }, + { + "epoch": 0.7245134269524514, + "grad_norm": 0.6236344446716869, + "learning_rate": 4.272383067253254e-06, + "loss": 1.1704, + "step": 1838 + }, + { + "epoch": 0.7249076127124907, + "grad_norm": 0.6183536446735383, + "learning_rate": 4.2611035186601445e-06, + "loss": 1.2539, + "step": 1839 + }, + { + "epoch": 0.7253017984725302, + "grad_norm": 0.6381015795817223, + "learning_rate": 4.2498348467579555e-06, + "loss": 1.1772, + "step": 1840 + }, + { + "epoch": 0.7256959842325696, + "grad_norm": 0.6196633330398633, + "learning_rate": 4.2385770729037336e-06, + "loss": 1.1597, + "step": 1841 + }, + { + "epoch": 0.726090169992609, + "grad_norm": 0.6402144565991683, + "learning_rate": 4.22733021843387e-06, + "loss": 1.2207, + "step": 1842 + }, + { + "epoch": 0.7264843557526485, + "grad_norm": 0.6134635440909342, + "learning_rate": 4.216094304664056e-06, + "loss": 1.2303, + "step": 1843 + }, + { + "epoch": 0.7268785415126878, + "grad_norm": 0.6170474770272091, + "learning_rate": 4.204869352889246e-06, + "loss": 1.1897, + "step": 1844 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.625150589347141, + "learning_rate": 4.193655384383631e-06, + "loss": 1.1273, + "step": 1845 + }, + { + "epoch": 0.7276669130327666, + "grad_norm": 0.6702486495437785, + "learning_rate": 4.182452420400571e-06, + "loss": 1.2604, + "step": 1846 + }, + { + "epoch": 0.7280610987928061, + "grad_norm": 1.1398019367962655, + "learning_rate": 4.171260482172574e-06, + "loss": 1.151, + "step": 1847 + }, + { + "epoch": 0.7284552845528456, + "grad_norm": 0.6232712417738132, + "learning_rate": 4.160079590911257e-06, + "loss": 1.1928, + "step": 1848 + }, + { + "epoch": 0.7288494703128849, + "grad_norm": 0.6346597753210788, + "learning_rate": 4.1489097678073e-06, + "loss": 1.2134, + "step": 1849 + }, + { + "epoch": 0.7292436560729244, + "grad_norm": 0.622479343337929, + "learning_rate": 4.1377510340304e-06, + "loss": 1.1351, + "step": 1850 + }, + { + "epoch": 0.7296378418329638, + "grad_norm": 0.6095396783729989, + "learning_rate": 4.126603410729232e-06, + "loss": 1.1835, + "step": 1851 + }, + { + "epoch": 0.7300320275930032, + "grad_norm": 0.6007947259934253, + "learning_rate": 4.1154669190314315e-06, + "loss": 1.1361, + "step": 1852 + }, + { + "epoch": 0.7304262133530426, + "grad_norm": 0.6392450529455237, + "learning_rate": 4.104341580043518e-06, + "loss": 1.2352, + "step": 1853 + }, + { + "epoch": 0.730820399113082, + "grad_norm": 0.6088170301748977, + "learning_rate": 4.093227414850887e-06, + "loss": 1.1555, + "step": 1854 + }, + { + "epoch": 0.7312145848731214, + "grad_norm": 0.611940955223257, + "learning_rate": 4.0821244445177535e-06, + "loss": 1.1035, + "step": 1855 + }, + { + "epoch": 0.7316087706331609, + "grad_norm": 0.6429334370137534, + "learning_rate": 4.071032690087111e-06, + "loss": 1.2077, + "step": 1856 + }, + { + "epoch": 0.7320029563932003, + "grad_norm": 0.6199867856316763, + "learning_rate": 4.059952172580694e-06, + "loss": 1.1898, + "step": 1857 + }, + { + "epoch": 0.7323971421532397, + "grad_norm": 0.682925719480743, + "learning_rate": 4.0488829129989536e-06, + "loss": 1.1796, + "step": 1858 + }, + { + "epoch": 0.7327913279132792, + "grad_norm": 0.6300326280908697, + "learning_rate": 4.0378249323209915e-06, + "loss": 1.1821, + "step": 1859 + }, + { + "epoch": 0.7331855136733185, + "grad_norm": 0.6188854368428854, + "learning_rate": 4.026778251504533e-06, + "loss": 1.212, + "step": 1860 + }, + { + "epoch": 0.733579699433358, + "grad_norm": 0.7209116321064022, + "learning_rate": 4.015742891485893e-06, + "loss": 1.2115, + "step": 1861 + }, + { + "epoch": 0.7339738851933973, + "grad_norm": 0.6377551509793858, + "learning_rate": 4.0047188731799345e-06, + "loss": 1.2223, + "step": 1862 + }, + { + "epoch": 0.7343680709534368, + "grad_norm": 0.6709121309342012, + "learning_rate": 3.993706217480015e-06, + "loss": 1.2369, + "step": 1863 + }, + { + "epoch": 0.7347622567134763, + "grad_norm": 0.6610392131221031, + "learning_rate": 3.982704945257957e-06, + "loss": 1.238, + "step": 1864 + }, + { + "epoch": 0.7351564424735156, + "grad_norm": 0.6314301850508148, + "learning_rate": 3.97171507736402e-06, + "loss": 1.1694, + "step": 1865 + }, + { + "epoch": 0.7355506282335551, + "grad_norm": 0.6075680590520474, + "learning_rate": 3.960736634626838e-06, + "loss": 1.1627, + "step": 1866 + }, + { + "epoch": 0.7359448139935945, + "grad_norm": 0.6341926480920811, + "learning_rate": 3.949769637853393e-06, + "loss": 1.1434, + "step": 1867 + }, + { + "epoch": 0.7363389997536339, + "grad_norm": 0.621486685123361, + "learning_rate": 3.9388141078289775e-06, + "loss": 1.1946, + "step": 1868 + }, + { + "epoch": 0.7367331855136733, + "grad_norm": 0.6464204738071503, + "learning_rate": 3.927870065317156e-06, + "loss": 1.1774, + "step": 1869 + }, + { + "epoch": 0.7371273712737128, + "grad_norm": 0.6718388040792097, + "learning_rate": 3.916937531059706e-06, + "loss": 1.161, + "step": 1870 + }, + { + "epoch": 0.7375215570337521, + "grad_norm": 0.6323822736177052, + "learning_rate": 3.9060165257766116e-06, + "loss": 1.2166, + "step": 1871 + }, + { + "epoch": 0.7379157427937916, + "grad_norm": 0.6289704307488232, + "learning_rate": 3.895107070165995e-06, + "loss": 1.1657, + "step": 1872 + }, + { + "epoch": 0.738309928553831, + "grad_norm": 0.6262746372052379, + "learning_rate": 3.884209184904088e-06, + "loss": 1.2249, + "step": 1873 + }, + { + "epoch": 0.7387041143138704, + "grad_norm": 0.6184529013832247, + "learning_rate": 3.873322890645202e-06, + "loss": 1.1515, + "step": 1874 + }, + { + "epoch": 0.7390983000739099, + "grad_norm": 0.6290711060233826, + "learning_rate": 3.862448208021677e-06, + "loss": 1.1834, + "step": 1875 + }, + { + "epoch": 0.7394924858339492, + "grad_norm": 0.5895476413662796, + "learning_rate": 3.851585157643845e-06, + "loss": 1.1234, + "step": 1876 + }, + { + "epoch": 0.7398866715939887, + "grad_norm": 0.6107335830258855, + "learning_rate": 3.840733760099985e-06, + "loss": 1.1639, + "step": 1877 + }, + { + "epoch": 0.740280857354028, + "grad_norm": 0.6322945602429125, + "learning_rate": 3.829894035956306e-06, + "loss": 1.2427, + "step": 1878 + }, + { + "epoch": 0.7406750431140675, + "grad_norm": 0.6323335943798655, + "learning_rate": 3.819066005756883e-06, + "loss": 1.2223, + "step": 1879 + }, + { + "epoch": 0.741069228874107, + "grad_norm": 0.6078450616507315, + "learning_rate": 3.8082496900236244e-06, + "loss": 1.1706, + "step": 1880 + }, + { + "epoch": 0.7414634146341463, + "grad_norm": 0.6221466682968542, + "learning_rate": 3.7974451092562447e-06, + "loss": 1.2046, + "step": 1881 + }, + { + "epoch": 0.7418576003941858, + "grad_norm": 0.6049678464198069, + "learning_rate": 3.7866522839322207e-06, + "loss": 1.1767, + "step": 1882 + }, + { + "epoch": 0.7422517861542252, + "grad_norm": 0.6295952461868448, + "learning_rate": 3.775871234506734e-06, + "loss": 1.2225, + "step": 1883 + }, + { + "epoch": 0.7426459719142646, + "grad_norm": 0.6394412262692781, + "learning_rate": 3.7651019814126656e-06, + "loss": 1.214, + "step": 1884 + }, + { + "epoch": 0.743040157674304, + "grad_norm": 0.610513027873533, + "learning_rate": 3.754344545060529e-06, + "loss": 1.1537, + "step": 1885 + }, + { + "epoch": 0.7434343434343434, + "grad_norm": 0.5956769595890598, + "learning_rate": 3.743598945838438e-06, + "loss": 1.1758, + "step": 1886 + }, + { + "epoch": 0.7438285291943828, + "grad_norm": 0.6417078515489372, + "learning_rate": 3.732865204112084e-06, + "loss": 1.1991, + "step": 1887 + }, + { + "epoch": 0.7442227149544223, + "grad_norm": 0.6291270205503651, + "learning_rate": 3.722143340224682e-06, + "loss": 1.2203, + "step": 1888 + }, + { + "epoch": 0.7446169007144617, + "grad_norm": 0.6143214199994612, + "learning_rate": 3.7114333744969312e-06, + "loss": 1.2053, + "step": 1889 + }, + { + "epoch": 0.7450110864745011, + "grad_norm": 0.6247493772614575, + "learning_rate": 3.7007353272269764e-06, + "loss": 1.187, + "step": 1890 + }, + { + "epoch": 0.7454052722345406, + "grad_norm": 0.6280559082279741, + "learning_rate": 3.6900492186903893e-06, + "loss": 1.2001, + "step": 1891 + }, + { + "epoch": 0.7457994579945799, + "grad_norm": 0.6656868801405882, + "learning_rate": 3.6793750691400996e-06, + "loss": 1.2266, + "step": 1892 + }, + { + "epoch": 0.7461936437546194, + "grad_norm": 0.6290134544837587, + "learning_rate": 3.6687128988063768e-06, + "loss": 1.2643, + "step": 1893 + }, + { + "epoch": 0.7465878295146587, + "grad_norm": 0.6046720210188277, + "learning_rate": 3.6580627278967883e-06, + "loss": 1.1329, + "step": 1894 + }, + { + "epoch": 0.7469820152746982, + "grad_norm": 0.6132109677638092, + "learning_rate": 3.6474245765961623e-06, + "loss": 1.1802, + "step": 1895 + }, + { + "epoch": 0.7473762010347377, + "grad_norm": 0.6215636460183582, + "learning_rate": 3.636798465066537e-06, + "loss": 1.161, + "step": 1896 + }, + { + "epoch": 0.747770386794777, + "grad_norm": 0.6324476045738789, + "learning_rate": 3.6261844134471434e-06, + "loss": 1.2743, + "step": 1897 + }, + { + "epoch": 0.7481645725548165, + "grad_norm": 0.6229098227690751, + "learning_rate": 3.6155824418543482e-06, + "loss": 1.1813, + "step": 1898 + }, + { + "epoch": 0.7485587583148559, + "grad_norm": 0.6090812575135249, + "learning_rate": 3.604992570381621e-06, + "loss": 1.1345, + "step": 1899 + }, + { + "epoch": 0.7489529440748953, + "grad_norm": 0.6175559157353252, + "learning_rate": 3.5944148190995077e-06, + "loss": 1.2318, + "step": 1900 + }, + { + "epoch": 0.7493471298349347, + "grad_norm": 0.6151430132474782, + "learning_rate": 3.583849208055582e-06, + "loss": 1.1515, + "step": 1901 + }, + { + "epoch": 0.7497413155949741, + "grad_norm": 0.6150817757122007, + "learning_rate": 3.573295757274401e-06, + "loss": 1.1709, + "step": 1902 + }, + { + "epoch": 0.7501355013550135, + "grad_norm": 0.6206530860937504, + "learning_rate": 3.562754486757477e-06, + "loss": 1.2368, + "step": 1903 + }, + { + "epoch": 0.750529687115053, + "grad_norm": 0.6187559303708384, + "learning_rate": 3.5522254164832458e-06, + "loss": 1.166, + "step": 1904 + }, + { + "epoch": 0.7509238728750924, + "grad_norm": 0.6050479857846883, + "learning_rate": 3.5417085664070127e-06, + "loss": 1.1884, + "step": 1905 + }, + { + "epoch": 0.7513180586351318, + "grad_norm": 0.6168601224584902, + "learning_rate": 3.5312039564609203e-06, + "loss": 1.179, + "step": 1906 + }, + { + "epoch": 0.7517122443951713, + "grad_norm": 0.6626157674267323, + "learning_rate": 3.5207116065539214e-06, + "loss": 1.2784, + "step": 1907 + }, + { + "epoch": 0.7521064301552106, + "grad_norm": 0.6204622203986804, + "learning_rate": 3.510231536571731e-06, + "loss": 1.1545, + "step": 1908 + }, + { + "epoch": 0.7525006159152501, + "grad_norm": 0.6025298592606017, + "learning_rate": 3.4997637663767827e-06, + "loss": 1.1623, + "step": 1909 + }, + { + "epoch": 0.7528948016752894, + "grad_norm": 0.6686746729115949, + "learning_rate": 3.4893083158082096e-06, + "loss": 1.225, + "step": 1910 + }, + { + "epoch": 0.7532889874353289, + "grad_norm": 0.6770303268213698, + "learning_rate": 3.4788652046817885e-06, + "loss": 1.1987, + "step": 1911 + }, + { + "epoch": 0.7536831731953684, + "grad_norm": 0.6169292952669728, + "learning_rate": 3.4684344527899117e-06, + "loss": 1.1413, + "step": 1912 + }, + { + "epoch": 0.7540773589554077, + "grad_norm": 0.6485841260675642, + "learning_rate": 3.458016079901544e-06, + "loss": 1.1747, + "step": 1913 + }, + { + "epoch": 0.7544715447154472, + "grad_norm": 0.644634311279479, + "learning_rate": 3.447610105762197e-06, + "loss": 1.1688, + "step": 1914 + }, + { + "epoch": 0.7548657304754866, + "grad_norm": 0.5954331888752692, + "learning_rate": 3.4372165500938813e-06, + "loss": 1.1999, + "step": 1915 + }, + { + "epoch": 0.755259916235526, + "grad_norm": 0.617923959960479, + "learning_rate": 3.4268354325950637e-06, + "loss": 1.2101, + "step": 1916 + }, + { + "epoch": 0.7556541019955654, + "grad_norm": 0.6202978534151761, + "learning_rate": 3.4164667729406487e-06, + "loss": 1.1168, + "step": 1917 + }, + { + "epoch": 0.7560482877556048, + "grad_norm": 0.6139453726018187, + "learning_rate": 3.4061105907819202e-06, + "loss": 1.107, + "step": 1918 + }, + { + "epoch": 0.7564424735156442, + "grad_norm": 0.6199465940139608, + "learning_rate": 3.395766905746515e-06, + "loss": 1.2331, + "step": 1919 + }, + { + "epoch": 0.7568366592756837, + "grad_norm": 0.6121258940736186, + "learning_rate": 3.3854357374383905e-06, + "loss": 1.1512, + "step": 1920 + }, + { + "epoch": 0.7572308450357231, + "grad_norm": 0.6192952901355329, + "learning_rate": 3.375117105437784e-06, + "loss": 1.1992, + "step": 1921 + }, + { + "epoch": 0.7576250307957625, + "grad_norm": 0.6428452093914235, + "learning_rate": 3.3648110293011592e-06, + "loss": 1.2009, + "step": 1922 + }, + { + "epoch": 0.758019216555802, + "grad_norm": 0.632857445152661, + "learning_rate": 3.3545175285611986e-06, + "loss": 1.2031, + "step": 1923 + }, + { + "epoch": 0.7584134023158413, + "grad_norm": 0.61203461189701, + "learning_rate": 3.344236622726743e-06, + "loss": 1.128, + "step": 1924 + }, + { + "epoch": 0.7588075880758808, + "grad_norm": 0.5940930582433119, + "learning_rate": 3.333968331282759e-06, + "loss": 1.1638, + "step": 1925 + }, + { + "epoch": 0.7592017738359201, + "grad_norm": 0.6128730590023086, + "learning_rate": 3.3237126736903168e-06, + "loss": 1.1636, + "step": 1926 + }, + { + "epoch": 0.7595959595959596, + "grad_norm": 0.6453501409856305, + "learning_rate": 3.313469669386532e-06, + "loss": 1.2196, + "step": 1927 + }, + { + "epoch": 0.7599901453559991, + "grad_norm": 0.6462479993428716, + "learning_rate": 3.303239337784547e-06, + "loss": 1.1757, + "step": 1928 + }, + { + "epoch": 0.7603843311160384, + "grad_norm": 0.6223443320198161, + "learning_rate": 3.2930216982734775e-06, + "loss": 1.2022, + "step": 1929 + }, + { + "epoch": 0.7607785168760779, + "grad_norm": 0.6012467834584495, + "learning_rate": 3.2828167702183945e-06, + "loss": 1.1624, + "step": 1930 + }, + { + "epoch": 0.7611727026361172, + "grad_norm": 0.6212867293615743, + "learning_rate": 3.272624572960269e-06, + "loss": 1.1469, + "step": 1931 + }, + { + "epoch": 0.7615668883961567, + "grad_norm": 0.623426678936357, + "learning_rate": 3.262445125815945e-06, + "loss": 1.2142, + "step": 1932 + }, + { + "epoch": 0.7619610741561961, + "grad_norm": 0.6174911641351716, + "learning_rate": 3.2522784480781057e-06, + "loss": 1.229, + "step": 1933 + }, + { + "epoch": 0.7623552599162355, + "grad_norm": 0.6458478147860737, + "learning_rate": 3.242124559015234e-06, + "loss": 1.2307, + "step": 1934 + }, + { + "epoch": 0.7627494456762749, + "grad_norm": 0.6139695821784812, + "learning_rate": 3.2319834778715662e-06, + "loss": 1.1993, + "step": 1935 + }, + { + "epoch": 0.7631436314363144, + "grad_norm": 0.6244967897448498, + "learning_rate": 3.221855223867076e-06, + "loss": 1.1983, + "step": 1936 + }, + { + "epoch": 0.7635378171963538, + "grad_norm": 0.6167092879774253, + "learning_rate": 3.211739816197419e-06, + "loss": 1.139, + "step": 1937 + }, + { + "epoch": 0.7639320029563932, + "grad_norm": 0.6253757235990433, + "learning_rate": 3.2016372740339e-06, + "loss": 1.2246, + "step": 1938 + }, + { + "epoch": 0.7643261887164327, + "grad_norm": 0.625945816934853, + "learning_rate": 3.1915476165234505e-06, + "loss": 1.1534, + "step": 1939 + }, + { + "epoch": 0.764720374476472, + "grad_norm": 0.6294175091707643, + "learning_rate": 3.1814708627885736e-06, + "loss": 1.2087, + "step": 1940 + }, + { + "epoch": 0.7651145602365115, + "grad_norm": 0.6174964988395791, + "learning_rate": 3.171407031927325e-06, + "loss": 1.2108, + "step": 1941 + }, + { + "epoch": 0.7655087459965508, + "grad_norm": 0.6692493984724502, + "learning_rate": 3.161356143013258e-06, + "loss": 1.2602, + "step": 1942 + }, + { + "epoch": 0.7659029317565903, + "grad_norm": 0.6049874736921799, + "learning_rate": 3.1513182150954067e-06, + "loss": 1.1283, + "step": 1943 + }, + { + "epoch": 0.7662971175166298, + "grad_norm": 0.6170567402312764, + "learning_rate": 3.1412932671982368e-06, + "loss": 1.1787, + "step": 1944 + }, + { + "epoch": 0.7666913032766691, + "grad_norm": 0.5939532563374448, + "learning_rate": 3.131281318321607e-06, + "loss": 1.1134, + "step": 1945 + }, + { + "epoch": 0.7670854890367086, + "grad_norm": 0.6073844909969783, + "learning_rate": 3.1212823874407517e-06, + "loss": 1.1714, + "step": 1946 + }, + { + "epoch": 0.767479674796748, + "grad_norm": 0.6102814200245192, + "learning_rate": 3.1112964935062297e-06, + "loss": 1.172, + "step": 1947 + }, + { + "epoch": 0.7678738605567874, + "grad_norm": 0.6156593525633267, + "learning_rate": 3.101323655443882e-06, + "loss": 1.2028, + "step": 1948 + }, + { + "epoch": 0.7682680463168268, + "grad_norm": 0.630439880503606, + "learning_rate": 3.0913638921548195e-06, + "loss": 1.1547, + "step": 1949 + }, + { + "epoch": 0.7686622320768662, + "grad_norm": 0.596623146889128, + "learning_rate": 3.0814172225153626e-06, + "loss": 1.1191, + "step": 1950 + }, + { + "epoch": 0.7690564178369056, + "grad_norm": 0.6035005020079766, + "learning_rate": 3.0714836653770153e-06, + "loss": 1.1602, + "step": 1951 + }, + { + "epoch": 0.7694506035969451, + "grad_norm": 0.6229719405653049, + "learning_rate": 3.0615632395664395e-06, + "loss": 1.2358, + "step": 1952 + }, + { + "epoch": 0.7698447893569845, + "grad_norm": 0.6172825849164519, + "learning_rate": 3.051655963885398e-06, + "loss": 1.1966, + "step": 1953 + }, + { + "epoch": 0.7702389751170239, + "grad_norm": 0.6286383648446865, + "learning_rate": 3.0417618571107443e-06, + "loss": 1.1964, + "step": 1954 + }, + { + "epoch": 0.7706331608770634, + "grad_norm": 0.6108360343185555, + "learning_rate": 3.0318809379943594e-06, + "loss": 1.1728, + "step": 1955 + }, + { + "epoch": 0.7710273466371027, + "grad_norm": 0.6362153250389974, + "learning_rate": 3.022013225263142e-06, + "loss": 1.2236, + "step": 1956 + }, + { + "epoch": 0.7714215323971422, + "grad_norm": 0.6344908938517139, + "learning_rate": 3.0121587376189544e-06, + "loss": 1.2053, + "step": 1957 + }, + { + "epoch": 0.7718157181571815, + "grad_norm": 0.6201739659408967, + "learning_rate": 3.00231749373859e-06, + "loss": 1.1537, + "step": 1958 + }, + { + "epoch": 0.772209903917221, + "grad_norm": 0.6100774811460168, + "learning_rate": 2.992489512273754e-06, + "loss": 1.1984, + "step": 1959 + }, + { + "epoch": 0.7726040896772605, + "grad_norm": 0.6232200126606358, + "learning_rate": 2.9826748118510107e-06, + "loss": 1.2338, + "step": 1960 + }, + { + "epoch": 0.7729982754372998, + "grad_norm": 0.6325714051449248, + "learning_rate": 2.972873411071745e-06, + "loss": 1.1917, + "step": 1961 + }, + { + "epoch": 0.7733924611973393, + "grad_norm": 0.6152245310127229, + "learning_rate": 2.9630853285121506e-06, + "loss": 1.2181, + "step": 1962 + }, + { + "epoch": 0.7737866469573786, + "grad_norm": 0.6382727314998073, + "learning_rate": 2.9533105827231677e-06, + "loss": 1.2374, + "step": 1963 + }, + { + "epoch": 0.7741808327174181, + "grad_norm": 0.6093019906684419, + "learning_rate": 2.9435491922304603e-06, + "loss": 1.2039, + "step": 1964 + }, + { + "epoch": 0.7745750184774575, + "grad_norm": 0.6466162600658065, + "learning_rate": 2.933801175534392e-06, + "loss": 1.2507, + "step": 1965 + }, + { + "epoch": 0.7749692042374969, + "grad_norm": 0.6172944871295347, + "learning_rate": 2.9240665511099643e-06, + "loss": 1.1777, + "step": 1966 + }, + { + "epoch": 0.7753633899975363, + "grad_norm": 0.6025058965161826, + "learning_rate": 2.914345337406812e-06, + "loss": 1.1488, + "step": 1967 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.6283140418676793, + "learning_rate": 2.9046375528491378e-06, + "loss": 1.2246, + "step": 1968 + }, + { + "epoch": 0.7761517615176152, + "grad_norm": 0.6174686412053484, + "learning_rate": 2.8949432158357083e-06, + "loss": 1.1603, + "step": 1969 + }, + { + "epoch": 0.7765459472776546, + "grad_norm": 0.6249876696519094, + "learning_rate": 2.885262344739792e-06, + "loss": 1.2378, + "step": 1970 + }, + { + "epoch": 0.776940133037694, + "grad_norm": 0.6155008238993236, + "learning_rate": 2.875594957909136e-06, + "loss": 1.1734, + "step": 1971 + }, + { + "epoch": 0.7773343187977334, + "grad_norm": 0.6070997737354649, + "learning_rate": 2.865941073665942e-06, + "loss": 1.1533, + "step": 1972 + }, + { + "epoch": 0.7777285045577729, + "grad_norm": 0.6285112446428368, + "learning_rate": 2.8563007103068075e-06, + "loss": 1.2374, + "step": 1973 + }, + { + "epoch": 0.7781226903178122, + "grad_norm": 0.6292319074803627, + "learning_rate": 2.8466738861027143e-06, + "loss": 1.1764, + "step": 1974 + }, + { + "epoch": 0.7785168760778517, + "grad_norm": 0.6280895354859987, + "learning_rate": 2.8370606192989826e-06, + "loss": 1.2332, + "step": 1975 + }, + { + "epoch": 0.7789110618378912, + "grad_norm": 0.6392848234961054, + "learning_rate": 2.8274609281152322e-06, + "loss": 1.1681, + "step": 1976 + }, + { + "epoch": 0.7793052475979305, + "grad_norm": 0.6422553395733501, + "learning_rate": 2.8178748307453552e-06, + "loss": 1.1967, + "step": 1977 + }, + { + "epoch": 0.77969943335797, + "grad_norm": 0.6448664268947002, + "learning_rate": 2.8083023453574867e-06, + "loss": 1.1637, + "step": 1978 + }, + { + "epoch": 0.7800936191180093, + "grad_norm": 0.6268688830101503, + "learning_rate": 2.7987434900939537e-06, + "loss": 1.1992, + "step": 1979 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 0.6270584497214332, + "learning_rate": 2.7891982830712614e-06, + "loss": 1.215, + "step": 1980 + }, + { + "epoch": 0.7808819906380882, + "grad_norm": 0.6136390949207409, + "learning_rate": 2.779666742380035e-06, + "loss": 1.1842, + "step": 1981 + }, + { + "epoch": 0.7812761763981276, + "grad_norm": 0.6160721779555592, + "learning_rate": 2.7701488860850134e-06, + "loss": 1.1465, + "step": 1982 + }, + { + "epoch": 0.781670362158167, + "grad_norm": 0.6229572690437215, + "learning_rate": 2.7606447322249876e-06, + "loss": 1.1872, + "step": 1983 + }, + { + "epoch": 0.7820645479182065, + "grad_norm": 0.6120891016882081, + "learning_rate": 2.7511542988127815e-06, + "loss": 1.1933, + "step": 1984 + }, + { + "epoch": 0.7824587336782459, + "grad_norm": 0.6396299966743912, + "learning_rate": 2.7416776038352246e-06, + "loss": 1.2268, + "step": 1985 + }, + { + "epoch": 0.7828529194382853, + "grad_norm": 0.620606681831229, + "learning_rate": 2.732214665253092e-06, + "loss": 1.18, + "step": 1986 + }, + { + "epoch": 0.7832471051983247, + "grad_norm": 0.6172045847652757, + "learning_rate": 2.7227655010011034e-06, + "loss": 1.2072, + "step": 1987 + }, + { + "epoch": 0.7836412909583641, + "grad_norm": 0.6174655713344509, + "learning_rate": 2.7133301289878644e-06, + "loss": 1.1981, + "step": 1988 + }, + { + "epoch": 0.7840354767184036, + "grad_norm": 0.6453151721553436, + "learning_rate": 2.703908567095841e-06, + "loss": 1.2319, + "step": 1989 + }, + { + "epoch": 0.7844296624784429, + "grad_norm": 0.6143239403662212, + "learning_rate": 2.694500833181323e-06, + "loss": 1.1539, + "step": 1990 + }, + { + "epoch": 0.7848238482384824, + "grad_norm": 0.6118518639087388, + "learning_rate": 2.6851069450743996e-06, + "loss": 1.136, + "step": 1991 + }, + { + "epoch": 0.7852180339985219, + "grad_norm": 0.621523302552173, + "learning_rate": 2.6757269205789118e-06, + "loss": 1.1884, + "step": 1992 + }, + { + "epoch": 0.7856122197585612, + "grad_norm": 0.6177501269477549, + "learning_rate": 2.666360777472432e-06, + "loss": 1.1697, + "step": 1993 + }, + { + "epoch": 0.7860064055186007, + "grad_norm": 0.6169578769905575, + "learning_rate": 2.6570085335062166e-06, + "loss": 1.149, + "step": 1994 + }, + { + "epoch": 0.78640059127864, + "grad_norm": 0.6384469724904461, + "learning_rate": 2.6476702064051873e-06, + "loss": 1.215, + "step": 1995 + }, + { + "epoch": 0.7867947770386795, + "grad_norm": 0.6526331509523849, + "learning_rate": 2.638345813867883e-06, + "loss": 1.1834, + "step": 1996 + }, + { + "epoch": 0.7871889627987189, + "grad_norm": 0.6384058053206544, + "learning_rate": 2.629035373566433e-06, + "loss": 1.2679, + "step": 1997 + }, + { + "epoch": 0.7875831485587583, + "grad_norm": 0.6173186289000027, + "learning_rate": 2.6197389031465328e-06, + "loss": 1.1497, + "step": 1998 + }, + { + "epoch": 0.7879773343187977, + "grad_norm": 0.6179494011323186, + "learning_rate": 2.610456420227386e-06, + "loss": 1.155, + "step": 1999 + }, + { + "epoch": 0.7883715200788372, + "grad_norm": 0.6495295068681656, + "learning_rate": 2.6011879424017006e-06, + "loss": 1.1627, + "step": 2000 + }, + { + "epoch": 0.7887657058388766, + "grad_norm": 0.6124764762909571, + "learning_rate": 2.5919334872356384e-06, + "loss": 1.2092, + "step": 2001 + }, + { + "epoch": 0.789159891598916, + "grad_norm": 0.6267862591887654, + "learning_rate": 2.582693072268778e-06, + "loss": 1.2324, + "step": 2002 + }, + { + "epoch": 0.7895540773589554, + "grad_norm": 0.640938297681364, + "learning_rate": 2.573466715014089e-06, + "loss": 1.1638, + "step": 2003 + }, + { + "epoch": 0.7899482631189948, + "grad_norm": 0.6319357561158305, + "learning_rate": 2.5642544329579088e-06, + "loss": 1.1436, + "step": 2004 + }, + { + "epoch": 0.7903424488790343, + "grad_norm": 0.6599757389441551, + "learning_rate": 2.5550562435598834e-06, + "loss": 1.1859, + "step": 2005 + }, + { + "epoch": 0.7907366346390736, + "grad_norm": 0.6261460556185046, + "learning_rate": 2.5458721642529637e-06, + "loss": 1.2276, + "step": 2006 + }, + { + "epoch": 0.7911308203991131, + "grad_norm": 0.6368615447497923, + "learning_rate": 2.536702212443345e-06, + "loss": 1.126, + "step": 2007 + }, + { + "epoch": 0.7915250061591526, + "grad_norm": 0.6065232945787534, + "learning_rate": 2.5275464055104615e-06, + "loss": 1.1566, + "step": 2008 + }, + { + "epoch": 0.7919191919191919, + "grad_norm": 0.6260924052346492, + "learning_rate": 2.5184047608069283e-06, + "loss": 1.2301, + "step": 2009 + }, + { + "epoch": 0.7923133776792314, + "grad_norm": 0.5961679029421411, + "learning_rate": 2.509277295658521e-06, + "loss": 1.1195, + "step": 2010 + }, + { + "epoch": 0.7927075634392707, + "grad_norm": 0.6880173744181591, + "learning_rate": 2.500164027364147e-06, + "loss": 1.1852, + "step": 2011 + }, + { + "epoch": 0.7931017491993102, + "grad_norm": 0.591725360802608, + "learning_rate": 2.491064973195798e-06, + "loss": 1.1237, + "step": 2012 + }, + { + "epoch": 0.7934959349593496, + "grad_norm": 0.5975825860792612, + "learning_rate": 2.4819801503985365e-06, + "loss": 1.1518, + "step": 2013 + }, + { + "epoch": 0.793890120719389, + "grad_norm": 0.6221206271257661, + "learning_rate": 2.4729095761904487e-06, + "loss": 1.1838, + "step": 2014 + }, + { + "epoch": 0.7942843064794284, + "grad_norm": 0.6271650798589434, + "learning_rate": 2.4638532677626124e-06, + "loss": 1.1672, + "step": 2015 + }, + { + "epoch": 0.7946784922394678, + "grad_norm": 0.6395665538753358, + "learning_rate": 2.4548112422790695e-06, + "loss": 1.2002, + "step": 2016 + }, + { + "epoch": 0.7950726779995073, + "grad_norm": 0.6087288790926827, + "learning_rate": 2.4457835168767975e-06, + "loss": 1.1194, + "step": 2017 + }, + { + "epoch": 0.7954668637595467, + "grad_norm": 0.6099991672736873, + "learning_rate": 2.4367701086656625e-06, + "loss": 1.141, + "step": 2018 + }, + { + "epoch": 0.7958610495195861, + "grad_norm": 0.6055519755469221, + "learning_rate": 2.4277710347284035e-06, + "loss": 1.1506, + "step": 2019 + }, + { + "epoch": 0.7962552352796255, + "grad_norm": 0.653125514461312, + "learning_rate": 2.4187863121205933e-06, + "loss": 1.1804, + "step": 2020 + }, + { + "epoch": 0.796649421039665, + "grad_norm": 0.6025409266602508, + "learning_rate": 2.409815957870597e-06, + "loss": 1.1893, + "step": 2021 + }, + { + "epoch": 0.7970436067997043, + "grad_norm": 0.6126866642525495, + "learning_rate": 2.400859988979555e-06, + "loss": 1.186, + "step": 2022 + }, + { + "epoch": 0.7974377925597438, + "grad_norm": 0.6286983033908643, + "learning_rate": 2.3919184224213354e-06, + "loss": 1.1655, + "step": 2023 + }, + { + "epoch": 0.7978319783197833, + "grad_norm": 0.5932553711308323, + "learning_rate": 2.3829912751425244e-06, + "loss": 1.1778, + "step": 2024 + }, + { + "epoch": 0.7982261640798226, + "grad_norm": 0.633166520052366, + "learning_rate": 2.374078564062364e-06, + "loss": 1.1589, + "step": 2025 + }, + { + "epoch": 0.7986203498398621, + "grad_norm": 0.6299341383892152, + "learning_rate": 2.3651803060727484e-06, + "loss": 1.1603, + "step": 2026 + }, + { + "epoch": 0.7990145355999014, + "grad_norm": 0.6223977799816698, + "learning_rate": 2.3562965180381746e-06, + "loss": 1.2036, + "step": 2027 + }, + { + "epoch": 0.7994087213599409, + "grad_norm": 0.6214882966307388, + "learning_rate": 2.3474272167957144e-06, + "loss": 1.1902, + "step": 2028 + }, + { + "epoch": 0.7998029071199803, + "grad_norm": 0.6261786382679704, + "learning_rate": 2.3385724191549807e-06, + "loss": 1.1596, + "step": 2029 + }, + { + "epoch": 0.8001970928800197, + "grad_norm": 0.6179261386167846, + "learning_rate": 2.3297321418981077e-06, + "loss": 1.1601, + "step": 2030 + }, + { + "epoch": 0.8005912786400591, + "grad_norm": 0.6067017257945441, + "learning_rate": 2.3209064017797014e-06, + "loss": 1.1052, + "step": 2031 + }, + { + "epoch": 0.8009854644000985, + "grad_norm": 0.6030346397003117, + "learning_rate": 2.312095215526814e-06, + "loss": 1.1272, + "step": 2032 + }, + { + "epoch": 0.801379650160138, + "grad_norm": 0.6187228819182855, + "learning_rate": 2.3032985998389236e-06, + "loss": 1.2039, + "step": 2033 + }, + { + "epoch": 0.8017738359201774, + "grad_norm": 0.6190809264452526, + "learning_rate": 2.29451657138789e-06, + "loss": 1.2414, + "step": 2034 + }, + { + "epoch": 0.8021680216802168, + "grad_norm": 0.6083179570546223, + "learning_rate": 2.285749146817924e-06, + "loss": 1.1508, + "step": 2035 + }, + { + "epoch": 0.8025622074402562, + "grad_norm": 0.5937926599332075, + "learning_rate": 2.2769963427455555e-06, + "loss": 1.0988, + "step": 2036 + }, + { + "epoch": 0.8029563932002957, + "grad_norm": 0.6173897531116277, + "learning_rate": 2.2682581757596144e-06, + "loss": 1.1962, + "step": 2037 + }, + { + "epoch": 0.803350578960335, + "grad_norm": 0.5854683327803459, + "learning_rate": 2.259534662421179e-06, + "loss": 1.1119, + "step": 2038 + }, + { + "epoch": 0.8037447647203745, + "grad_norm": 0.6170817511105888, + "learning_rate": 2.2508258192635614e-06, + "loss": 1.1889, + "step": 2039 + }, + { + "epoch": 0.804138950480414, + "grad_norm": 0.6159894762027561, + "learning_rate": 2.242131662792272e-06, + "loss": 1.1667, + "step": 2040 + }, + { + "epoch": 0.8045331362404533, + "grad_norm": 0.6118649548400591, + "learning_rate": 2.2334522094849798e-06, + "loss": 1.1371, + "step": 2041 + }, + { + "epoch": 0.8049273220004928, + "grad_norm": 0.6392916794711796, + "learning_rate": 2.2247874757914865e-06, + "loss": 1.1846, + "step": 2042 + }, + { + "epoch": 0.8053215077605321, + "grad_norm": 0.5941927210409212, + "learning_rate": 2.2161374781337084e-06, + "loss": 1.1291, + "step": 2043 + }, + { + "epoch": 0.8057156935205716, + "grad_norm": 0.6294242082032777, + "learning_rate": 2.2075022329056193e-06, + "loss": 1.2009, + "step": 2044 + }, + { + "epoch": 0.806109879280611, + "grad_norm": 0.6422605646655121, + "learning_rate": 2.198881756473238e-06, + "loss": 1.2299, + "step": 2045 + }, + { + "epoch": 0.8065040650406504, + "grad_norm": 0.6563848866016602, + "learning_rate": 2.190276065174596e-06, + "loss": 1.2258, + "step": 2046 + }, + { + "epoch": 0.8068982508006898, + "grad_norm": 0.6448012423504815, + "learning_rate": 2.1816851753197023e-06, + "loss": 1.1881, + "step": 2047 + }, + { + "epoch": 0.8072924365607292, + "grad_norm": 0.597728406050263, + "learning_rate": 2.1731091031905118e-06, + "loss": 1.1688, + "step": 2048 + }, + { + "epoch": 0.8076866223207687, + "grad_norm": 0.5886841825944683, + "learning_rate": 2.164547865040889e-06, + "loss": 1.124, + "step": 2049 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.6142796262742458, + "learning_rate": 2.156001477096601e-06, + "loss": 1.2032, + "step": 2050 + }, + { + "epoch": 0.8084749938408475, + "grad_norm": 0.6175251461681956, + "learning_rate": 2.1474699555552527e-06, + "loss": 1.1787, + "step": 2051 + }, + { + "epoch": 0.8088691796008869, + "grad_norm": 0.6139100518824416, + "learning_rate": 2.138953316586283e-06, + "loss": 1.1953, + "step": 2052 + }, + { + "epoch": 0.8092633653609264, + "grad_norm": 0.6430044371047359, + "learning_rate": 2.130451576330925e-06, + "loss": 1.2208, + "step": 2053 + }, + { + "epoch": 0.8096575511209657, + "grad_norm": 0.6111371447533479, + "learning_rate": 2.12196475090217e-06, + "loss": 1.1537, + "step": 2054 + }, + { + "epoch": 0.8100517368810052, + "grad_norm": 0.6150669801063049, + "learning_rate": 2.113492856384741e-06, + "loss": 1.1211, + "step": 2055 + }, + { + "epoch": 0.8104459226410446, + "grad_norm": 0.6290841991274971, + "learning_rate": 2.1050359088350724e-06, + "loss": 1.2084, + "step": 2056 + }, + { + "epoch": 0.810840108401084, + "grad_norm": 0.6053161669582096, + "learning_rate": 2.0965939242812594e-06, + "loss": 1.1343, + "step": 2057 + }, + { + "epoch": 0.8112342941611235, + "grad_norm": 0.623034572056998, + "learning_rate": 2.0881669187230415e-06, + "loss": 1.1616, + "step": 2058 + }, + { + "epoch": 0.8116284799211628, + "grad_norm": 0.6122769163475099, + "learning_rate": 2.0797549081317724e-06, + "loss": 1.1639, + "step": 2059 + }, + { + "epoch": 0.8120226656812023, + "grad_norm": 0.6241014032007793, + "learning_rate": 2.0713579084503877e-06, + "loss": 1.2213, + "step": 2060 + }, + { + "epoch": 0.8124168514412416, + "grad_norm": 0.6054665326241209, + "learning_rate": 2.0629759355933665e-06, + "loss": 1.183, + "step": 2061 + }, + { + "epoch": 0.8128110372012811, + "grad_norm": 0.6131850542325953, + "learning_rate": 2.0546090054467118e-06, + "loss": 1.1867, + "step": 2062 + }, + { + "epoch": 0.8132052229613205, + "grad_norm": 0.5905612318597147, + "learning_rate": 2.0462571338679204e-06, + "loss": 1.1652, + "step": 2063 + }, + { + "epoch": 0.8135994087213599, + "grad_norm": 0.6086745867605593, + "learning_rate": 2.0379203366859413e-06, + "loss": 1.1749, + "step": 2064 + }, + { + "epoch": 0.8139935944813994, + "grad_norm": 0.6547726012282458, + "learning_rate": 2.0295986297011603e-06, + "loss": 1.2606, + "step": 2065 + }, + { + "epoch": 0.8143877802414388, + "grad_norm": 0.6176365863473255, + "learning_rate": 2.0212920286853656e-06, + "loss": 1.1631, + "step": 2066 + }, + { + "epoch": 0.8147819660014782, + "grad_norm": 0.5969133841837041, + "learning_rate": 2.0130005493817063e-06, + "loss": 1.1818, + "step": 2067 + }, + { + "epoch": 0.8151761517615176, + "grad_norm": 0.6095137689005168, + "learning_rate": 2.004724207504675e-06, + "loss": 1.1147, + "step": 2068 + }, + { + "epoch": 0.815570337521557, + "grad_norm": 0.6149824366682144, + "learning_rate": 1.9964630187400834e-06, + "loss": 1.1667, + "step": 2069 + }, + { + "epoch": 0.8159645232815964, + "grad_norm": 0.6076416587106072, + "learning_rate": 1.988216998745014e-06, + "loss": 1.1657, + "step": 2070 + }, + { + "epoch": 0.8163587090416359, + "grad_norm": 0.6378102035141168, + "learning_rate": 1.9799861631478013e-06, + "loss": 1.1748, + "step": 2071 + }, + { + "epoch": 0.8167528948016753, + "grad_norm": 0.6018846786576992, + "learning_rate": 1.971770527548008e-06, + "loss": 1.1243, + "step": 2072 + }, + { + "epoch": 0.8171470805617147, + "grad_norm": 0.6072693290996355, + "learning_rate": 1.9635701075163884e-06, + "loss": 1.1456, + "step": 2073 + }, + { + "epoch": 0.8175412663217542, + "grad_norm": 0.6188901773945752, + "learning_rate": 1.9553849185948514e-06, + "loss": 1.2303, + "step": 2074 + }, + { + "epoch": 0.8179354520817935, + "grad_norm": 0.6652688896175301, + "learning_rate": 1.947214976296443e-06, + "loss": 1.2502, + "step": 2075 + }, + { + "epoch": 0.818329637841833, + "grad_norm": 0.6180903878734494, + "learning_rate": 1.9390602961053194e-06, + "loss": 1.156, + "step": 2076 + }, + { + "epoch": 0.8187238236018723, + "grad_norm": 0.6125254270472376, + "learning_rate": 1.930920893476701e-06, + "loss": 1.1941, + "step": 2077 + }, + { + "epoch": 0.8191180093619118, + "grad_norm": 0.623138908331946, + "learning_rate": 1.9227967838368566e-06, + "loss": 1.1965, + "step": 2078 + }, + { + "epoch": 0.8195121951219512, + "grad_norm": 0.615972707734638, + "learning_rate": 1.9146879825830753e-06, + "loss": 1.1691, + "step": 2079 + }, + { + "epoch": 0.8199063808819906, + "grad_norm": 0.6000820870368339, + "learning_rate": 1.9065945050836299e-06, + "loss": 1.1169, + "step": 2080 + }, + { + "epoch": 0.8203005666420301, + "grad_norm": 0.609742615231763, + "learning_rate": 1.8985163666777473e-06, + "loss": 1.1694, + "step": 2081 + }, + { + "epoch": 0.8206947524020695, + "grad_norm": 0.6200332366286192, + "learning_rate": 1.890453582675591e-06, + "loss": 1.1225, + "step": 2082 + }, + { + "epoch": 0.8210889381621089, + "grad_norm": 0.6145307042295974, + "learning_rate": 1.882406168358215e-06, + "loss": 1.1893, + "step": 2083 + }, + { + "epoch": 0.8214831239221483, + "grad_norm": 0.613663996359055, + "learning_rate": 1.8743741389775472e-06, + "loss": 1.2003, + "step": 2084 + }, + { + "epoch": 0.8218773096821878, + "grad_norm": 0.6163140729383925, + "learning_rate": 1.866357509756358e-06, + "loss": 1.1625, + "step": 2085 + }, + { + "epoch": 0.8222714954422271, + "grad_norm": 0.6093496583736225, + "learning_rate": 1.8583562958882329e-06, + "loss": 1.1604, + "step": 2086 + }, + { + "epoch": 0.8226656812022666, + "grad_norm": 0.6112581505765976, + "learning_rate": 1.8503705125375382e-06, + "loss": 1.12, + "step": 2087 + }, + { + "epoch": 0.823059866962306, + "grad_norm": 0.6187957102380715, + "learning_rate": 1.8424001748393905e-06, + "loss": 1.2006, + "step": 2088 + }, + { + "epoch": 0.8234540527223454, + "grad_norm": 0.6131303613972927, + "learning_rate": 1.8344452978996441e-06, + "loss": 1.1182, + "step": 2089 + }, + { + "epoch": 0.8238482384823849, + "grad_norm": 0.6096435231696508, + "learning_rate": 1.8265058967948434e-06, + "loss": 1.0993, + "step": 2090 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.6188414868551905, + "learning_rate": 1.818581986572201e-06, + "loss": 1.2266, + "step": 2091 + }, + { + "epoch": 0.8246366100024637, + "grad_norm": 0.6187428595993414, + "learning_rate": 1.8106735822495746e-06, + "loss": 1.2269, + "step": 2092 + }, + { + "epoch": 0.825030795762503, + "grad_norm": 0.6158407049072168, + "learning_rate": 1.8027806988154373e-06, + "loss": 1.1678, + "step": 2093 + }, + { + "epoch": 0.8254249815225425, + "grad_norm": 0.6274441437082312, + "learning_rate": 1.794903351228835e-06, + "loss": 1.2211, + "step": 2094 + }, + { + "epoch": 0.8258191672825819, + "grad_norm": 0.6161979943389017, + "learning_rate": 1.7870415544193808e-06, + "loss": 1.1381, + "step": 2095 + }, + { + "epoch": 0.8262133530426213, + "grad_norm": 0.6192811967277538, + "learning_rate": 1.7791953232872083e-06, + "loss": 1.1739, + "step": 2096 + }, + { + "epoch": 0.8266075388026608, + "grad_norm": 0.6261988603055474, + "learning_rate": 1.7713646727029476e-06, + "loss": 1.1864, + "step": 2097 + }, + { + "epoch": 0.8270017245627002, + "grad_norm": 0.6383885993657525, + "learning_rate": 1.7635496175077082e-06, + "loss": 1.1576, + "step": 2098 + }, + { + "epoch": 0.8273959103227396, + "grad_norm": 0.6401328645174053, + "learning_rate": 1.755750172513041e-06, + "loss": 1.1973, + "step": 2099 + }, + { + "epoch": 0.827790096082779, + "grad_norm": 0.6520856363314526, + "learning_rate": 1.747966352500904e-06, + "loss": 1.2282, + "step": 2100 + }, + { + "epoch": 0.8281842818428184, + "grad_norm": 0.6338910603246662, + "learning_rate": 1.7401981722236438e-06, + "loss": 1.175, + "step": 2101 + }, + { + "epoch": 0.8285784676028578, + "grad_norm": 0.614780711742896, + "learning_rate": 1.7324456464039751e-06, + "loss": 1.219, + "step": 2102 + }, + { + "epoch": 0.8289726533628973, + "grad_norm": 0.6320193678396515, + "learning_rate": 1.7247087897349334e-06, + "loss": 1.234, + "step": 2103 + }, + { + "epoch": 0.8293668391229367, + "grad_norm": 0.6148462845714023, + "learning_rate": 1.7169876168798561e-06, + "loss": 1.207, + "step": 2104 + }, + { + "epoch": 0.8297610248829761, + "grad_norm": 0.6183637939087829, + "learning_rate": 1.7092821424723637e-06, + "loss": 1.191, + "step": 2105 + }, + { + "epoch": 0.8301552106430156, + "grad_norm": 0.6242963838055702, + "learning_rate": 1.7015923811163225e-06, + "loss": 1.2022, + "step": 2106 + }, + { + "epoch": 0.8305493964030549, + "grad_norm": 0.5988324551990205, + "learning_rate": 1.6939183473858101e-06, + "loss": 1.1113, + "step": 2107 + }, + { + "epoch": 0.8309435821630944, + "grad_norm": 0.6110399627678608, + "learning_rate": 1.6862600558251097e-06, + "loss": 1.14, + "step": 2108 + }, + { + "epoch": 0.8313377679231337, + "grad_norm": 0.6048300072512719, + "learning_rate": 1.6786175209486565e-06, + "loss": 1.1364, + "step": 2109 + }, + { + "epoch": 0.8317319536831732, + "grad_norm": 0.6191088800533002, + "learning_rate": 1.6709907572410266e-06, + "loss": 1.1591, + "step": 2110 + }, + { + "epoch": 0.8321261394432126, + "grad_norm": 0.6374165341976098, + "learning_rate": 1.6633797791569085e-06, + "loss": 1.1927, + "step": 2111 + }, + { + "epoch": 0.832520325203252, + "grad_norm": 0.6047378641330573, + "learning_rate": 1.6557846011210753e-06, + "loss": 1.1895, + "step": 2112 + }, + { + "epoch": 0.8329145109632915, + "grad_norm": 0.6180978122031335, + "learning_rate": 1.6482052375283442e-06, + "loss": 1.1932, + "step": 2113 + }, + { + "epoch": 0.8333086967233309, + "grad_norm": 0.6187193373594739, + "learning_rate": 1.6406417027435728e-06, + "loss": 1.2001, + "step": 2114 + }, + { + "epoch": 0.8337028824833703, + "grad_norm": 0.6055455770427833, + "learning_rate": 1.6330940111016103e-06, + "loss": 1.2135, + "step": 2115 + }, + { + "epoch": 0.8340970682434097, + "grad_norm": 0.6226585371397162, + "learning_rate": 1.6255621769072805e-06, + "loss": 1.2023, + "step": 2116 + }, + { + "epoch": 0.8344912540034491, + "grad_norm": 0.5949274417124252, + "learning_rate": 1.6180462144353526e-06, + "loss": 1.1744, + "step": 2117 + }, + { + "epoch": 0.8348854397634885, + "grad_norm": 0.6339414631453146, + "learning_rate": 1.6105461379305187e-06, + "loss": 1.1836, + "step": 2118 + }, + { + "epoch": 0.835279625523528, + "grad_norm": 0.6095339519814128, + "learning_rate": 1.6030619616073628e-06, + "loss": 1.1468, + "step": 2119 + }, + { + "epoch": 0.8356738112835674, + "grad_norm": 0.6227699723957059, + "learning_rate": 1.5955936996503285e-06, + "loss": 1.1617, + "step": 2120 + }, + { + "epoch": 0.8360679970436068, + "grad_norm": 0.6058636715701863, + "learning_rate": 1.5881413662137047e-06, + "loss": 1.2089, + "step": 2121 + }, + { + "epoch": 0.8364621828036463, + "grad_norm": 0.6345005146493108, + "learning_rate": 1.580704975421584e-06, + "loss": 1.2159, + "step": 2122 + }, + { + "epoch": 0.8368563685636856, + "grad_norm": 0.6425121234333704, + "learning_rate": 1.5732845413678477e-06, + "loss": 1.1546, + "step": 2123 + }, + { + "epoch": 0.8372505543237251, + "grad_norm": 0.6217776321143101, + "learning_rate": 1.5658800781161365e-06, + "loss": 1.1201, + "step": 2124 + }, + { + "epoch": 0.8376447400837644, + "grad_norm": 0.6291793073582329, + "learning_rate": 1.5584915996998217e-06, + "loss": 1.2199, + "step": 2125 + }, + { + "epoch": 0.8380389258438039, + "grad_norm": 0.6413491306262445, + "learning_rate": 1.5511191201219733e-06, + "loss": 1.1387, + "step": 2126 + }, + { + "epoch": 0.8384331116038433, + "grad_norm": 0.5968787571090911, + "learning_rate": 1.5437626533553497e-06, + "loss": 1.1677, + "step": 2127 + }, + { + "epoch": 0.8388272973638827, + "grad_norm": 0.6266812335989616, + "learning_rate": 1.5364222133423523e-06, + "loss": 1.1488, + "step": 2128 + }, + { + "epoch": 0.8392214831239222, + "grad_norm": 0.6179499573451991, + "learning_rate": 1.5290978139950108e-06, + "loss": 1.1462, + "step": 2129 + }, + { + "epoch": 0.8396156688839616, + "grad_norm": 0.6020456787105313, + "learning_rate": 1.521789469194952e-06, + "loss": 1.1895, + "step": 2130 + }, + { + "epoch": 0.840009854644001, + "grad_norm": 0.6142152475528356, + "learning_rate": 1.514497192793377e-06, + "loss": 1.1928, + "step": 2131 + }, + { + "epoch": 0.8404040404040404, + "grad_norm": 0.6418120903036971, + "learning_rate": 1.5072209986110376e-06, + "loss": 1.1873, + "step": 2132 + }, + { + "epoch": 0.8407982261640798, + "grad_norm": 0.6022912765250543, + "learning_rate": 1.4999609004381944e-06, + "loss": 1.1693, + "step": 2133 + }, + { + "epoch": 0.8411924119241192, + "grad_norm": 0.6241117050709148, + "learning_rate": 1.492716912034614e-06, + "loss": 1.1556, + "step": 2134 + }, + { + "epoch": 0.8415865976841587, + "grad_norm": 0.6088366197098409, + "learning_rate": 1.4854890471295225e-06, + "loss": 1.2307, + "step": 2135 + }, + { + "epoch": 0.8419807834441981, + "grad_norm": 0.626345154331026, + "learning_rate": 1.4782773194215883e-06, + "loss": 1.1245, + "step": 2136 + }, + { + "epoch": 0.8423749692042375, + "grad_norm": 0.6214268575987325, + "learning_rate": 1.4710817425789015e-06, + "loss": 1.1974, + "step": 2137 + }, + { + "epoch": 0.842769154964277, + "grad_norm": 0.6157509713525812, + "learning_rate": 1.4639023302389366e-06, + "loss": 1.1889, + "step": 2138 + }, + { + "epoch": 0.8431633407243163, + "grad_norm": 0.6351261747898632, + "learning_rate": 1.4567390960085325e-06, + "loss": 1.1981, + "step": 2139 + }, + { + "epoch": 0.8435575264843558, + "grad_norm": 0.6067571512713051, + "learning_rate": 1.4495920534638741e-06, + "loss": 1.1582, + "step": 2140 + }, + { + "epoch": 0.8439517122443951, + "grad_norm": 0.607006794382876, + "learning_rate": 1.4424612161504482e-06, + "loss": 1.1623, + "step": 2141 + }, + { + "epoch": 0.8443458980044346, + "grad_norm": 0.5784739791881964, + "learning_rate": 1.435346597583034e-06, + "loss": 1.116, + "step": 2142 + }, + { + "epoch": 0.844740083764474, + "grad_norm": 0.6124576542474655, + "learning_rate": 1.4282482112456686e-06, + "loss": 1.1986, + "step": 2143 + }, + { + "epoch": 0.8451342695245134, + "grad_norm": 0.6311729127767527, + "learning_rate": 1.4211660705916286e-06, + "loss": 1.2564, + "step": 2144 + }, + { + "epoch": 0.8455284552845529, + "grad_norm": 0.6337920894968637, + "learning_rate": 1.4141001890434035e-06, + "loss": 1.2245, + "step": 2145 + }, + { + "epoch": 0.8459226410445922, + "grad_norm": 0.5962616813895122, + "learning_rate": 1.407050579992658e-06, + "loss": 1.1572, + "step": 2146 + }, + { + "epoch": 0.8463168268046317, + "grad_norm": 0.6077208562957639, + "learning_rate": 1.4000172568002268e-06, + "loss": 1.1588, + "step": 2147 + }, + { + "epoch": 0.8467110125646711, + "grad_norm": 0.6206827599971425, + "learning_rate": 1.3930002327960702e-06, + "loss": 1.2329, + "step": 2148 + }, + { + "epoch": 0.8471051983247105, + "grad_norm": 0.6031727874430762, + "learning_rate": 1.385999521279261e-06, + "loss": 1.1409, + "step": 2149 + }, + { + "epoch": 0.8474993840847499, + "grad_norm": 0.6034983041379499, + "learning_rate": 1.3790151355179581e-06, + "loss": 1.2088, + "step": 2150 + }, + { + "epoch": 0.8478935698447894, + "grad_norm": 0.5944921464470333, + "learning_rate": 1.372047088749372e-06, + "loss": 1.1279, + "step": 2151 + }, + { + "epoch": 0.8482877556048288, + "grad_norm": 0.6214516653434409, + "learning_rate": 1.365095394179754e-06, + "loss": 1.2763, + "step": 2152 + }, + { + "epoch": 0.8486819413648682, + "grad_norm": 0.6442848968344648, + "learning_rate": 1.3581600649843617e-06, + "loss": 1.2047, + "step": 2153 + }, + { + "epoch": 0.8490761271249077, + "grad_norm": 0.6069453066470716, + "learning_rate": 1.3512411143074333e-06, + "loss": 1.1663, + "step": 2154 + }, + { + "epoch": 0.849470312884947, + "grad_norm": 0.632212528850588, + "learning_rate": 1.344338555262168e-06, + "loss": 1.1797, + "step": 2155 + }, + { + "epoch": 0.8498644986449865, + "grad_norm": 0.6551418490552343, + "learning_rate": 1.3374524009306944e-06, + "loss": 1.2136, + "step": 2156 + }, + { + "epoch": 0.8502586844050258, + "grad_norm": 0.6182185289441392, + "learning_rate": 1.3305826643640552e-06, + "loss": 1.1878, + "step": 2157 + }, + { + "epoch": 0.8506528701650653, + "grad_norm": 0.6177346028571237, + "learning_rate": 1.3237293585821786e-06, + "loss": 1.1659, + "step": 2158 + }, + { + "epoch": 0.8510470559251047, + "grad_norm": 0.6174374468477092, + "learning_rate": 1.316892496573845e-06, + "loss": 1.1553, + "step": 2159 + }, + { + "epoch": 0.8514412416851441, + "grad_norm": 0.6130949007768408, + "learning_rate": 1.310072091296677e-06, + "loss": 1.1732, + "step": 2160 + }, + { + "epoch": 0.8518354274451836, + "grad_norm": 0.6061989244208447, + "learning_rate": 1.303268155677101e-06, + "loss": 1.1714, + "step": 2161 + }, + { + "epoch": 0.852229613205223, + "grad_norm": 0.6088152483466427, + "learning_rate": 1.296480702610332e-06, + "loss": 1.1614, + "step": 2162 + }, + { + "epoch": 0.8526237989652624, + "grad_norm": 0.6410096353876902, + "learning_rate": 1.2897097449603491e-06, + "loss": 1.243, + "step": 2163 + }, + { + "epoch": 0.8530179847253018, + "grad_norm": 0.6215005861175246, + "learning_rate": 1.2829552955598623e-06, + "loss": 1.2266, + "step": 2164 + }, + { + "epoch": 0.8534121704853412, + "grad_norm": 0.6308618646844184, + "learning_rate": 1.2762173672102996e-06, + "loss": 1.2355, + "step": 2165 + }, + { + "epoch": 0.8538063562453806, + "grad_norm": 0.611573077552191, + "learning_rate": 1.269495972681777e-06, + "loss": 1.1797, + "step": 2166 + }, + { + "epoch": 0.8542005420054201, + "grad_norm": 0.6275131772886295, + "learning_rate": 1.2627911247130709e-06, + "loss": 1.1919, + "step": 2167 + }, + { + "epoch": 0.8545947277654595, + "grad_norm": 0.5993315352532911, + "learning_rate": 1.2561028360116002e-06, + "loss": 1.1554, + "step": 2168 + }, + { + "epoch": 0.8549889135254989, + "grad_norm": 0.6007090422412275, + "learning_rate": 1.2494311192533958e-06, + "loss": 1.1593, + "step": 2169 + }, + { + "epoch": 0.8553830992855384, + "grad_norm": 0.6260215764887312, + "learning_rate": 1.242775987083088e-06, + "loss": 1.1785, + "step": 2170 + }, + { + "epoch": 0.8557772850455777, + "grad_norm": 0.6072634488679926, + "learning_rate": 1.2361374521138724e-06, + "loss": 1.1744, + "step": 2171 + }, + { + "epoch": 0.8561714708056172, + "grad_norm": 0.6121816712097319, + "learning_rate": 1.2295155269274827e-06, + "loss": 1.1959, + "step": 2172 + }, + { + "epoch": 0.8565656565656565, + "grad_norm": 0.60232884933228, + "learning_rate": 1.2229102240741819e-06, + "loss": 1.1909, + "step": 2173 + }, + { + "epoch": 0.856959842325696, + "grad_norm": 0.6219022324990678, + "learning_rate": 1.2163215560727215e-06, + "loss": 1.2573, + "step": 2174 + }, + { + "epoch": 0.8573540280857354, + "grad_norm": 0.6432583376483387, + "learning_rate": 1.2097495354103284e-06, + "loss": 1.153, + "step": 2175 + }, + { + "epoch": 0.8577482138457748, + "grad_norm": 0.6057914024761237, + "learning_rate": 1.2031941745426824e-06, + "loss": 1.1835, + "step": 2176 + }, + { + "epoch": 0.8581423996058143, + "grad_norm": 0.5896128109103955, + "learning_rate": 1.1966554858938805e-06, + "loss": 1.1695, + "step": 2177 + }, + { + "epoch": 0.8585365853658536, + "grad_norm": 0.611114769313689, + "learning_rate": 1.1901334818564291e-06, + "loss": 1.1891, + "step": 2178 + }, + { + "epoch": 0.8589307711258931, + "grad_norm": 0.6057440341466516, + "learning_rate": 1.1836281747912125e-06, + "loss": 1.1829, + "step": 2179 + }, + { + "epoch": 0.8593249568859325, + "grad_norm": 0.6070873449171827, + "learning_rate": 1.1771395770274653e-06, + "loss": 1.1444, + "step": 2180 + }, + { + "epoch": 0.8597191426459719, + "grad_norm": 0.6173928300019214, + "learning_rate": 1.1706677008627564e-06, + "loss": 1.1758, + "step": 2181 + }, + { + "epoch": 0.8601133284060113, + "grad_norm": 0.620761797942304, + "learning_rate": 1.1642125585629593e-06, + "loss": 1.2022, + "step": 2182 + }, + { + "epoch": 0.8605075141660508, + "grad_norm": 0.6296457077216101, + "learning_rate": 1.1577741623622407e-06, + "loss": 1.1907, + "step": 2183 + }, + { + "epoch": 0.8609016999260902, + "grad_norm": 0.6203549213795299, + "learning_rate": 1.1513525244630198e-06, + "loss": 1.2293, + "step": 2184 + }, + { + "epoch": 0.8612958856861296, + "grad_norm": 0.6120086583589758, + "learning_rate": 1.1449476570359608e-06, + "loss": 1.118, + "step": 2185 + }, + { + "epoch": 0.861690071446169, + "grad_norm": 0.6044150524885432, + "learning_rate": 1.1385595722199438e-06, + "loss": 1.1275, + "step": 2186 + }, + { + "epoch": 0.8620842572062084, + "grad_norm": 0.6216834948320731, + "learning_rate": 1.1321882821220375e-06, + "loss": 1.2583, + "step": 2187 + }, + { + "epoch": 0.8624784429662479, + "grad_norm": 0.6314861381611362, + "learning_rate": 1.1258337988174794e-06, + "loss": 1.1917, + "step": 2188 + }, + { + "epoch": 0.8628726287262872, + "grad_norm": 0.6086856686806165, + "learning_rate": 1.1194961343496603e-06, + "loss": 1.2272, + "step": 2189 + }, + { + "epoch": 0.8632668144863267, + "grad_norm": 0.5983542589167679, + "learning_rate": 1.1131753007300884e-06, + "loss": 1.1747, + "step": 2190 + }, + { + "epoch": 0.863661000246366, + "grad_norm": 0.6196216583286488, + "learning_rate": 1.1068713099383754e-06, + "loss": 1.1563, + "step": 2191 + }, + { + "epoch": 0.8640551860064055, + "grad_norm": 0.622973730967306, + "learning_rate": 1.1005841739222166e-06, + "loss": 1.1721, + "step": 2192 + }, + { + "epoch": 0.864449371766445, + "grad_norm": 0.6084922385739949, + "learning_rate": 1.094313904597355e-06, + "loss": 1.2149, + "step": 2193 + }, + { + "epoch": 0.8648435575264843, + "grad_norm": 0.6017658686517071, + "learning_rate": 1.0880605138475708e-06, + "loss": 1.1582, + "step": 2194 + }, + { + "epoch": 0.8652377432865238, + "grad_norm": 0.6242920242129635, + "learning_rate": 1.0818240135246528e-06, + "loss": 1.2032, + "step": 2195 + }, + { + "epoch": 0.8656319290465632, + "grad_norm": 0.627892199233753, + "learning_rate": 1.0756044154483813e-06, + "loss": 1.2027, + "step": 2196 + }, + { + "epoch": 0.8660261148066026, + "grad_norm": 0.630460438152927, + "learning_rate": 1.0694017314064997e-06, + "loss": 1.2043, + "step": 2197 + }, + { + "epoch": 0.866420300566642, + "grad_norm": 0.5912369379567544, + "learning_rate": 1.0632159731546965e-06, + "loss": 1.1947, + "step": 2198 + }, + { + "epoch": 0.8668144863266815, + "grad_norm": 0.6032500593156851, + "learning_rate": 1.057047152416585e-06, + "loss": 1.229, + "step": 2199 + }, + { + "epoch": 0.8672086720867209, + "grad_norm": 0.6224700658910649, + "learning_rate": 1.0508952808836682e-06, + "loss": 1.1966, + "step": 2200 + }, + { + "epoch": 0.8676028578467603, + "grad_norm": 0.5995356945189887, + "learning_rate": 1.044760370215333e-06, + "loss": 1.1371, + "step": 2201 + }, + { + "epoch": 0.8679970436067997, + "grad_norm": 0.6264831422167915, + "learning_rate": 1.038642432038821e-06, + "loss": 1.1853, + "step": 2202 + }, + { + "epoch": 0.8683912293668391, + "grad_norm": 0.6112624994424279, + "learning_rate": 1.0325414779492028e-06, + "loss": 1.1631, + "step": 2203 + }, + { + "epoch": 0.8687854151268786, + "grad_norm": 0.6028695555356325, + "learning_rate": 1.0264575195093628e-06, + "loss": 1.1203, + "step": 2204 + }, + { + "epoch": 0.8691796008869179, + "grad_norm": 0.5908979194467311, + "learning_rate": 1.020390568249976e-06, + "loss": 1.1464, + "step": 2205 + }, + { + "epoch": 0.8695737866469574, + "grad_norm": 0.6020405748750884, + "learning_rate": 1.0143406356694797e-06, + "loss": 1.1964, + "step": 2206 + }, + { + "epoch": 0.8699679724069967, + "grad_norm": 0.5976257450496796, + "learning_rate": 1.0083077332340563e-06, + "loss": 1.1588, + "step": 2207 + }, + { + "epoch": 0.8703621581670362, + "grad_norm": 0.5924445023992051, + "learning_rate": 1.0022918723776175e-06, + "loss": 1.1257, + "step": 2208 + }, + { + "epoch": 0.8707563439270757, + "grad_norm": 0.6344444392731119, + "learning_rate": 9.962930645017731e-07, + "loss": 1.1801, + "step": 2209 + }, + { + "epoch": 0.871150529687115, + "grad_norm": 0.6241397033723098, + "learning_rate": 9.903113209758098e-07, + "loss": 1.1347, + "step": 2210 + }, + { + "epoch": 0.8715447154471545, + "grad_norm": 0.6092226491641914, + "learning_rate": 9.843466531366774e-07, + "loss": 1.0919, + "step": 2211 + }, + { + "epoch": 0.8719389012071939, + "grad_norm": 0.6236635571749678, + "learning_rate": 9.783990722889658e-07, + "loss": 1.231, + "step": 2212 + }, + { + "epoch": 0.8723330869672333, + "grad_norm": 0.6136904911563315, + "learning_rate": 9.724685897048747e-07, + "loss": 1.2087, + "step": 2213 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.6091769703428004, + "learning_rate": 9.665552166241965e-07, + "loss": 1.1516, + "step": 2214 + }, + { + "epoch": 0.8731214584873122, + "grad_norm": 0.5777422885075877, + "learning_rate": 9.606589642543064e-07, + "loss": 1.1211, + "step": 2215 + }, + { + "epoch": 0.8735156442473516, + "grad_norm": 0.6279241245367188, + "learning_rate": 9.547798437701194e-07, + "loss": 1.1701, + "step": 2216 + }, + { + "epoch": 0.873909830007391, + "grad_norm": 0.6055169158607546, + "learning_rate": 9.489178663140897e-07, + "loss": 1.1508, + "step": 2217 + }, + { + "epoch": 0.8743040157674304, + "grad_norm": 0.6227455138805572, + "learning_rate": 9.43073042996181e-07, + "loss": 1.1853, + "step": 2218 + }, + { + "epoch": 0.8746982015274698, + "grad_norm": 0.6205644720521007, + "learning_rate": 9.372453848938401e-07, + "loss": 1.1604, + "step": 2219 + }, + { + "epoch": 0.8750923872875093, + "grad_norm": 0.5946939883094988, + "learning_rate": 9.314349030519843e-07, + "loss": 1.1243, + "step": 2220 + }, + { + "epoch": 0.8754865730475486, + "grad_norm": 0.6057397264443781, + "learning_rate": 9.256416084829778e-07, + "loss": 1.141, + "step": 2221 + }, + { + "epoch": 0.8758807588075881, + "grad_norm": 0.6080411686477221, + "learning_rate": 9.198655121666111e-07, + "loss": 1.1783, + "step": 2222 + }, + { + "epoch": 0.8762749445676274, + "grad_norm": 0.6005470900378805, + "learning_rate": 9.141066250500741e-07, + "loss": 1.147, + "step": 2223 + }, + { + "epoch": 0.8766691303276669, + "grad_norm": 0.5945362980985712, + "learning_rate": 9.083649580479493e-07, + "loss": 1.1036, + "step": 2224 + }, + { + "epoch": 0.8770633160877064, + "grad_norm": 0.6099070701658922, + "learning_rate": 9.026405220421785e-07, + "loss": 1.155, + "step": 2225 + }, + { + "epoch": 0.8774575018477457, + "grad_norm": 0.6137077181265143, + "learning_rate": 8.969333278820447e-07, + "loss": 1.1849, + "step": 2226 + }, + { + "epoch": 0.8778516876077852, + "grad_norm": 0.6082519323627844, + "learning_rate": 8.912433863841541e-07, + "loss": 1.1608, + "step": 2227 + }, + { + "epoch": 0.8782458733678246, + "grad_norm": 0.604418332046713, + "learning_rate": 8.855707083324183e-07, + "loss": 1.1366, + "step": 2228 + }, + { + "epoch": 0.878640059127864, + "grad_norm": 0.58974397331068, + "learning_rate": 8.799153044780229e-07, + "loss": 1.1366, + "step": 2229 + }, + { + "epoch": 0.8790342448879034, + "grad_norm": 0.652855576695134, + "learning_rate": 8.742771855394205e-07, + "loss": 1.2052, + "step": 2230 + }, + { + "epoch": 0.8794284306479428, + "grad_norm": 0.606150321404692, + "learning_rate": 8.686563622023059e-07, + "loss": 1.1637, + "step": 2231 + }, + { + "epoch": 0.8798226164079823, + "grad_norm": 0.5985881774469998, + "learning_rate": 8.630528451195874e-07, + "loss": 1.1659, + "step": 2232 + }, + { + "epoch": 0.8802168021680217, + "grad_norm": 0.6204340076356355, + "learning_rate": 8.574666449113766e-07, + "loss": 1.1584, + "step": 2233 + }, + { + "epoch": 0.8806109879280611, + "grad_norm": 0.6270054382615008, + "learning_rate": 8.518977721649679e-07, + "loss": 1.2141, + "step": 2234 + }, + { + "epoch": 0.8810051736881005, + "grad_norm": 0.6090284700913406, + "learning_rate": 8.46346237434813e-07, + "loss": 1.1922, + "step": 2235 + }, + { + "epoch": 0.88139935944814, + "grad_norm": 0.6667233953406846, + "learning_rate": 8.408120512425e-07, + "loss": 1.267, + "step": 2236 + }, + { + "epoch": 0.8817935452081793, + "grad_norm": 0.6099197043950569, + "learning_rate": 8.352952240767453e-07, + "loss": 1.1661, + "step": 2237 + }, + { + "epoch": 0.8821877309682188, + "grad_norm": 0.6353214535694008, + "learning_rate": 8.297957663933609e-07, + "loss": 1.2521, + "step": 2238 + }, + { + "epoch": 0.8825819167282581, + "grad_norm": 0.5822802452492017, + "learning_rate": 8.243136886152381e-07, + "loss": 1.1051, + "step": 2239 + }, + { + "epoch": 0.8829761024882976, + "grad_norm": 0.6024284924891233, + "learning_rate": 8.188490011323291e-07, + "loss": 1.1844, + "step": 2240 + }, + { + "epoch": 0.8833702882483371, + "grad_norm": 0.6218166801091192, + "learning_rate": 8.134017143016304e-07, + "loss": 1.2239, + "step": 2241 + }, + { + "epoch": 0.8837644740083764, + "grad_norm": 0.5982021682698988, + "learning_rate": 8.079718384471557e-07, + "loss": 1.1807, + "step": 2242 + }, + { + "epoch": 0.8841586597684159, + "grad_norm": 0.6167445078039492, + "learning_rate": 8.025593838599221e-07, + "loss": 1.1514, + "step": 2243 + }, + { + "epoch": 0.8845528455284553, + "grad_norm": 0.6267698758553212, + "learning_rate": 7.971643607979273e-07, + "loss": 1.1775, + "step": 2244 + }, + { + "epoch": 0.8849470312884947, + "grad_norm": 0.6007524051589882, + "learning_rate": 7.917867794861378e-07, + "loss": 1.1715, + "step": 2245 + }, + { + "epoch": 0.8853412170485341, + "grad_norm": 0.5867075125001983, + "learning_rate": 7.864266501164541e-07, + "loss": 1.142, + "step": 2246 + }, + { + "epoch": 0.8857354028085735, + "grad_norm": 0.6117682983819526, + "learning_rate": 7.810839828477101e-07, + "loss": 1.1969, + "step": 2247 + }, + { + "epoch": 0.886129588568613, + "grad_norm": 0.6205037469861255, + "learning_rate": 7.757587878056372e-07, + "loss": 1.2472, + "step": 2248 + }, + { + "epoch": 0.8865237743286524, + "grad_norm": 0.6737180765038134, + "learning_rate": 7.704510750828542e-07, + "loss": 1.2256, + "step": 2249 + }, + { + "epoch": 0.8869179600886918, + "grad_norm": 0.5977988152478557, + "learning_rate": 7.651608547388489e-07, + "loss": 1.2092, + "step": 2250 + }, + { + "epoch": 0.8873121458487312, + "grad_norm": 0.5870543672858427, + "learning_rate": 7.598881367999566e-07, + "loss": 1.1694, + "step": 2251 + }, + { + "epoch": 0.8877063316087707, + "grad_norm": 0.604325426385001, + "learning_rate": 7.546329312593382e-07, + "loss": 1.2068, + "step": 2252 + }, + { + "epoch": 0.88810051736881, + "grad_norm": 0.5858794646282535, + "learning_rate": 7.49395248076964e-07, + "loss": 1.1019, + "step": 2253 + }, + { + "epoch": 0.8884947031288495, + "grad_norm": 0.6284533960586269, + "learning_rate": 7.441750971795991e-07, + "loss": 1.1827, + "step": 2254 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.6073835508191624, + "learning_rate": 7.389724884607763e-07, + "loss": 1.1928, + "step": 2255 + }, + { + "epoch": 0.8892830746489283, + "grad_norm": 0.6052990860959455, + "learning_rate": 7.337874317807803e-07, + "loss": 1.1328, + "step": 2256 + }, + { + "epoch": 0.8896772604089678, + "grad_norm": 0.6426366735903185, + "learning_rate": 7.286199369666346e-07, + "loss": 1.184, + "step": 2257 + }, + { + "epoch": 0.8900714461690071, + "grad_norm": 0.6135398558082623, + "learning_rate": 7.234700138120776e-07, + "loss": 1.1567, + "step": 2258 + }, + { + "epoch": 0.8904656319290466, + "grad_norm": 0.611697930631017, + "learning_rate": 7.183376720775415e-07, + "loss": 1.1767, + "step": 2259 + }, + { + "epoch": 0.890859817689086, + "grad_norm": 0.6047260944980717, + "learning_rate": 7.13222921490142e-07, + "loss": 1.142, + "step": 2260 + }, + { + "epoch": 0.8912540034491254, + "grad_norm": 0.6137118230173922, + "learning_rate": 7.081257717436507e-07, + "loss": 1.2122, + "step": 2261 + }, + { + "epoch": 0.8916481892091648, + "grad_norm": 0.5862917774633897, + "learning_rate": 7.030462324984821e-07, + "loss": 1.1464, + "step": 2262 + }, + { + "epoch": 0.8920423749692042, + "grad_norm": 0.5985229585325247, + "learning_rate": 6.979843133816744e-07, + "loss": 1.1538, + "step": 2263 + }, + { + "epoch": 0.8924365607292437, + "grad_norm": 0.6215683066351476, + "learning_rate": 6.929400239868745e-07, + "loss": 1.2066, + "step": 2264 + }, + { + "epoch": 0.8928307464892831, + "grad_norm": 0.6090688114535339, + "learning_rate": 6.879133738743116e-07, + "loss": 1.1246, + "step": 2265 + }, + { + "epoch": 0.8932249322493225, + "grad_norm": 0.6182971626634737, + "learning_rate": 6.829043725707852e-07, + "loss": 1.1638, + "step": 2266 + }, + { + "epoch": 0.8936191180093619, + "grad_norm": 0.6028753226292936, + "learning_rate": 6.779130295696479e-07, + "loss": 1.15, + "step": 2267 + }, + { + "epoch": 0.8940133037694014, + "grad_norm": 0.6138311069551523, + "learning_rate": 6.729393543307838e-07, + "loss": 1.1561, + "step": 2268 + }, + { + "epoch": 0.8944074895294407, + "grad_norm": 0.5975576303249758, + "learning_rate": 6.679833562805882e-07, + "loss": 1.1286, + "step": 2269 + }, + { + "epoch": 0.8948016752894802, + "grad_norm": 0.6059495772680955, + "learning_rate": 6.630450448119618e-07, + "loss": 1.1959, + "step": 2270 + }, + { + "epoch": 0.8951958610495195, + "grad_norm": 0.6797523524629732, + "learning_rate": 6.581244292842792e-07, + "loss": 1.1897, + "step": 2271 + }, + { + "epoch": 0.895590046809559, + "grad_norm": 0.6011492268276885, + "learning_rate": 6.532215190233748e-07, + "loss": 1.1667, + "step": 2272 + }, + { + "epoch": 0.8959842325695985, + "grad_norm": 0.6084758528762907, + "learning_rate": 6.483363233215345e-07, + "loss": 1.1592, + "step": 2273 + }, + { + "epoch": 0.8963784183296378, + "grad_norm": 0.6140227903857725, + "learning_rate": 6.434688514374632e-07, + "loss": 1.1743, + "step": 2274 + }, + { + "epoch": 0.8967726040896773, + "grad_norm": 0.6351263385074363, + "learning_rate": 6.386191125962749e-07, + "loss": 1.1728, + "step": 2275 + }, + { + "epoch": 0.8971667898497167, + "grad_norm": 0.6076012012521917, + "learning_rate": 6.337871159894804e-07, + "loss": 1.1741, + "step": 2276 + }, + { + "epoch": 0.8975609756097561, + "grad_norm": 0.6197330976983585, + "learning_rate": 6.289728707749609e-07, + "loss": 1.1687, + "step": 2277 + }, + { + "epoch": 0.8979551613697955, + "grad_norm": 0.6046508571791555, + "learning_rate": 6.241763860769535e-07, + "loss": 1.1977, + "step": 2278 + }, + { + "epoch": 0.8983493471298349, + "grad_norm": 0.6131957231184164, + "learning_rate": 6.193976709860339e-07, + "loss": 1.2021, + "step": 2279 + }, + { + "epoch": 0.8987435328898744, + "grad_norm": 0.6067811323591332, + "learning_rate": 6.146367345591053e-07, + "loss": 1.1561, + "step": 2280 + }, + { + "epoch": 0.8991377186499138, + "grad_norm": 0.5867337322284969, + "learning_rate": 6.098935858193688e-07, + "loss": 1.1449, + "step": 2281 + }, + { + "epoch": 0.8995319044099532, + "grad_norm": 0.6098231620125031, + "learning_rate": 6.051682337563158e-07, + "loss": 1.1462, + "step": 2282 + }, + { + "epoch": 0.8999260901699926, + "grad_norm": 0.6230967996752504, + "learning_rate": 6.004606873257101e-07, + "loss": 1.1426, + "step": 2283 + }, + { + "epoch": 0.900320275930032, + "grad_norm": 0.5958004624605026, + "learning_rate": 5.957709554495683e-07, + "loss": 1.1797, + "step": 2284 + }, + { + "epoch": 0.9007144616900714, + "grad_norm": 0.6079824292132843, + "learning_rate": 5.910990470161416e-07, + "loss": 1.2281, + "step": 2285 + }, + { + "epoch": 0.9011086474501109, + "grad_norm": 0.5984385548123256, + "learning_rate": 5.864449708799059e-07, + "loss": 1.1619, + "step": 2286 + }, + { + "epoch": 0.9015028332101502, + "grad_norm": 0.591664056006518, + "learning_rate": 5.818087358615354e-07, + "loss": 1.139, + "step": 2287 + }, + { + "epoch": 0.9018970189701897, + "grad_norm": 0.6275372827109235, + "learning_rate": 5.771903507478915e-07, + "loss": 1.2364, + "step": 2288 + }, + { + "epoch": 0.9022912047302292, + "grad_norm": 0.5975540870267736, + "learning_rate": 5.725898242920092e-07, + "loss": 1.1527, + "step": 2289 + }, + { + "epoch": 0.9026853904902685, + "grad_norm": 0.6050375583531165, + "learning_rate": 5.680071652130736e-07, + "loss": 1.1666, + "step": 2290 + }, + { + "epoch": 0.903079576250308, + "grad_norm": 0.6259743502880166, + "learning_rate": 5.634423821964074e-07, + "loss": 1.2275, + "step": 2291 + }, + { + "epoch": 0.9034737620103473, + "grad_norm": 0.6231031649083622, + "learning_rate": 5.588954838934523e-07, + "loss": 1.1716, + "step": 2292 + }, + { + "epoch": 0.9038679477703868, + "grad_norm": 0.6216418043768527, + "learning_rate": 5.543664789217562e-07, + "loss": 1.1871, + "step": 2293 + }, + { + "epoch": 0.9042621335304262, + "grad_norm": 0.583945627934862, + "learning_rate": 5.498553758649516e-07, + "loss": 1.1614, + "step": 2294 + }, + { + "epoch": 0.9046563192904656, + "grad_norm": 0.5974644710894348, + "learning_rate": 5.45362183272743e-07, + "loss": 1.1295, + "step": 2295 + }, + { + "epoch": 0.9050505050505051, + "grad_norm": 0.579085452767809, + "learning_rate": 5.408869096608926e-07, + "loss": 1.1105, + "step": 2296 + }, + { + "epoch": 0.9054446908105445, + "grad_norm": 0.5929251833508978, + "learning_rate": 5.364295635112016e-07, + "loss": 1.1386, + "step": 2297 + }, + { + "epoch": 0.9058388765705839, + "grad_norm": 0.5974271999517115, + "learning_rate": 5.319901532714877e-07, + "loss": 1.142, + "step": 2298 + }, + { + "epoch": 0.9062330623306233, + "grad_norm": 0.6188389973115496, + "learning_rate": 5.27568687355583e-07, + "loss": 1.2045, + "step": 2299 + }, + { + "epoch": 0.9066272480906628, + "grad_norm": 0.6234466396061988, + "learning_rate": 5.231651741433063e-07, + "loss": 1.1656, + "step": 2300 + }, + { + "epoch": 0.9070214338507021, + "grad_norm": 0.6316349387146205, + "learning_rate": 5.187796219804508e-07, + "loss": 1.1759, + "step": 2301 + }, + { + "epoch": 0.9074156196107416, + "grad_norm": 0.6119904812276791, + "learning_rate": 5.144120391787732e-07, + "loss": 1.1648, + "step": 2302 + }, + { + "epoch": 0.9078098053707809, + "grad_norm": 0.5992707761677788, + "learning_rate": 5.100624340159676e-07, + "loss": 1.1705, + "step": 2303 + }, + { + "epoch": 0.9082039911308204, + "grad_norm": 0.6125355457119835, + "learning_rate": 5.057308147356632e-07, + "loss": 1.1878, + "step": 2304 + }, + { + "epoch": 0.9085981768908599, + "grad_norm": 0.5987001014690438, + "learning_rate": 5.014171895473929e-07, + "loss": 1.1728, + "step": 2305 + }, + { + "epoch": 0.9089923626508992, + "grad_norm": 0.6233596220905993, + "learning_rate": 4.971215666265939e-07, + "loss": 1.1682, + "step": 2306 + }, + { + "epoch": 0.9093865484109387, + "grad_norm": 0.6120680988346603, + "learning_rate": 4.928439541145802e-07, + "loss": 1.154, + "step": 2307 + }, + { + "epoch": 0.909780734170978, + "grad_norm": 0.6159172688282434, + "learning_rate": 4.885843601185291e-07, + "loss": 1.1545, + "step": 2308 + }, + { + "epoch": 0.9101749199310175, + "grad_norm": 0.6561541537105161, + "learning_rate": 4.843427927114752e-07, + "loss": 1.2581, + "step": 2309 + }, + { + "epoch": 0.9105691056910569, + "grad_norm": 0.6397314727277476, + "learning_rate": 4.801192599322835e-07, + "loss": 1.2649, + "step": 2310 + }, + { + "epoch": 0.9109632914510963, + "grad_norm": 0.5968063081167863, + "learning_rate": 4.759137697856364e-07, + "loss": 1.1411, + "step": 2311 + }, + { + "epoch": 0.9113574772111358, + "grad_norm": 0.6046846431473332, + "learning_rate": 4.717263302420283e-07, + "loss": 1.2202, + "step": 2312 + }, + { + "epoch": 0.9117516629711752, + "grad_norm": 0.6213044733495849, + "learning_rate": 4.675569492377363e-07, + "loss": 1.1844, + "step": 2313 + }, + { + "epoch": 0.9121458487312146, + "grad_norm": 0.6145028852257042, + "learning_rate": 4.634056346748117e-07, + "loss": 1.2235, + "step": 2314 + }, + { + "epoch": 0.912540034491254, + "grad_norm": 0.6041076227153636, + "learning_rate": 4.5927239442107306e-07, + "loss": 1.1794, + "step": 2315 + }, + { + "epoch": 0.9129342202512934, + "grad_norm": 0.5917377858853244, + "learning_rate": 4.551572363100731e-07, + "loss": 1.1421, + "step": 2316 + }, + { + "epoch": 0.9133284060113328, + "grad_norm": 0.57962701939227, + "learning_rate": 4.5106016814110197e-07, + "loss": 1.1574, + "step": 2317 + }, + { + "epoch": 0.9137225917713723, + "grad_norm": 0.6010271614392757, + "learning_rate": 4.469811976791605e-07, + "loss": 1.1287, + "step": 2318 + }, + { + "epoch": 0.9141167775314116, + "grad_norm": 0.6304038957433044, + "learning_rate": 4.429203326549525e-07, + "loss": 1.1971, + "step": 2319 + }, + { + "epoch": 0.9145109632914511, + "grad_norm": 0.6078465285882131, + "learning_rate": 4.3887758076486597e-07, + "loss": 1.175, + "step": 2320 + }, + { + "epoch": 0.9149051490514906, + "grad_norm": 0.6058022551406895, + "learning_rate": 4.3485294967095747e-07, + "loss": 1.1782, + "step": 2321 + }, + { + "epoch": 0.9152993348115299, + "grad_norm": 0.6222158541213707, + "learning_rate": 4.308464470009432e-07, + "loss": 1.2142, + "step": 2322 + }, + { + "epoch": 0.9156935205715694, + "grad_norm": 0.5967586046808354, + "learning_rate": 4.2685808034818366e-07, + "loss": 1.1787, + "step": 2323 + }, + { + "epoch": 0.9160877063316087, + "grad_norm": 0.6168581167404708, + "learning_rate": 4.228878572716588e-07, + "loss": 1.1771, + "step": 2324 + }, + { + "epoch": 0.9164818920916482, + "grad_norm": 0.6140349806295636, + "learning_rate": 4.189357852959708e-07, + "loss": 1.1865, + "step": 2325 + }, + { + "epoch": 0.9168760778516876, + "grad_norm": 0.616944566915736, + "learning_rate": 4.150018719113147e-07, + "loss": 1.0969, + "step": 2326 + }, + { + "epoch": 0.917270263611727, + "grad_norm": 0.6129659770559598, + "learning_rate": 4.110861245734721e-07, + "loss": 1.1765, + "step": 2327 + }, + { + "epoch": 0.9176644493717665, + "grad_norm": 0.6033445957652277, + "learning_rate": 4.0718855070379535e-07, + "loss": 1.2008, + "step": 2328 + }, + { + "epoch": 0.9180586351318059, + "grad_norm": 0.6190874106262034, + "learning_rate": 4.0330915768919454e-07, + "loss": 1.2122, + "step": 2329 + }, + { + "epoch": 0.9184528208918453, + "grad_norm": 0.6012965614913941, + "learning_rate": 3.9944795288212047e-07, + "loss": 1.1824, + "step": 2330 + }, + { + "epoch": 0.9188470066518847, + "grad_norm": 0.5999458716930699, + "learning_rate": 3.956049436005538e-07, + "loss": 1.1437, + "step": 2331 + }, + { + "epoch": 0.9192411924119241, + "grad_norm": 0.6010551580255399, + "learning_rate": 3.917801371279895e-07, + "loss": 1.1636, + "step": 2332 + }, + { + "epoch": 0.9196353781719635, + "grad_norm": 0.6265717559201462, + "learning_rate": 3.8797354071342443e-07, + "loss": 1.1524, + "step": 2333 + }, + { + "epoch": 0.920029563932003, + "grad_norm": 0.5933108670825852, + "learning_rate": 3.841851615713399e-07, + "loss": 1.1646, + "step": 2334 + }, + { + "epoch": 0.9204237496920423, + "grad_norm": 0.6057802305576383, + "learning_rate": 3.8041500688169253e-07, + "loss": 1.1538, + "step": 2335 + }, + { + "epoch": 0.9208179354520818, + "grad_norm": 0.6237793034270526, + "learning_rate": 3.766630837899032e-07, + "loss": 1.1886, + "step": 2336 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.6198812448884538, + "learning_rate": 3.729293994068306e-07, + "loss": 1.1955, + "step": 2337 + }, + { + "epoch": 0.9216063069721606, + "grad_norm": 0.6247300075084717, + "learning_rate": 3.6921396080877414e-07, + "loss": 1.2292, + "step": 2338 + }, + { + "epoch": 0.9220004927322001, + "grad_norm": 0.6062053891469021, + "learning_rate": 3.6551677503744776e-07, + "loss": 1.1789, + "step": 2339 + }, + { + "epoch": 0.9223946784922394, + "grad_norm": 0.6105135332217473, + "learning_rate": 3.618378490999719e-07, + "loss": 1.1439, + "step": 2340 + }, + { + "epoch": 0.9227888642522789, + "grad_norm": 0.5768948920273077, + "learning_rate": 3.581771899688646e-07, + "loss": 1.1398, + "step": 2341 + }, + { + "epoch": 0.9231830500123183, + "grad_norm": 0.6233702760949931, + "learning_rate": 3.545348045820174e-07, + "loss": 1.2338, + "step": 2342 + }, + { + "epoch": 0.9235772357723577, + "grad_norm": 0.6293178839378355, + "learning_rate": 3.5091069984269366e-07, + "loss": 1.284, + "step": 2343 + }, + { + "epoch": 0.9239714215323972, + "grad_norm": 0.6012639840259887, + "learning_rate": 3.473048826195058e-07, + "loss": 1.1688, + "step": 2344 + }, + { + "epoch": 0.9243656072924366, + "grad_norm": 0.6260153598558462, + "learning_rate": 3.4371735974641053e-07, + "loss": 1.2185, + "step": 2345 + }, + { + "epoch": 0.924759793052476, + "grad_norm": 0.6268091346400951, + "learning_rate": 3.40148138022689e-07, + "loss": 1.2058, + "step": 2346 + }, + { + "epoch": 0.9251539788125154, + "grad_norm": 0.6019494923660028, + "learning_rate": 3.365972242129378e-07, + "loss": 1.1248, + "step": 2347 + }, + { + "epoch": 0.9255481645725548, + "grad_norm": 0.6127790785927769, + "learning_rate": 3.3306462504705706e-07, + "loss": 1.1704, + "step": 2348 + }, + { + "epoch": 0.9259423503325942, + "grad_norm": 0.6434642793289438, + "learning_rate": 3.2955034722023214e-07, + "loss": 1.1639, + "step": 2349 + }, + { + "epoch": 0.9263365360926337, + "grad_norm": 0.6160741690407769, + "learning_rate": 3.2605439739292863e-07, + "loss": 1.181, + "step": 2350 + }, + { + "epoch": 0.926730721852673, + "grad_norm": 0.6040626337033564, + "learning_rate": 3.2257678219087543e-07, + "loss": 1.1359, + "step": 2351 + }, + { + "epoch": 0.9271249076127125, + "grad_norm": 0.6569697201075794, + "learning_rate": 3.191175082050502e-07, + "loss": 1.1525, + "step": 2352 + }, + { + "epoch": 0.927519093372752, + "grad_norm": 0.600173226578904, + "learning_rate": 3.156765819916696e-07, + "loss": 1.1436, + "step": 2353 + }, + { + "epoch": 0.9279132791327913, + "grad_norm": 0.5975529599005833, + "learning_rate": 3.122540100721794e-07, + "loss": 1.1291, + "step": 2354 + }, + { + "epoch": 0.9283074648928308, + "grad_norm": 0.6182461879570678, + "learning_rate": 3.088497989332351e-07, + "loss": 1.1686, + "step": 2355 + }, + { + "epoch": 0.9287016506528701, + "grad_norm": 0.6027606375147575, + "learning_rate": 3.05463955026698e-07, + "loss": 1.132, + "step": 2356 + }, + { + "epoch": 0.9290958364129096, + "grad_norm": 0.6211823263235605, + "learning_rate": 3.020964847696151e-07, + "loss": 1.2116, + "step": 2357 + }, + { + "epoch": 0.929490022172949, + "grad_norm": 0.8055569064292696, + "learning_rate": 2.987473945442143e-07, + "loss": 1.1802, + "step": 2358 + }, + { + "epoch": 0.9298842079329884, + "grad_norm": 0.63319663534154, + "learning_rate": 2.9541669069788505e-07, + "loss": 1.1735, + "step": 2359 + }, + { + "epoch": 0.9302783936930279, + "grad_norm": 0.6092240457871959, + "learning_rate": 2.9210437954316997e-07, + "loss": 1.1769, + "step": 2360 + }, + { + "epoch": 0.9306725794530673, + "grad_norm": 0.5994634449672462, + "learning_rate": 2.888104673577574e-07, + "loss": 1.1217, + "step": 2361 + }, + { + "epoch": 0.9310667652131067, + "grad_norm": 0.6129161824755393, + "learning_rate": 2.8553496038445707e-07, + "loss": 1.1949, + "step": 2362 + }, + { + "epoch": 0.9314609509731461, + "grad_norm": 0.5946581674891636, + "learning_rate": 2.8227786483120523e-07, + "loss": 1.1596, + "step": 2363 + }, + { + "epoch": 0.9318551367331855, + "grad_norm": 0.6220408417857064, + "learning_rate": 2.790391868710374e-07, + "loss": 1.1697, + "step": 2364 + }, + { + "epoch": 0.9322493224932249, + "grad_norm": 0.611301302747428, + "learning_rate": 2.7581893264208346e-07, + "loss": 1.1655, + "step": 2365 + }, + { + "epoch": 0.9326435082532644, + "grad_norm": 0.6303361299231326, + "learning_rate": 2.7261710824755814e-07, + "loss": 1.1762, + "step": 2366 + }, + { + "epoch": 0.9330376940133037, + "grad_norm": 0.613809194427214, + "learning_rate": 2.694337197557462e-07, + "loss": 1.217, + "step": 2367 + }, + { + "epoch": 0.9334318797733432, + "grad_norm": 0.5947788641950997, + "learning_rate": 2.66268773199988e-07, + "loss": 1.2082, + "step": 2368 + }, + { + "epoch": 0.9338260655333827, + "grad_norm": 0.6342184771933248, + "learning_rate": 2.631222745786788e-07, + "loss": 1.2426, + "step": 2369 + }, + { + "epoch": 0.934220251293422, + "grad_norm": 0.6238792637987063, + "learning_rate": 2.5999422985524157e-07, + "loss": 1.2235, + "step": 2370 + }, + { + "epoch": 0.9346144370534615, + "grad_norm": 0.6601808628731608, + "learning_rate": 2.5688464495813304e-07, + "loss": 1.2687, + "step": 2371 + }, + { + "epoch": 0.9350086228135008, + "grad_norm": 0.591780101499758, + "learning_rate": 2.537935257808177e-07, + "loss": 1.1727, + "step": 2372 + }, + { + "epoch": 0.9354028085735403, + "grad_norm": 0.6004908722208354, + "learning_rate": 2.507208781817638e-07, + "loss": 1.1644, + "step": 2373 + }, + { + "epoch": 0.9357969943335797, + "grad_norm": 0.6213717940339839, + "learning_rate": 2.4766670798443414e-07, + "loss": 1.1808, + "step": 2374 + }, + { + "epoch": 0.9361911800936191, + "grad_norm": 0.6088482849843166, + "learning_rate": 2.4463102097726843e-07, + "loss": 1.1679, + "step": 2375 + }, + { + "epoch": 0.9365853658536586, + "grad_norm": 0.5797582430181196, + "learning_rate": 2.4161382291367776e-07, + "loss": 1.1257, + "step": 2376 + }, + { + "epoch": 0.936979551613698, + "grad_norm": 0.619020334020193, + "learning_rate": 2.386151195120323e-07, + "loss": 1.1419, + "step": 2377 + }, + { + "epoch": 0.9373737373737374, + "grad_norm": 0.5946052196409608, + "learning_rate": 2.356349164556493e-07, + "loss": 1.1304, + "step": 2378 + }, + { + "epoch": 0.9377679231337768, + "grad_norm": 0.6091945754012382, + "learning_rate": 2.3267321939278277e-07, + "loss": 1.2201, + "step": 2379 + }, + { + "epoch": 0.9381621088938162, + "grad_norm": 0.6170932843567667, + "learning_rate": 2.2973003393661374e-07, + "loss": 1.2362, + "step": 2380 + }, + { + "epoch": 0.9385562946538556, + "grad_norm": 0.6012825687735323, + "learning_rate": 2.2680536566523802e-07, + "loss": 1.15, + "step": 2381 + }, + { + "epoch": 0.9389504804138951, + "grad_norm": 0.6255938234171833, + "learning_rate": 2.2389922012165944e-07, + "loss": 1.2223, + "step": 2382 + }, + { + "epoch": 0.9393446661739344, + "grad_norm": 0.5876733837374598, + "learning_rate": 2.2101160281377098e-07, + "loss": 1.141, + "step": 2383 + }, + { + "epoch": 0.9397388519339739, + "grad_norm": 0.6146179783064085, + "learning_rate": 2.1814251921435603e-07, + "loss": 1.1977, + "step": 2384 + }, + { + "epoch": 0.9401330376940134, + "grad_norm": 0.5988256998213285, + "learning_rate": 2.1529197476106821e-07, + "loss": 1.1755, + "step": 2385 + }, + { + "epoch": 0.9405272234540527, + "grad_norm": 0.619835334128145, + "learning_rate": 2.124599748564249e-07, + "loss": 1.1283, + "step": 2386 + }, + { + "epoch": 0.9409214092140922, + "grad_norm": 0.598162178135982, + "learning_rate": 2.0964652486779814e-07, + "loss": 1.1926, + "step": 2387 + }, + { + "epoch": 0.9413155949741315, + "grad_norm": 0.613362224923904, + "learning_rate": 2.0685163012740039e-07, + "loss": 1.1947, + "step": 2388 + }, + { + "epoch": 0.941709780734171, + "grad_norm": 0.5975727904035542, + "learning_rate": 2.0407529593228114e-07, + "loss": 1.1629, + "step": 2389 + }, + { + "epoch": 0.9421039664942104, + "grad_norm": 0.6139860108767166, + "learning_rate": 2.013175275443102e-07, + "loss": 1.2471, + "step": 2390 + }, + { + "epoch": 0.9424981522542498, + "grad_norm": 0.585425153613225, + "learning_rate": 1.9857833019017004e-07, + "loss": 1.0983, + "step": 2391 + }, + { + "epoch": 0.9428923380142893, + "grad_norm": 0.6118000826090201, + "learning_rate": 1.9585770906134671e-07, + "loss": 1.1331, + "step": 2392 + }, + { + "epoch": 0.9432865237743286, + "grad_norm": 0.5921590656780138, + "learning_rate": 1.9315566931412233e-07, + "loss": 1.1126, + "step": 2393 + }, + { + "epoch": 0.9436807095343681, + "grad_norm": 0.6165903484277372, + "learning_rate": 1.9047221606955713e-07, + "loss": 1.198, + "step": 2394 + }, + { + "epoch": 0.9440748952944075, + "grad_norm": 0.6368352242306206, + "learning_rate": 1.8780735441348842e-07, + "loss": 1.2699, + "step": 2395 + }, + { + "epoch": 0.9444690810544469, + "grad_norm": 0.6099076721349784, + "learning_rate": 1.8516108939651945e-07, + "loss": 1.2367, + "step": 2396 + }, + { + "epoch": 0.9448632668144863, + "grad_norm": 0.6085928656086841, + "learning_rate": 1.8253342603400503e-07, + "loss": 1.1395, + "step": 2397 + }, + { + "epoch": 0.9452574525745258, + "grad_norm": 0.6174687470746002, + "learning_rate": 1.7992436930604484e-07, + "loss": 1.1651, + "step": 2398 + }, + { + "epoch": 0.9456516383345651, + "grad_norm": 0.6129685190288655, + "learning_rate": 1.7733392415747452e-07, + "loss": 1.1806, + "step": 2399 + }, + { + "epoch": 0.9460458240946046, + "grad_norm": 0.5836621907525494, + "learning_rate": 1.7476209549785906e-07, + "loss": 1.1498, + "step": 2400 + }, + { + "epoch": 0.946440009854644, + "grad_norm": 0.5996938824902894, + "learning_rate": 1.7220888820147607e-07, + "loss": 1.1156, + "step": 2401 + }, + { + "epoch": 0.9468341956146834, + "grad_norm": 0.6162536454834876, + "learning_rate": 1.6967430710731258e-07, + "loss": 1.1963, + "step": 2402 + }, + { + "epoch": 0.9472283813747229, + "grad_norm": 0.6280127586386618, + "learning_rate": 1.6715835701905604e-07, + "loss": 1.2415, + "step": 2403 + }, + { + "epoch": 0.9476225671347622, + "grad_norm": 0.6202334141414314, + "learning_rate": 1.6466104270508099e-07, + "loss": 1.1966, + "step": 2404 + }, + { + "epoch": 0.9480167528948017, + "grad_norm": 0.6122489081297163, + "learning_rate": 1.6218236889844142e-07, + "loss": 1.1671, + "step": 2405 + }, + { + "epoch": 0.948410938654841, + "grad_norm": 0.6035232347033065, + "learning_rate": 1.5972234029686617e-07, + "loss": 1.0962, + "step": 2406 + }, + { + "epoch": 0.9488051244148805, + "grad_norm": 0.6496961489577563, + "learning_rate": 1.5728096156274353e-07, + "loss": 1.2318, + "step": 2407 + }, + { + "epoch": 0.94919931017492, + "grad_norm": 0.6147346192870907, + "learning_rate": 1.5485823732311777e-07, + "loss": 1.0982, + "step": 2408 + }, + { + "epoch": 0.9495934959349593, + "grad_norm": 0.6303713451636969, + "learning_rate": 1.5245417216967596e-07, + "loss": 1.2279, + "step": 2409 + }, + { + "epoch": 0.9499876816949988, + "grad_norm": 0.5889090939067558, + "learning_rate": 1.5006877065874338e-07, + "loss": 1.169, + "step": 2410 + }, + { + "epoch": 0.9503818674550382, + "grad_norm": 0.6019171279270943, + "learning_rate": 1.477020373112714e-07, + "loss": 1.1254, + "step": 2411 + }, + { + "epoch": 0.9507760532150776, + "grad_norm": 0.6157755932202649, + "learning_rate": 1.4535397661283092e-07, + "loss": 1.1132, + "step": 2412 + }, + { + "epoch": 0.951170238975117, + "grad_norm": 0.6132084756622929, + "learning_rate": 1.4302459301360428e-07, + "loss": 1.1932, + "step": 2413 + }, + { + "epoch": 0.9515644247351565, + "grad_norm": 0.6249158834646313, + "learning_rate": 1.4071389092837339e-07, + "loss": 1.2299, + "step": 2414 + }, + { + "epoch": 0.9519586104951958, + "grad_norm": 0.6183091225952251, + "learning_rate": 1.3842187473651626e-07, + "loss": 1.1556, + "step": 2415 + }, + { + "epoch": 0.9523527962552353, + "grad_norm": 0.5918073875966923, + "learning_rate": 1.3614854878199578e-07, + "loss": 1.1273, + "step": 2416 + }, + { + "epoch": 0.9527469820152747, + "grad_norm": 0.5982357040080991, + "learning_rate": 1.3389391737335112e-07, + "loss": 1.1114, + "step": 2417 + }, + { + "epoch": 0.9531411677753141, + "grad_norm": 0.5883507787023478, + "learning_rate": 1.3165798478369184e-07, + "loss": 1.1184, + "step": 2418 + }, + { + "epoch": 0.9535353535353536, + "grad_norm": 0.6182981301693431, + "learning_rate": 1.2944075525068712e-07, + "loss": 1.1803, + "step": 2419 + }, + { + "epoch": 0.9539295392953929, + "grad_norm": 0.6185455523897264, + "learning_rate": 1.272422329765588e-07, + "loss": 1.1795, + "step": 2420 + }, + { + "epoch": 0.9543237250554324, + "grad_norm": 0.6220883345091087, + "learning_rate": 1.2506242212807607e-07, + "loss": 1.2235, + "step": 2421 + }, + { + "epoch": 0.9547179108154717, + "grad_norm": 0.6098949505020008, + "learning_rate": 1.2290132683654087e-07, + "loss": 1.1566, + "step": 2422 + }, + { + "epoch": 0.9551120965755112, + "grad_norm": 0.6015695706886922, + "learning_rate": 1.2075895119779025e-07, + "loss": 1.1703, + "step": 2423 + }, + { + "epoch": 0.9555062823355507, + "grad_norm": 0.6332300803609152, + "learning_rate": 1.1863529927217731e-07, + "loss": 1.1943, + "step": 2424 + }, + { + "epoch": 0.95590046809559, + "grad_norm": 0.612260563852357, + "learning_rate": 1.1653037508457032e-07, + "loss": 1.1732, + "step": 2425 + }, + { + "epoch": 0.9562946538556295, + "grad_norm": 0.5999781512649874, + "learning_rate": 1.1444418262434587e-07, + "loss": 1.1752, + "step": 2426 + }, + { + "epoch": 0.9566888396156689, + "grad_norm": 0.6008667456915643, + "learning_rate": 1.1237672584537673e-07, + "loss": 1.1495, + "step": 2427 + }, + { + "epoch": 0.9570830253757083, + "grad_norm": 0.6153244050308969, + "learning_rate": 1.1032800866602633e-07, + "loss": 1.1937, + "step": 2428 + }, + { + "epoch": 0.9574772111357477, + "grad_norm": 0.5959829809552201, + "learning_rate": 1.0829803496914537e-07, + "loss": 1.1581, + "step": 2429 + }, + { + "epoch": 0.9578713968957872, + "grad_norm": 0.6077619966859046, + "learning_rate": 1.062868086020552e-07, + "loss": 1.1725, + "step": 2430 + }, + { + "epoch": 0.9582655826558265, + "grad_norm": 0.6047743581903363, + "learning_rate": 1.0429433337655115e-07, + "loss": 1.1331, + "step": 2431 + }, + { + "epoch": 0.958659768415866, + "grad_norm": 0.6201599918518463, + "learning_rate": 1.0232061306888918e-07, + "loss": 1.1858, + "step": 2432 + }, + { + "epoch": 0.9590539541759054, + "grad_norm": 0.6231710616869747, + "learning_rate": 1.0036565141977594e-07, + "loss": 1.2016, + "step": 2433 + }, + { + "epoch": 0.9594481399359448, + "grad_norm": 0.6448288343953715, + "learning_rate": 9.842945213437094e-08, + "loss": 1.2158, + "step": 2434 + }, + { + "epoch": 0.9598423256959843, + "grad_norm": 0.6167891303410092, + "learning_rate": 9.651201888227102e-08, + "loss": 1.1559, + "step": 2435 + }, + { + "epoch": 0.9602365114560236, + "grad_norm": 0.6038868590043498, + "learning_rate": 9.461335529750815e-08, + "loss": 1.1601, + "step": 2436 + }, + { + "epoch": 0.9606306972160631, + "grad_norm": 0.6077888775853522, + "learning_rate": 9.273346497854052e-08, + "loss": 1.1977, + "step": 2437 + }, + { + "epoch": 0.9610248829761024, + "grad_norm": 0.603082429453148, + "learning_rate": 9.08723514882437e-08, + "loss": 1.1205, + "step": 2438 + }, + { + "epoch": 0.9614190687361419, + "grad_norm": 0.6010255915248192, + "learning_rate": 8.903001835390946e-08, + "loss": 1.1565, + "step": 2439 + }, + { + "epoch": 0.9618132544961814, + "grad_norm": 0.5911163710697771, + "learning_rate": 8.720646906723585e-08, + "loss": 1.1529, + "step": 2440 + }, + { + "epoch": 0.9622074402562207, + "grad_norm": 0.6227655050280417, + "learning_rate": 8.540170708431716e-08, + "loss": 1.2165, + "step": 2441 + }, + { + "epoch": 0.9626016260162602, + "grad_norm": 0.626494521422824, + "learning_rate": 8.36157358256473e-08, + "loss": 1.2108, + "step": 2442 + }, + { + "epoch": 0.9629958117762996, + "grad_norm": 0.5903062085449574, + "learning_rate": 8.184855867609976e-08, + "loss": 1.1558, + "step": 2443 + }, + { + "epoch": 0.963389997536339, + "grad_norm": 0.6107447987815348, + "learning_rate": 8.010017898493316e-08, + "loss": 1.159, + "step": 2444 + }, + { + "epoch": 0.9637841832963784, + "grad_norm": 0.608930442078416, + "learning_rate": 7.837060006577801e-08, + "loss": 1.1968, + "step": 2445 + }, + { + "epoch": 0.9641783690564178, + "grad_norm": 0.594295975968586, + "learning_rate": 7.665982519663329e-08, + "loss": 1.1405, + "step": 2446 + }, + { + "epoch": 0.9645725548164572, + "grad_norm": 0.5973153401367114, + "learning_rate": 7.49678576198587e-08, + "loss": 1.1439, + "step": 2447 + }, + { + "epoch": 0.9649667405764967, + "grad_norm": 0.5985621492583797, + "learning_rate": 7.329470054217024e-08, + "loss": 1.1717, + "step": 2448 + }, + { + "epoch": 0.9653609263365361, + "grad_norm": 0.602845907873701, + "learning_rate": 7.164035713463358e-08, + "loss": 1.1579, + "step": 2449 + }, + { + "epoch": 0.9657551120965755, + "grad_norm": 0.6205834350913317, + "learning_rate": 7.000483053265506e-08, + "loss": 1.2058, + "step": 2450 + }, + { + "epoch": 0.966149297856615, + "grad_norm": 0.6363339379587928, + "learning_rate": 6.838812383597959e-08, + "loss": 1.2335, + "step": 2451 + }, + { + "epoch": 0.9665434836166543, + "grad_norm": 0.6717079440212176, + "learning_rate": 6.679024010868617e-08, + "loss": 1.1835, + "step": 2452 + }, + { + "epoch": 0.9669376693766938, + "grad_norm": 0.6013068431470037, + "learning_rate": 6.521118237917456e-08, + "loss": 1.1285, + "step": 2453 + }, + { + "epoch": 0.9673318551367331, + "grad_norm": 0.5951721146532576, + "learning_rate": 6.365095364016971e-08, + "loss": 1.1539, + "step": 2454 + }, + { + "epoch": 0.9677260408967726, + "grad_norm": 0.6658577073295611, + "learning_rate": 6.210955684870512e-08, + "loss": 1.2482, + "step": 2455 + }, + { + "epoch": 0.9681202266568121, + "grad_norm": 0.6300768133578355, + "learning_rate": 6.058699492612841e-08, + "loss": 1.2359, + "step": 2456 + }, + { + "epoch": 0.9685144124168514, + "grad_norm": 0.6082556264479969, + "learning_rate": 5.9083270758085733e-08, + "loss": 1.1134, + "step": 2457 + }, + { + "epoch": 0.9689085981768909, + "grad_norm": 0.6185300650907809, + "learning_rate": 5.759838719452404e-08, + "loss": 1.2206, + "step": 2458 + }, + { + "epoch": 0.9693027839369303, + "grad_norm": 0.6117970900606814, + "learning_rate": 5.6132347049679955e-08, + "loss": 1.1647, + "step": 2459 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.5976874867227856, + "learning_rate": 5.468515310207867e-08, + "loss": 1.1589, + "step": 2460 + }, + { + "epoch": 0.9700911554570091, + "grad_norm": 0.6304288708508361, + "learning_rate": 5.3256808094527266e-08, + "loss": 1.1898, + "step": 2461 + }, + { + "epoch": 0.9704853412170485, + "grad_norm": 0.6311672116169154, + "learning_rate": 5.184731473410698e-08, + "loss": 1.1659, + "step": 2462 + }, + { + "epoch": 0.9708795269770879, + "grad_norm": 0.58587149930154, + "learning_rate": 5.045667569217316e-08, + "loss": 1.1655, + "step": 2463 + }, + { + "epoch": 0.9712737127371274, + "grad_norm": 0.6010861112474221, + "learning_rate": 4.9084893604344205e-08, + "loss": 1.1392, + "step": 2464 + }, + { + "epoch": 0.9716678984971668, + "grad_norm": 0.6136708610607174, + "learning_rate": 4.7731971070503754e-08, + "loss": 1.1839, + "step": 2465 + }, + { + "epoch": 0.9720620842572062, + "grad_norm": 0.5941054001607767, + "learning_rate": 4.639791065478738e-08, + "loss": 1.1675, + "step": 2466 + }, + { + "epoch": 0.9724562700172457, + "grad_norm": 0.6082606108108427, + "learning_rate": 4.508271488558369e-08, + "loss": 1.1678, + "step": 2467 + }, + { + "epoch": 0.972850455777285, + "grad_norm": 0.63694043332642, + "learning_rate": 4.3786386255531e-08, + "loss": 1.2357, + "step": 2468 + }, + { + "epoch": 0.9732446415373245, + "grad_norm": 0.6218499921470892, + "learning_rate": 4.250892722150401e-08, + "loss": 1.1817, + "step": 2469 + }, + { + "epoch": 0.9736388272973638, + "grad_norm": 0.618351384803128, + "learning_rate": 4.1250340204619375e-08, + "loss": 1.1498, + "step": 2470 + }, + { + "epoch": 0.9740330130574033, + "grad_norm": 0.6221821265806511, + "learning_rate": 4.001062759022456e-08, + "loss": 1.1812, + "step": 2471 + }, + { + "epoch": 0.9744271988174428, + "grad_norm": 0.6350605796642136, + "learning_rate": 3.878979172789454e-08, + "loss": 1.2148, + "step": 2472 + }, + { + "epoch": 0.9748213845774821, + "grad_norm": 0.6203025166705224, + "learning_rate": 3.758783493142737e-08, + "loss": 1.1737, + "step": 2473 + }, + { + "epoch": 0.9752155703375216, + "grad_norm": 0.6008544551965036, + "learning_rate": 3.640475947884303e-08, + "loss": 1.1266, + "step": 2474 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.6113341557887032, + "learning_rate": 3.5240567612375706e-08, + "loss": 1.2014, + "step": 2475 + }, + { + "epoch": 0.9760039418576004, + "grad_norm": 0.603617063644902, + "learning_rate": 3.4095261538468204e-08, + "loss": 1.166, + "step": 2476 + }, + { + "epoch": 0.9763981276176398, + "grad_norm": 0.6271623067160851, + "learning_rate": 3.2968843427770844e-08, + "loss": 1.201, + "step": 2477 + }, + { + "epoch": 0.9767923133776792, + "grad_norm": 0.5896479252918767, + "learning_rate": 3.186131541513926e-08, + "loss": 1.1689, + "step": 2478 + }, + { + "epoch": 0.9771864991377186, + "grad_norm": 0.6139597394243195, + "learning_rate": 3.0772679599623266e-08, + "loss": 1.1962, + "step": 2479 + }, + { + "epoch": 0.9775806848977581, + "grad_norm": 0.6298030226727921, + "learning_rate": 2.9702938044468e-08, + "loss": 1.1874, + "step": 2480 + }, + { + "epoch": 0.9779748706577975, + "grad_norm": 0.5794413704040846, + "learning_rate": 2.865209277711167e-08, + "loss": 1.1074, + "step": 2481 + }, + { + "epoch": 0.9783690564178369, + "grad_norm": 0.5885716516364036, + "learning_rate": 2.7620145789177823e-08, + "loss": 1.125, + "step": 2482 + }, + { + "epoch": 0.9787632421778764, + "grad_norm": 0.6320208790946613, + "learning_rate": 2.6607099036470853e-08, + "loss": 1.2337, + "step": 2483 + }, + { + "epoch": 0.9791574279379157, + "grad_norm": 0.6070406774043791, + "learning_rate": 2.5612954438977155e-08, + "loss": 1.1309, + "step": 2484 + }, + { + "epoch": 0.9795516136979552, + "grad_norm": 0.6061624110898025, + "learning_rate": 2.463771388085623e-08, + "loss": 1.161, + "step": 2485 + }, + { + "epoch": 0.9799457994579945, + "grad_norm": 0.6181129393801446, + "learning_rate": 2.368137921044289e-08, + "loss": 1.152, + "step": 2486 + }, + { + "epoch": 0.980339985218034, + "grad_norm": 0.6053023110866588, + "learning_rate": 2.274395224023618e-08, + "loss": 1.2039, + "step": 2487 + }, + { + "epoch": 0.9807341709780735, + "grad_norm": 0.611443540064316, + "learning_rate": 2.1825434746903794e-08, + "loss": 1.2308, + "step": 2488 + }, + { + "epoch": 0.9811283567381128, + "grad_norm": 0.5983940583235254, + "learning_rate": 2.0925828471272115e-08, + "loss": 1.1492, + "step": 2489 + }, + { + "epoch": 0.9815225424981523, + "grad_norm": 0.6070581145638013, + "learning_rate": 2.0045135118328397e-08, + "loss": 1.1946, + "step": 2490 + }, + { + "epoch": 0.9819167282581917, + "grad_norm": 0.6080141003498726, + "learning_rate": 1.9183356357215242e-08, + "loss": 1.1755, + "step": 2491 + }, + { + "epoch": 0.9823109140182311, + "grad_norm": 0.6183949984566449, + "learning_rate": 1.8340493821222827e-08, + "loss": 1.234, + "step": 2492 + }, + { + "epoch": 0.9827050997782705, + "grad_norm": 0.6158791546765815, + "learning_rate": 1.7516549107795543e-08, + "loss": 1.1807, + "step": 2493 + }, + { + "epoch": 0.9830992855383099, + "grad_norm": 0.6008031176354653, + "learning_rate": 1.671152377852092e-08, + "loss": 1.1555, + "step": 2494 + }, + { + "epoch": 0.9834934712983493, + "grad_norm": 0.6243823889960919, + "learning_rate": 1.5925419359130723e-08, + "loss": 1.1506, + "step": 2495 + }, + { + "epoch": 0.9838876570583888, + "grad_norm": 0.6092824290673818, + "learning_rate": 1.5158237339494285e-08, + "loss": 1.1245, + "step": 2496 + }, + { + "epoch": 0.9842818428184282, + "grad_norm": 0.6173876535957193, + "learning_rate": 1.4409979173620747e-08, + "loss": 1.1329, + "step": 2497 + }, + { + "epoch": 0.9846760285784676, + "grad_norm": 0.6081775209074783, + "learning_rate": 1.3680646279651266e-08, + "loss": 1.1479, + "step": 2498 + }, + { + "epoch": 0.985070214338507, + "grad_norm": 0.6202693029416111, + "learning_rate": 1.2970240039861248e-08, + "loss": 1.2072, + "step": 2499 + }, + { + "epoch": 0.9854644000985464, + "grad_norm": 0.6008870570624699, + "learning_rate": 1.2278761800653682e-08, + "loss": 1.1418, + "step": 2500 + }, + { + "epoch": 0.9858585858585859, + "grad_norm": 0.624028333998548, + "learning_rate": 1.1606212872559142e-08, + "loss": 1.2152, + "step": 2501 + }, + { + "epoch": 0.9862527716186252, + "grad_norm": 0.6239253652188765, + "learning_rate": 1.0952594530230232e-08, + "loss": 1.2422, + "step": 2502 + }, + { + "epoch": 0.9866469573786647, + "grad_norm": 0.6066337975290457, + "learning_rate": 1.0317908012442701e-08, + "loss": 1.1602, + "step": 2503 + }, + { + "epoch": 0.9870411431387042, + "grad_norm": 0.6377500814670377, + "learning_rate": 9.702154522092111e-09, + "loss": 1.2192, + "step": 2504 + }, + { + "epoch": 0.9874353288987435, + "grad_norm": 0.5987907515887436, + "learning_rate": 9.105335226190504e-09, + "loss": 1.1616, + "step": 2505 + }, + { + "epoch": 0.987829514658783, + "grad_norm": 0.6172014036158203, + "learning_rate": 8.527451255863073e-09, + "loss": 1.225, + "step": 2506 + }, + { + "epoch": 0.9882237004188223, + "grad_norm": 0.6077694286293223, + "learning_rate": 7.968503706350384e-09, + "loss": 1.1987, + "step": 2507 + }, + { + "epoch": 0.9886178861788618, + "grad_norm": 0.6138556064349517, + "learning_rate": 7.42849363700282e-09, + "loss": 1.1483, + "step": 2508 + }, + { + "epoch": 0.9890120719389012, + "grad_norm": 0.6120940708596503, + "learning_rate": 6.907422071278369e-09, + "loss": 1.1581, + "step": 2509 + }, + { + "epoch": 0.9894062576989406, + "grad_norm": 0.5962048270770236, + "learning_rate": 6.405289996741504e-09, + "loss": 1.1662, + "step": 2510 + }, + { + "epoch": 0.98980044345898, + "grad_norm": 0.6184599584147658, + "learning_rate": 5.922098365063189e-09, + "loss": 1.1495, + "step": 2511 + }, + { + "epoch": 0.9901946292190195, + "grad_norm": 0.6296776196488952, + "learning_rate": 5.457848092015328e-09, + "loss": 1.1905, + "step": 2512 + }, + { + "epoch": 0.9905888149790589, + "grad_norm": 0.6131588421344288, + "learning_rate": 5.012540057474091e-09, + "loss": 1.1818, + "step": 2513 + }, + { + "epoch": 0.9909830007390983, + "grad_norm": 0.5964517876857598, + "learning_rate": 4.586175105411039e-09, + "loss": 1.1824, + "step": 2514 + }, + { + "epoch": 0.9913771864991378, + "grad_norm": 0.6158891574168905, + "learning_rate": 4.178754043898669e-09, + "loss": 1.1601, + "step": 2515 + }, + { + "epoch": 0.9917713722591771, + "grad_norm": 0.6048009237523553, + "learning_rate": 3.790277645104867e-09, + "loss": 1.1299, + "step": 2516 + }, + { + "epoch": 0.9921655580192166, + "grad_norm": 0.6238556971612192, + "learning_rate": 3.420746645292905e-09, + "loss": 1.1244, + "step": 2517 + }, + { + "epoch": 0.9925597437792559, + "grad_norm": 0.6155143754125697, + "learning_rate": 3.0701617448203325e-09, + "loss": 1.1856, + "step": 2518 + }, + { + "epoch": 0.9929539295392954, + "grad_norm": 0.6156379383507039, + "learning_rate": 2.738523608135646e-09, + "loss": 1.1921, + "step": 2519 + }, + { + "epoch": 0.9933481152993349, + "grad_norm": 0.6287557362309201, + "learning_rate": 2.4258328637771776e-09, + "loss": 1.1696, + "step": 2520 + }, + { + "epoch": 0.9937423010593742, + "grad_norm": 0.6035984671210802, + "learning_rate": 2.1320901043764276e-09, + "loss": 1.1752, + "step": 2521 + }, + { + "epoch": 0.9941364868194137, + "grad_norm": 0.6095120389983935, + "learning_rate": 1.8572958866514e-09, + "loss": 1.1458, + "step": 2522 + }, + { + "epoch": 0.994530672579453, + "grad_norm": 0.7589305669134696, + "learning_rate": 1.6014507314077165e-09, + "loss": 1.1667, + "step": 2523 + }, + { + "epoch": 0.9949248583394925, + "grad_norm": 0.6114552923969634, + "learning_rate": 1.3645551235386134e-09, + "loss": 1.1621, + "step": 2524 + }, + { + "epoch": 0.9953190440995319, + "grad_norm": 0.6058392606138625, + "learning_rate": 1.1466095120216126e-09, + "loss": 1.1241, + "step": 2525 + }, + { + "epoch": 0.9957132298595713, + "grad_norm": 0.5936603980813377, + "learning_rate": 9.476143099207414e-10, + "loss": 1.1423, + "step": 2526 + }, + { + "epoch": 0.9961074156196107, + "grad_norm": 0.5977975525192136, + "learning_rate": 7.67569894382092e-10, + "loss": 1.1964, + "step": 2527 + }, + { + "epoch": 0.9965016013796502, + "grad_norm": 0.5957259774856952, + "learning_rate": 6.064766066382622e-10, + "loss": 1.1949, + "step": 2528 + }, + { + "epoch": 0.9968957871396896, + "grad_norm": 0.6094021396471523, + "learning_rate": 4.643347520005836e-10, + "loss": 1.2123, + "step": 2529 + }, + { + "epoch": 0.997289972899729, + "grad_norm": 0.6120542827325469, + "learning_rate": 3.4114459986689386e-10, + "loss": 1.1313, + "step": 2530 + }, + { + "epoch": 0.9976841586597684, + "grad_norm": 0.6151506851061069, + "learning_rate": 2.369063837115437e-10, + "loss": 1.2058, + "step": 2531 + }, + { + "epoch": 0.9980783444198078, + "grad_norm": 0.6008592003001969, + "learning_rate": 1.5162030109538982e-10, + "loss": 1.151, + "step": 2532 + }, + { + "epoch": 0.9984725301798473, + "grad_norm": 0.5857304461429403, + "learning_rate": 8.528651365580232e-11, + "loss": 1.1576, + "step": 2533 + }, + { + "epoch": 0.9988667159398866, + "grad_norm": 0.6021334182290597, + "learning_rate": 3.790514711332627e-11, + "loss": 1.195, + "step": 2534 + }, + { + "epoch": 0.9992609016999261, + "grad_norm": 0.6202010114249676, + "learning_rate": 9.476291268351035e-12, + "loss": 1.192, + "step": 2535 + }, + { + "epoch": 0.9996550874599656, + "grad_norm": 0.6201498827195971, + "learning_rate": 0.0, + "loss": 1.1993, + "step": 2536 + }, + { + "epoch": 0.9996550874599656, + "eval_loss": 1.168265700340271, + "eval_runtime": 2983.6589, + "eval_samples_per_second": 5.508, + "eval_steps_per_second": 0.689, + "step": 2536 + }, + { + "epoch": 0.9996550874599656, + "step": 2536, + "total_flos": 661690545340416.0, + "train_loss": 1.391600751820423, + "train_runtime": 151844.1268, + "train_samples_per_second": 1.069, + "train_steps_per_second": 0.017 + } + ], + "logging_steps": 1, + "max_steps": 2536, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 661690545340416.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}