{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998596294216732, "eval_steps": 500, "global_step": 5935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008422234699606962, "grad_norm": 5.776987075805664, "learning_rate": 1.6835016835016835e-08, "loss": 0.8113, "step": 1 }, { "epoch": 0.0016844469399213925, "grad_norm": 5.731700420379639, "learning_rate": 3.367003367003367e-08, "loss": 0.818, "step": 2 }, { "epoch": 0.0025266704098820887, "grad_norm": 5.806257724761963, "learning_rate": 5.050505050505051e-08, "loss": 0.8415, "step": 3 }, { "epoch": 0.003368893879842785, "grad_norm": 5.792017459869385, "learning_rate": 6.734006734006734e-08, "loss": 0.8421, "step": 4 }, { "epoch": 0.004211117349803481, "grad_norm": 5.9308366775512695, "learning_rate": 8.417508417508418e-08, "loss": 0.86, "step": 5 }, { "epoch": 0.0050533408197641775, "grad_norm": 5.979487419128418, "learning_rate": 1.0101010101010103e-07, "loss": 0.8677, "step": 6 }, { "epoch": 0.005895564289724873, "grad_norm": 6.015115261077881, "learning_rate": 1.1784511784511785e-07, "loss": 0.8991, "step": 7 }, { "epoch": 0.00673778775968557, "grad_norm": 6.0273356437683105, "learning_rate": 1.3468013468013468e-07, "loss": 0.8651, "step": 8 }, { "epoch": 0.007580011229646266, "grad_norm": 5.9260358810424805, "learning_rate": 1.5151515151515152e-07, "loss": 0.8538, "step": 9 }, { "epoch": 0.008422234699606962, "grad_norm": 5.725348472595215, "learning_rate": 1.6835016835016837e-07, "loss": 0.847, "step": 10 }, { "epoch": 0.009264458169567658, "grad_norm": 5.714199542999268, "learning_rate": 1.8518518518518518e-07, "loss": 0.8524, "step": 11 }, { "epoch": 0.010106681639528355, "grad_norm": 5.929133415222168, "learning_rate": 2.0202020202020205e-07, "loss": 0.8546, "step": 12 }, { "epoch": 0.010948905109489052, "grad_norm": 5.69346284866333, "learning_rate": 2.188552188552189e-07, "loss": 0.8631, "step": 13 }, { "epoch": 0.011791128579449747, "grad_norm": 5.936670303344727, "learning_rate": 2.356902356902357e-07, "loss": 0.8814, "step": 14 }, { "epoch": 0.012633352049410443, "grad_norm": 5.8322601318359375, "learning_rate": 2.525252525252525e-07, "loss": 0.8738, "step": 15 }, { "epoch": 0.01347557551937114, "grad_norm": 5.473421573638916, "learning_rate": 2.6936026936026936e-07, "loss": 0.7996, "step": 16 }, { "epoch": 0.014317798989331837, "grad_norm": 5.642951965332031, "learning_rate": 2.8619528619528626e-07, "loss": 0.8722, "step": 17 }, { "epoch": 0.015160022459292532, "grad_norm": 5.274138450622559, "learning_rate": 3.0303030303030305e-07, "loss": 0.8338, "step": 18 }, { "epoch": 0.016002245929253228, "grad_norm": 5.380349636077881, "learning_rate": 3.198653198653199e-07, "loss": 0.8542, "step": 19 }, { "epoch": 0.016844469399213923, "grad_norm": 5.5772552490234375, "learning_rate": 3.3670033670033673e-07, "loss": 0.8379, "step": 20 }, { "epoch": 0.01768669286917462, "grad_norm": 5.257245063781738, "learning_rate": 3.535353535353536e-07, "loss": 0.8393, "step": 21 }, { "epoch": 0.018528916339135316, "grad_norm": 5.2609663009643555, "learning_rate": 3.7037037037037036e-07, "loss": 0.8285, "step": 22 }, { "epoch": 0.019371139809096015, "grad_norm": 4.728295803070068, "learning_rate": 3.872053872053872e-07, "loss": 0.7942, "step": 23 }, { "epoch": 0.02021336327905671, "grad_norm": 4.521248817443848, "learning_rate": 4.040404040404041e-07, "loss": 0.8277, "step": 24 }, { "epoch": 0.021055586749017405, "grad_norm": 4.532914638519287, "learning_rate": 4.2087542087542094e-07, "loss": 0.8377, "step": 25 }, { "epoch": 0.021897810218978103, "grad_norm": 4.479588031768799, "learning_rate": 4.377104377104378e-07, "loss": 0.8582, "step": 26 }, { "epoch": 0.022740033688938798, "grad_norm": 4.330374717712402, "learning_rate": 4.5454545454545457e-07, "loss": 0.8192, "step": 27 }, { "epoch": 0.023582257158899493, "grad_norm": 4.251430034637451, "learning_rate": 4.713804713804714e-07, "loss": 0.8081, "step": 28 }, { "epoch": 0.02442448062886019, "grad_norm": 4.104611396789551, "learning_rate": 4.882154882154883e-07, "loss": 0.7756, "step": 29 }, { "epoch": 0.025266704098820886, "grad_norm": 4.189428329467773, "learning_rate": 5.05050505050505e-07, "loss": 0.8328, "step": 30 }, { "epoch": 0.026108927568781585, "grad_norm": 3.919502019882202, "learning_rate": 5.218855218855219e-07, "loss": 0.7925, "step": 31 }, { "epoch": 0.02695115103874228, "grad_norm": 2.6382088661193848, "learning_rate": 5.387205387205387e-07, "loss": 0.7719, "step": 32 }, { "epoch": 0.027793374508702975, "grad_norm": 2.2393641471862793, "learning_rate": 5.555555555555555e-07, "loss": 0.7547, "step": 33 }, { "epoch": 0.028635597978663673, "grad_norm": 2.3174548149108887, "learning_rate": 5.723905723905725e-07, "loss": 0.7462, "step": 34 }, { "epoch": 0.029477821448624368, "grad_norm": 2.258150577545166, "learning_rate": 5.892255892255893e-07, "loss": 0.7806, "step": 35 }, { "epoch": 0.030320044918585063, "grad_norm": 2.1710970401763916, "learning_rate": 6.060606060606061e-07, "loss": 0.7402, "step": 36 }, { "epoch": 0.03116226838854576, "grad_norm": 2.1291310787200928, "learning_rate": 6.22895622895623e-07, "loss": 0.7254, "step": 37 }, { "epoch": 0.032004491858506456, "grad_norm": 2.1052963733673096, "learning_rate": 6.397306397306398e-07, "loss": 0.7707, "step": 38 }, { "epoch": 0.032846715328467155, "grad_norm": 2.0386407375335693, "learning_rate": 6.565656565656567e-07, "loss": 0.7713, "step": 39 }, { "epoch": 0.033688938798427846, "grad_norm": 1.802610158920288, "learning_rate": 6.734006734006735e-07, "loss": 0.7322, "step": 40 }, { "epoch": 0.034531162268388545, "grad_norm": 1.9417866468429565, "learning_rate": 6.902356902356904e-07, "loss": 0.7558, "step": 41 }, { "epoch": 0.03537338573834924, "grad_norm": 1.6213467121124268, "learning_rate": 7.070707070707071e-07, "loss": 0.7357, "step": 42 }, { "epoch": 0.03621560920830994, "grad_norm": 1.4845741987228394, "learning_rate": 7.23905723905724e-07, "loss": 0.7444, "step": 43 }, { "epoch": 0.03705783267827063, "grad_norm": 1.3799744844436646, "learning_rate": 7.407407407407407e-07, "loss": 0.7088, "step": 44 }, { "epoch": 0.03790005614823133, "grad_norm": 1.6377196311950684, "learning_rate": 7.575757575757576e-07, "loss": 0.7337, "step": 45 }, { "epoch": 0.03874227961819203, "grad_norm": 1.7535754442214966, "learning_rate": 7.744107744107744e-07, "loss": 0.7036, "step": 46 }, { "epoch": 0.03958450308815272, "grad_norm": 1.8757281303405762, "learning_rate": 7.912457912457913e-07, "loss": 0.7358, "step": 47 }, { "epoch": 0.04042672655811342, "grad_norm": 1.7928193807601929, "learning_rate": 8.080808080808082e-07, "loss": 0.6878, "step": 48 }, { "epoch": 0.04126895002807412, "grad_norm": 1.859838843345642, "learning_rate": 8.24915824915825e-07, "loss": 0.7108, "step": 49 }, { "epoch": 0.04211117349803481, "grad_norm": 1.520964503288269, "learning_rate": 8.417508417508419e-07, "loss": 0.6595, "step": 50 }, { "epoch": 0.04295339696799551, "grad_norm": 1.5494898557662964, "learning_rate": 8.585858585858587e-07, "loss": 0.6928, "step": 51 }, { "epoch": 0.043795620437956206, "grad_norm": 1.4317282438278198, "learning_rate": 8.754208754208756e-07, "loss": 0.6958, "step": 52 }, { "epoch": 0.0446378439079169, "grad_norm": 1.2734413146972656, "learning_rate": 8.922558922558923e-07, "loss": 0.7265, "step": 53 }, { "epoch": 0.045480067377877596, "grad_norm": 1.1653810739517212, "learning_rate": 9.090909090909091e-07, "loss": 0.7128, "step": 54 }, { "epoch": 0.046322290847838295, "grad_norm": 1.0482120513916016, "learning_rate": 9.259259259259259e-07, "loss": 0.7144, "step": 55 }, { "epoch": 0.047164514317798986, "grad_norm": 0.9268953800201416, "learning_rate": 9.427609427609428e-07, "loss": 0.6947, "step": 56 }, { "epoch": 0.048006737787759685, "grad_norm": 0.9306286573410034, "learning_rate": 9.595959595959596e-07, "loss": 0.6829, "step": 57 }, { "epoch": 0.04884896125772038, "grad_norm": 0.928467333316803, "learning_rate": 9.764309764309765e-07, "loss": 0.67, "step": 58 }, { "epoch": 0.04969118472768108, "grad_norm": 0.9721364378929138, "learning_rate": 9.932659932659934e-07, "loss": 0.6278, "step": 59 }, { "epoch": 0.05053340819764177, "grad_norm": 0.9665070176124573, "learning_rate": 1.01010101010101e-06, "loss": 0.704, "step": 60 }, { "epoch": 0.05137563166760247, "grad_norm": 0.8246660828590393, "learning_rate": 1.026936026936027e-06, "loss": 0.6593, "step": 61 }, { "epoch": 0.05221785513756317, "grad_norm": 0.7794570326805115, "learning_rate": 1.0437710437710439e-06, "loss": 0.6463, "step": 62 }, { "epoch": 0.05306007860752386, "grad_norm": 0.7768704891204834, "learning_rate": 1.0606060606060608e-06, "loss": 0.6565, "step": 63 }, { "epoch": 0.05390230207748456, "grad_norm": 0.706986129283905, "learning_rate": 1.0774410774410775e-06, "loss": 0.6183, "step": 64 }, { "epoch": 0.05474452554744526, "grad_norm": 0.7199012041091919, "learning_rate": 1.0942760942760944e-06, "loss": 0.625, "step": 65 }, { "epoch": 0.05558674901740595, "grad_norm": 0.6904604434967041, "learning_rate": 1.111111111111111e-06, "loss": 0.6508, "step": 66 }, { "epoch": 0.05642897248736665, "grad_norm": 0.6288436055183411, "learning_rate": 1.1279461279461281e-06, "loss": 0.6245, "step": 67 }, { "epoch": 0.057271195957327346, "grad_norm": 0.62641841173172, "learning_rate": 1.144781144781145e-06, "loss": 0.6113, "step": 68 }, { "epoch": 0.05811341942728804, "grad_norm": 0.5761350989341736, "learning_rate": 1.1616161616161617e-06, "loss": 0.634, "step": 69 }, { "epoch": 0.058955642897248736, "grad_norm": 0.5666042566299438, "learning_rate": 1.1784511784511786e-06, "loss": 0.616, "step": 70 }, { "epoch": 0.059797866367209435, "grad_norm": 0.5939538478851318, "learning_rate": 1.1952861952861953e-06, "loss": 0.6393, "step": 71 }, { "epoch": 0.060640089837170126, "grad_norm": 0.6546233892440796, "learning_rate": 1.2121212121212122e-06, "loss": 0.6367, "step": 72 }, { "epoch": 0.061482313307130824, "grad_norm": 0.5509408116340637, "learning_rate": 1.228956228956229e-06, "loss": 0.6316, "step": 73 }, { "epoch": 0.06232453677709152, "grad_norm": 0.51455157995224, "learning_rate": 1.245791245791246e-06, "loss": 0.6211, "step": 74 }, { "epoch": 0.06316676024705221, "grad_norm": 0.588775634765625, "learning_rate": 1.2626262626262629e-06, "loss": 0.6026, "step": 75 }, { "epoch": 0.06400898371701291, "grad_norm": 0.46751725673675537, "learning_rate": 1.2794612794612796e-06, "loss": 0.5882, "step": 76 }, { "epoch": 0.06485120718697361, "grad_norm": 0.4818503260612488, "learning_rate": 1.2962962962962962e-06, "loss": 0.6069, "step": 77 }, { "epoch": 0.06569343065693431, "grad_norm": 0.5167794227600098, "learning_rate": 1.3131313131313134e-06, "loss": 0.6513, "step": 78 }, { "epoch": 0.06653565412689501, "grad_norm": 0.46660304069519043, "learning_rate": 1.32996632996633e-06, "loss": 0.5974, "step": 79 }, { "epoch": 0.06737787759685569, "grad_norm": 0.46841952204704285, "learning_rate": 1.346801346801347e-06, "loss": 0.6291, "step": 80 }, { "epoch": 0.06822010106681639, "grad_norm": 0.5091264843940735, "learning_rate": 1.3636363636363636e-06, "loss": 0.6121, "step": 81 }, { "epoch": 0.06906232453677709, "grad_norm": 0.5265926122665405, "learning_rate": 1.3804713804713807e-06, "loss": 0.6102, "step": 82 }, { "epoch": 0.06990454800673779, "grad_norm": 0.45408716797828674, "learning_rate": 1.3973063973063974e-06, "loss": 0.5792, "step": 83 }, { "epoch": 0.07074677147669849, "grad_norm": 0.4690511226654053, "learning_rate": 1.4141414141414143e-06, "loss": 0.6068, "step": 84 }, { "epoch": 0.07158899494665918, "grad_norm": 0.44161954522132874, "learning_rate": 1.430976430976431e-06, "loss": 0.5827, "step": 85 }, { "epoch": 0.07243121841661988, "grad_norm": 0.45739510655403137, "learning_rate": 1.447811447811448e-06, "loss": 0.6315, "step": 86 }, { "epoch": 0.07327344188658057, "grad_norm": 0.4600661098957062, "learning_rate": 1.4646464646464648e-06, "loss": 0.6037, "step": 87 }, { "epoch": 0.07411566535654127, "grad_norm": 0.47874510288238525, "learning_rate": 1.4814814814814815e-06, "loss": 0.6241, "step": 88 }, { "epoch": 0.07495788882650196, "grad_norm": 0.45248445868492126, "learning_rate": 1.4983164983164986e-06, "loss": 0.6116, "step": 89 }, { "epoch": 0.07580011229646266, "grad_norm": 0.47081783413887024, "learning_rate": 1.5151515151515152e-06, "loss": 0.6276, "step": 90 }, { "epoch": 0.07664233576642336, "grad_norm": 0.4259335398674011, "learning_rate": 1.5319865319865321e-06, "loss": 0.5899, "step": 91 }, { "epoch": 0.07748455923638406, "grad_norm": 0.41876012086868286, "learning_rate": 1.5488215488215488e-06, "loss": 0.5846, "step": 92 }, { "epoch": 0.07832678270634474, "grad_norm": 0.5013454556465149, "learning_rate": 1.565656565656566e-06, "loss": 0.5901, "step": 93 }, { "epoch": 0.07916900617630544, "grad_norm": 0.45448780059814453, "learning_rate": 1.5824915824915826e-06, "loss": 0.6193, "step": 94 }, { "epoch": 0.08001122964626614, "grad_norm": 0.43251457810401917, "learning_rate": 1.5993265993265993e-06, "loss": 0.5881, "step": 95 }, { "epoch": 0.08085345311622684, "grad_norm": 0.394329309463501, "learning_rate": 1.6161616161616164e-06, "loss": 0.5797, "step": 96 }, { "epoch": 0.08169567658618754, "grad_norm": 0.4374575912952423, "learning_rate": 1.6329966329966333e-06, "loss": 0.6127, "step": 97 }, { "epoch": 0.08253790005614824, "grad_norm": 0.41746532917022705, "learning_rate": 1.64983164983165e-06, "loss": 0.5709, "step": 98 }, { "epoch": 0.08338012352610892, "grad_norm": 0.4681364595890045, "learning_rate": 1.6666666666666667e-06, "loss": 0.565, "step": 99 }, { "epoch": 0.08422234699606962, "grad_norm": 0.40742427110671997, "learning_rate": 1.6835016835016838e-06, "loss": 0.6075, "step": 100 }, { "epoch": 0.08506457046603032, "grad_norm": 0.43414172530174255, "learning_rate": 1.7003367003367005e-06, "loss": 0.5861, "step": 101 }, { "epoch": 0.08590679393599102, "grad_norm": 0.4720393121242523, "learning_rate": 1.7171717171717173e-06, "loss": 0.6138, "step": 102 }, { "epoch": 0.08674901740595171, "grad_norm": 0.47483813762664795, "learning_rate": 1.734006734006734e-06, "loss": 0.5754, "step": 103 }, { "epoch": 0.08759124087591241, "grad_norm": 0.4342634975910187, "learning_rate": 1.7508417508417511e-06, "loss": 0.6078, "step": 104 }, { "epoch": 0.08843346434587311, "grad_norm": 0.45002737641334534, "learning_rate": 1.7676767676767678e-06, "loss": 0.5875, "step": 105 }, { "epoch": 0.0892756878158338, "grad_norm": 0.39936861395835876, "learning_rate": 1.7845117845117845e-06, "loss": 0.5613, "step": 106 }, { "epoch": 0.0901179112857945, "grad_norm": 0.440410852432251, "learning_rate": 1.8013468013468016e-06, "loss": 0.5673, "step": 107 }, { "epoch": 0.09096013475575519, "grad_norm": 0.41290518641471863, "learning_rate": 1.8181818181818183e-06, "loss": 0.5836, "step": 108 }, { "epoch": 0.09180235822571589, "grad_norm": 0.42081207036972046, "learning_rate": 1.8350168350168352e-06, "loss": 0.5832, "step": 109 }, { "epoch": 0.09264458169567659, "grad_norm": 0.4456280469894409, "learning_rate": 1.8518518518518519e-06, "loss": 0.5772, "step": 110 }, { "epoch": 0.09348680516563729, "grad_norm": 0.40804535150527954, "learning_rate": 1.868686868686869e-06, "loss": 0.5974, "step": 111 }, { "epoch": 0.09432902863559797, "grad_norm": 0.401093989610672, "learning_rate": 1.8855218855218857e-06, "loss": 0.5595, "step": 112 }, { "epoch": 0.09517125210555867, "grad_norm": 0.4588485658168793, "learning_rate": 1.9023569023569026e-06, "loss": 0.5804, "step": 113 }, { "epoch": 0.09601347557551937, "grad_norm": 0.4583130478858948, "learning_rate": 1.9191919191919192e-06, "loss": 0.5838, "step": 114 }, { "epoch": 0.09685569904548007, "grad_norm": 0.42039844393730164, "learning_rate": 1.936026936026936e-06, "loss": 0.5634, "step": 115 }, { "epoch": 0.09769792251544077, "grad_norm": 0.402934730052948, "learning_rate": 1.952861952861953e-06, "loss": 0.5622, "step": 116 }, { "epoch": 0.09854014598540146, "grad_norm": 0.38958680629730225, "learning_rate": 1.96969696969697e-06, "loss": 0.5768, "step": 117 }, { "epoch": 0.09938236945536216, "grad_norm": 0.42844828963279724, "learning_rate": 1.986531986531987e-06, "loss": 0.571, "step": 118 }, { "epoch": 0.10022459292532285, "grad_norm": 0.4351634085178375, "learning_rate": 2.0033670033670037e-06, "loss": 0.5746, "step": 119 }, { "epoch": 0.10106681639528355, "grad_norm": 0.4018780291080475, "learning_rate": 2.02020202020202e-06, "loss": 0.5604, "step": 120 }, { "epoch": 0.10190903986524424, "grad_norm": 0.4231856167316437, "learning_rate": 2.037037037037037e-06, "loss": 0.5946, "step": 121 }, { "epoch": 0.10275126333520494, "grad_norm": 0.4118141531944275, "learning_rate": 2.053872053872054e-06, "loss": 0.5588, "step": 122 }, { "epoch": 0.10359348680516564, "grad_norm": 0.38295167684555054, "learning_rate": 2.070707070707071e-06, "loss": 0.5502, "step": 123 }, { "epoch": 0.10443571027512634, "grad_norm": 0.4265163540840149, "learning_rate": 2.0875420875420878e-06, "loss": 0.5336, "step": 124 }, { "epoch": 0.10527793374508702, "grad_norm": 0.37607690691947937, "learning_rate": 2.1043771043771047e-06, "loss": 0.5513, "step": 125 }, { "epoch": 0.10612015721504772, "grad_norm": 0.43694496154785156, "learning_rate": 2.1212121212121216e-06, "loss": 0.6044, "step": 126 }, { "epoch": 0.10696238068500842, "grad_norm": 0.4131213426589966, "learning_rate": 2.138047138047138e-06, "loss": 0.564, "step": 127 }, { "epoch": 0.10780460415496912, "grad_norm": 0.37771841883659363, "learning_rate": 2.154882154882155e-06, "loss": 0.5412, "step": 128 }, { "epoch": 0.10864682762492982, "grad_norm": 0.40186527371406555, "learning_rate": 2.171717171717172e-06, "loss": 0.5548, "step": 129 }, { "epoch": 0.10948905109489052, "grad_norm": 0.4168996810913086, "learning_rate": 2.1885521885521887e-06, "loss": 0.5852, "step": 130 }, { "epoch": 0.1103312745648512, "grad_norm": 0.3783559799194336, "learning_rate": 2.2053872053872056e-06, "loss": 0.5621, "step": 131 }, { "epoch": 0.1111734980348119, "grad_norm": 0.4038413465023041, "learning_rate": 2.222222222222222e-06, "loss": 0.5451, "step": 132 }, { "epoch": 0.1120157215047726, "grad_norm": 0.4264334440231323, "learning_rate": 2.2390572390572394e-06, "loss": 0.5698, "step": 133 }, { "epoch": 0.1128579449747333, "grad_norm": 0.4022243916988373, "learning_rate": 2.2558922558922563e-06, "loss": 0.5694, "step": 134 }, { "epoch": 0.113700168444694, "grad_norm": 0.3824006915092468, "learning_rate": 2.2727272727272728e-06, "loss": 0.5311, "step": 135 }, { "epoch": 0.11454239191465469, "grad_norm": 0.42129233479499817, "learning_rate": 2.28956228956229e-06, "loss": 0.5781, "step": 136 }, { "epoch": 0.11538461538461539, "grad_norm": 0.39466583728790283, "learning_rate": 2.3063973063973065e-06, "loss": 0.5328, "step": 137 }, { "epoch": 0.11622683885457608, "grad_norm": 0.41918909549713135, "learning_rate": 2.3232323232323234e-06, "loss": 0.5578, "step": 138 }, { "epoch": 0.11706906232453677, "grad_norm": 0.39125731587409973, "learning_rate": 2.3400673400673403e-06, "loss": 0.5154, "step": 139 }, { "epoch": 0.11791128579449747, "grad_norm": 0.439113587141037, "learning_rate": 2.3569023569023572e-06, "loss": 0.5588, "step": 140 }, { "epoch": 0.11875350926445817, "grad_norm": 0.38148432970046997, "learning_rate": 2.373737373737374e-06, "loss": 0.56, "step": 141 }, { "epoch": 0.11959573273441887, "grad_norm": 0.41025426983833313, "learning_rate": 2.3905723905723906e-06, "loss": 0.5456, "step": 142 }, { "epoch": 0.12043795620437957, "grad_norm": 0.37580060958862305, "learning_rate": 2.4074074074074075e-06, "loss": 0.534, "step": 143 }, { "epoch": 0.12128017967434025, "grad_norm": 0.3843674659729004, "learning_rate": 2.4242424242424244e-06, "loss": 0.5694, "step": 144 }, { "epoch": 0.12212240314430095, "grad_norm": 0.4690525233745575, "learning_rate": 2.4410774410774413e-06, "loss": 0.554, "step": 145 }, { "epoch": 0.12296462661426165, "grad_norm": 0.40167638659477234, "learning_rate": 2.457912457912458e-06, "loss": 0.5555, "step": 146 }, { "epoch": 0.12380685008422235, "grad_norm": 0.4418562948703766, "learning_rate": 2.474747474747475e-06, "loss": 0.5509, "step": 147 }, { "epoch": 0.12464907355418305, "grad_norm": 0.4236592650413513, "learning_rate": 2.491582491582492e-06, "loss": 0.5476, "step": 148 }, { "epoch": 0.12549129702414374, "grad_norm": 0.4365476667881012, "learning_rate": 2.508417508417509e-06, "loss": 0.5553, "step": 149 }, { "epoch": 0.12633352049410443, "grad_norm": 0.4413875639438629, "learning_rate": 2.5252525252525258e-06, "loss": 0.5803, "step": 150 }, { "epoch": 0.12717574396406514, "grad_norm": 0.38760268688201904, "learning_rate": 2.5420875420875422e-06, "loss": 0.5179, "step": 151 }, { "epoch": 0.12801796743402583, "grad_norm": 0.43812376260757446, "learning_rate": 2.558922558922559e-06, "loss": 0.5465, "step": 152 }, { "epoch": 0.12886019090398654, "grad_norm": 0.38343387842178345, "learning_rate": 2.575757575757576e-06, "loss": 0.5629, "step": 153 }, { "epoch": 0.12970241437394722, "grad_norm": 0.409627765417099, "learning_rate": 2.5925925925925925e-06, "loss": 0.5571, "step": 154 }, { "epoch": 0.1305446378439079, "grad_norm": 0.4320380389690399, "learning_rate": 2.6094276094276094e-06, "loss": 0.5764, "step": 155 }, { "epoch": 0.13138686131386862, "grad_norm": 0.3807540237903595, "learning_rate": 2.6262626262626267e-06, "loss": 0.5399, "step": 156 }, { "epoch": 0.1322290847838293, "grad_norm": 0.4925825893878937, "learning_rate": 2.6430976430976436e-06, "loss": 0.5721, "step": 157 }, { "epoch": 0.13307130825379002, "grad_norm": 0.4407508075237274, "learning_rate": 2.65993265993266e-06, "loss": 0.5386, "step": 158 }, { "epoch": 0.1339135317237507, "grad_norm": 0.3947726786136627, "learning_rate": 2.676767676767677e-06, "loss": 0.5716, "step": 159 }, { "epoch": 0.13475575519371139, "grad_norm": 0.41203153133392334, "learning_rate": 2.693602693602694e-06, "loss": 0.5494, "step": 160 }, { "epoch": 0.1355979786636721, "grad_norm": 0.5117720365524292, "learning_rate": 2.7104377104377103e-06, "loss": 0.5703, "step": 161 }, { "epoch": 0.13644020213363278, "grad_norm": 0.4902465045452118, "learning_rate": 2.7272727272727272e-06, "loss": 0.5339, "step": 162 }, { "epoch": 0.1372824256035935, "grad_norm": 0.40383705496788025, "learning_rate": 2.7441077441077445e-06, "loss": 0.5342, "step": 163 }, { "epoch": 0.13812464907355418, "grad_norm": 0.44565486907958984, "learning_rate": 2.7609427609427614e-06, "loss": 0.5849, "step": 164 }, { "epoch": 0.1389668725435149, "grad_norm": 0.4988057315349579, "learning_rate": 2.7777777777777783e-06, "loss": 0.5334, "step": 165 }, { "epoch": 0.13980909601347558, "grad_norm": 0.46498870849609375, "learning_rate": 2.794612794612795e-06, "loss": 0.5748, "step": 166 }, { "epoch": 0.14065131948343626, "grad_norm": 0.43350544571876526, "learning_rate": 2.8114478114478117e-06, "loss": 0.5454, "step": 167 }, { "epoch": 0.14149354295339697, "grad_norm": 0.4967784285545349, "learning_rate": 2.8282828282828286e-06, "loss": 0.5675, "step": 168 }, { "epoch": 0.14233576642335766, "grad_norm": 0.4603004455566406, "learning_rate": 2.845117845117845e-06, "loss": 0.5478, "step": 169 }, { "epoch": 0.14317798989331837, "grad_norm": 0.3968580663204193, "learning_rate": 2.861952861952862e-06, "loss": 0.5455, "step": 170 }, { "epoch": 0.14402021336327905, "grad_norm": 0.459073543548584, "learning_rate": 2.8787878787878793e-06, "loss": 0.5282, "step": 171 }, { "epoch": 0.14486243683323977, "grad_norm": 0.46953684091567993, "learning_rate": 2.895622895622896e-06, "loss": 0.5624, "step": 172 }, { "epoch": 0.14570466030320045, "grad_norm": 0.45719990134239197, "learning_rate": 2.9124579124579126e-06, "loss": 0.5438, "step": 173 }, { "epoch": 0.14654688377316114, "grad_norm": 0.37315723299980164, "learning_rate": 2.9292929292929295e-06, "loss": 0.5135, "step": 174 }, { "epoch": 0.14738910724312185, "grad_norm": 0.425453782081604, "learning_rate": 2.9461279461279464e-06, "loss": 0.5502, "step": 175 }, { "epoch": 0.14823133071308253, "grad_norm": 0.44172659516334534, "learning_rate": 2.962962962962963e-06, "loss": 0.5322, "step": 176 }, { "epoch": 0.14907355418304324, "grad_norm": 0.42367592453956604, "learning_rate": 2.97979797979798e-06, "loss": 0.54, "step": 177 }, { "epoch": 0.14991577765300393, "grad_norm": 0.4770338535308838, "learning_rate": 2.996632996632997e-06, "loss": 0.5852, "step": 178 }, { "epoch": 0.1507580011229646, "grad_norm": 0.4225494861602783, "learning_rate": 3.013468013468014e-06, "loss": 0.5293, "step": 179 }, { "epoch": 0.15160022459292533, "grad_norm": 0.4103783965110779, "learning_rate": 3.0303030303030305e-06, "loss": 0.5459, "step": 180 }, { "epoch": 0.152442448062886, "grad_norm": 0.4027358889579773, "learning_rate": 3.0471380471380474e-06, "loss": 0.5056, "step": 181 }, { "epoch": 0.15328467153284672, "grad_norm": 0.4330751299858093, "learning_rate": 3.0639730639730643e-06, "loss": 0.5472, "step": 182 }, { "epoch": 0.1541268950028074, "grad_norm": 0.4501986503601074, "learning_rate": 3.0808080808080807e-06, "loss": 0.5388, "step": 183 }, { "epoch": 0.15496911847276812, "grad_norm": 0.4392593204975128, "learning_rate": 3.0976430976430976e-06, "loss": 0.5576, "step": 184 }, { "epoch": 0.1558113419427288, "grad_norm": 0.4822996258735657, "learning_rate": 3.114478114478115e-06, "loss": 0.5705, "step": 185 }, { "epoch": 0.1566535654126895, "grad_norm": 0.44021180272102356, "learning_rate": 3.131313131313132e-06, "loss": 0.53, "step": 186 }, { "epoch": 0.1574957888826502, "grad_norm": 0.4028305113315582, "learning_rate": 3.1481481481481483e-06, "loss": 0.5049, "step": 187 }, { "epoch": 0.15833801235261089, "grad_norm": 0.4939925968647003, "learning_rate": 3.1649831649831652e-06, "loss": 0.5307, "step": 188 }, { "epoch": 0.1591802358225716, "grad_norm": 0.4758276045322418, "learning_rate": 3.181818181818182e-06, "loss": 0.5773, "step": 189 }, { "epoch": 0.16002245929253228, "grad_norm": 0.4271968603134155, "learning_rate": 3.1986531986531986e-06, "loss": 0.533, "step": 190 }, { "epoch": 0.160864682762493, "grad_norm": 0.41890472173690796, "learning_rate": 3.2154882154882155e-06, "loss": 0.5114, "step": 191 }, { "epoch": 0.16170690623245368, "grad_norm": 0.47529691457748413, "learning_rate": 3.232323232323233e-06, "loss": 0.5244, "step": 192 }, { "epoch": 0.16254912970241436, "grad_norm": 0.4498061537742615, "learning_rate": 3.2491582491582497e-06, "loss": 0.5243, "step": 193 }, { "epoch": 0.16339135317237508, "grad_norm": 0.411465048789978, "learning_rate": 3.2659932659932666e-06, "loss": 0.5257, "step": 194 }, { "epoch": 0.16423357664233576, "grad_norm": 0.4189465641975403, "learning_rate": 3.282828282828283e-06, "loss": 0.5289, "step": 195 }, { "epoch": 0.16507580011229647, "grad_norm": 0.42838072776794434, "learning_rate": 3.2996632996633e-06, "loss": 0.5075, "step": 196 }, { "epoch": 0.16591802358225716, "grad_norm": 0.4399939775466919, "learning_rate": 3.316498316498317e-06, "loss": 0.5463, "step": 197 }, { "epoch": 0.16676024705221784, "grad_norm": 0.4141615927219391, "learning_rate": 3.3333333333333333e-06, "loss": 0.5472, "step": 198 }, { "epoch": 0.16760247052217855, "grad_norm": 0.40688008069992065, "learning_rate": 3.3501683501683502e-06, "loss": 0.5299, "step": 199 }, { "epoch": 0.16844469399213924, "grad_norm": 0.4695212244987488, "learning_rate": 3.3670033670033675e-06, "loss": 0.5499, "step": 200 }, { "epoch": 0.16928691746209995, "grad_norm": 0.42900463938713074, "learning_rate": 3.3838383838383844e-06, "loss": 0.536, "step": 201 }, { "epoch": 0.17012914093206064, "grad_norm": 0.4439961910247803, "learning_rate": 3.400673400673401e-06, "loss": 0.51, "step": 202 }, { "epoch": 0.17097136440202135, "grad_norm": 0.4091702103614807, "learning_rate": 3.417508417508418e-06, "loss": 0.5179, "step": 203 }, { "epoch": 0.17181358787198203, "grad_norm": 0.4129495620727539, "learning_rate": 3.4343434343434347e-06, "loss": 0.563, "step": 204 }, { "epoch": 0.17265581134194272, "grad_norm": 0.39078933000564575, "learning_rate": 3.451178451178451e-06, "loss": 0.5178, "step": 205 }, { "epoch": 0.17349803481190343, "grad_norm": 0.4615379571914673, "learning_rate": 3.468013468013468e-06, "loss": 0.5166, "step": 206 }, { "epoch": 0.1743402582818641, "grad_norm": 0.39247986674308777, "learning_rate": 3.4848484848484854e-06, "loss": 0.519, "step": 207 }, { "epoch": 0.17518248175182483, "grad_norm": 0.45915648341178894, "learning_rate": 3.5016835016835023e-06, "loss": 0.5472, "step": 208 }, { "epoch": 0.1760247052217855, "grad_norm": 0.4408401548862457, "learning_rate": 3.5185185185185187e-06, "loss": 0.5271, "step": 209 }, { "epoch": 0.17686692869174622, "grad_norm": 0.4287291467189789, "learning_rate": 3.5353535353535356e-06, "loss": 0.5351, "step": 210 }, { "epoch": 0.1777091521617069, "grad_norm": 0.4072006046772003, "learning_rate": 3.5521885521885525e-06, "loss": 0.538, "step": 211 }, { "epoch": 0.1785513756316676, "grad_norm": 0.41168832778930664, "learning_rate": 3.569023569023569e-06, "loss": 0.517, "step": 212 }, { "epoch": 0.1793935991016283, "grad_norm": 0.438351571559906, "learning_rate": 3.585858585858586e-06, "loss": 0.5365, "step": 213 }, { "epoch": 0.180235822571589, "grad_norm": 0.43116092681884766, "learning_rate": 3.6026936026936032e-06, "loss": 0.5547, "step": 214 }, { "epoch": 0.1810780460415497, "grad_norm": 0.3968820571899414, "learning_rate": 3.61952861952862e-06, "loss": 0.5417, "step": 215 }, { "epoch": 0.18192026951151039, "grad_norm": 0.42071765661239624, "learning_rate": 3.6363636363636366e-06, "loss": 0.5374, "step": 216 }, { "epoch": 0.1827624929814711, "grad_norm": 0.38877007365226746, "learning_rate": 3.6531986531986535e-06, "loss": 0.5276, "step": 217 }, { "epoch": 0.18360471645143178, "grad_norm": 0.43149521946907043, "learning_rate": 3.6700336700336704e-06, "loss": 0.5088, "step": 218 }, { "epoch": 0.18444693992139247, "grad_norm": 0.3926420509815216, "learning_rate": 3.686868686868687e-06, "loss": 0.5328, "step": 219 }, { "epoch": 0.18528916339135318, "grad_norm": 0.39404818415641785, "learning_rate": 3.7037037037037037e-06, "loss": 0.5278, "step": 220 }, { "epoch": 0.18613138686131386, "grad_norm": 0.4399459958076477, "learning_rate": 3.720538720538721e-06, "loss": 0.5218, "step": 221 }, { "epoch": 0.18697361033127458, "grad_norm": 0.41848158836364746, "learning_rate": 3.737373737373738e-06, "loss": 0.5352, "step": 222 }, { "epoch": 0.18781583380123526, "grad_norm": 0.41001957654953003, "learning_rate": 3.7542087542087544e-06, "loss": 0.5235, "step": 223 }, { "epoch": 0.18865805727119594, "grad_norm": 0.5028105974197388, "learning_rate": 3.7710437710437713e-06, "loss": 0.516, "step": 224 }, { "epoch": 0.18950028074115666, "grad_norm": 0.40041211247444153, "learning_rate": 3.7878787878787882e-06, "loss": 0.5019, "step": 225 }, { "epoch": 0.19034250421111734, "grad_norm": 0.44110947847366333, "learning_rate": 3.804713804713805e-06, "loss": 0.54, "step": 226 }, { "epoch": 0.19118472768107805, "grad_norm": 0.4043547511100769, "learning_rate": 3.821548821548822e-06, "loss": 0.5388, "step": 227 }, { "epoch": 0.19202695115103874, "grad_norm": 0.41902074217796326, "learning_rate": 3.8383838383838385e-06, "loss": 0.5115, "step": 228 }, { "epoch": 0.19286917462099945, "grad_norm": 0.46237629652023315, "learning_rate": 3.855218855218856e-06, "loss": 0.517, "step": 229 }, { "epoch": 0.19371139809096014, "grad_norm": 0.40824878215789795, "learning_rate": 3.872053872053872e-06, "loss": 0.5299, "step": 230 }, { "epoch": 0.19455362156092082, "grad_norm": 0.4160974323749542, "learning_rate": 3.88888888888889e-06, "loss": 0.5198, "step": 231 }, { "epoch": 0.19539584503088153, "grad_norm": 0.4164097011089325, "learning_rate": 3.905723905723906e-06, "loss": 0.4971, "step": 232 }, { "epoch": 0.19623806850084222, "grad_norm": 0.40995925664901733, "learning_rate": 3.9225589225589225e-06, "loss": 0.5629, "step": 233 }, { "epoch": 0.19708029197080293, "grad_norm": 0.43913060426712036, "learning_rate": 3.93939393939394e-06, "loss": 0.5314, "step": 234 }, { "epoch": 0.1979225154407636, "grad_norm": 0.4449063241481781, "learning_rate": 3.956228956228956e-06, "loss": 0.5108, "step": 235 }, { "epoch": 0.19876473891072433, "grad_norm": 0.4434010684490204, "learning_rate": 3.973063973063974e-06, "loss": 0.498, "step": 236 }, { "epoch": 0.199606962380685, "grad_norm": 0.4149439334869385, "learning_rate": 3.98989898989899e-06, "loss": 0.5312, "step": 237 }, { "epoch": 0.2004491858506457, "grad_norm": 0.45361265540122986, "learning_rate": 4.0067340067340074e-06, "loss": 0.5124, "step": 238 }, { "epoch": 0.2012914093206064, "grad_norm": 0.42861616611480713, "learning_rate": 4.023569023569024e-06, "loss": 0.5083, "step": 239 }, { "epoch": 0.2021336327905671, "grad_norm": 0.42821767926216125, "learning_rate": 4.04040404040404e-06, "loss": 0.5216, "step": 240 }, { "epoch": 0.2029758562605278, "grad_norm": 0.4700522720813751, "learning_rate": 4.057239057239058e-06, "loss": 0.521, "step": 241 }, { "epoch": 0.2038180797304885, "grad_norm": 0.4585249125957489, "learning_rate": 4.074074074074074e-06, "loss": 0.5166, "step": 242 }, { "epoch": 0.20466030320044917, "grad_norm": 0.45544037222862244, "learning_rate": 4.0909090909090915e-06, "loss": 0.5603, "step": 243 }, { "epoch": 0.20550252667040989, "grad_norm": 0.4855538010597229, "learning_rate": 4.107744107744108e-06, "loss": 0.5156, "step": 244 }, { "epoch": 0.20634475014037057, "grad_norm": 0.48297661542892456, "learning_rate": 4.124579124579125e-06, "loss": 0.508, "step": 245 }, { "epoch": 0.20718697361033128, "grad_norm": 0.5136886835098267, "learning_rate": 4.141414141414142e-06, "loss": 0.5281, "step": 246 }, { "epoch": 0.20802919708029197, "grad_norm": 0.45758986473083496, "learning_rate": 4.158249158249158e-06, "loss": 0.5419, "step": 247 }, { "epoch": 0.20887142055025268, "grad_norm": 0.43933457136154175, "learning_rate": 4.1750841750841755e-06, "loss": 0.5113, "step": 248 }, { "epoch": 0.20971364402021336, "grad_norm": 0.5147191286087036, "learning_rate": 4.191919191919192e-06, "loss": 0.5122, "step": 249 }, { "epoch": 0.21055586749017405, "grad_norm": 0.47171908617019653, "learning_rate": 4.208754208754209e-06, "loss": 0.4905, "step": 250 }, { "epoch": 0.21139809096013476, "grad_norm": 0.5241402387619019, "learning_rate": 4.225589225589226e-06, "loss": 0.5165, "step": 251 }, { "epoch": 0.21224031443009544, "grad_norm": 0.4837073087692261, "learning_rate": 4.242424242424243e-06, "loss": 0.512, "step": 252 }, { "epoch": 0.21308253790005616, "grad_norm": 0.4136230945587158, "learning_rate": 4.2592592592592596e-06, "loss": 0.5175, "step": 253 }, { "epoch": 0.21392476137001684, "grad_norm": 0.4824727475643158, "learning_rate": 4.276094276094276e-06, "loss": 0.5461, "step": 254 }, { "epoch": 0.21476698483997755, "grad_norm": 0.4895179569721222, "learning_rate": 4.292929292929293e-06, "loss": 0.5202, "step": 255 }, { "epoch": 0.21560920830993824, "grad_norm": 0.41264256834983826, "learning_rate": 4.30976430976431e-06, "loss": 0.526, "step": 256 }, { "epoch": 0.21645143177989892, "grad_norm": 0.42047274112701416, "learning_rate": 4.326599326599326e-06, "loss": 0.522, "step": 257 }, { "epoch": 0.21729365524985964, "grad_norm": 0.42209258675575256, "learning_rate": 4.343434343434344e-06, "loss": 0.5008, "step": 258 }, { "epoch": 0.21813587871982032, "grad_norm": 0.46702736616134644, "learning_rate": 4.360269360269361e-06, "loss": 0.5092, "step": 259 }, { "epoch": 0.21897810218978103, "grad_norm": 0.44454440474510193, "learning_rate": 4.377104377104377e-06, "loss": 0.5056, "step": 260 }, { "epoch": 0.21982032565974172, "grad_norm": 0.4477729797363281, "learning_rate": 4.393939393939394e-06, "loss": 0.4958, "step": 261 }, { "epoch": 0.2206625491297024, "grad_norm": 0.48324647545814514, "learning_rate": 4.410774410774411e-06, "loss": 0.5069, "step": 262 }, { "epoch": 0.2215047725996631, "grad_norm": 0.41691625118255615, "learning_rate": 4.427609427609428e-06, "loss": 0.5585, "step": 263 }, { "epoch": 0.2223469960696238, "grad_norm": 0.5005670785903931, "learning_rate": 4.444444444444444e-06, "loss": 0.5237, "step": 264 }, { "epoch": 0.2231892195395845, "grad_norm": 0.4408778250217438, "learning_rate": 4.4612794612794615e-06, "loss": 0.5232, "step": 265 }, { "epoch": 0.2240314430095452, "grad_norm": 0.4632425010204315, "learning_rate": 4.478114478114479e-06, "loss": 0.5245, "step": 266 }, { "epoch": 0.2248736664795059, "grad_norm": 0.5183952450752258, "learning_rate": 4.494949494949495e-06, "loss": 0.5253, "step": 267 }, { "epoch": 0.2257158899494666, "grad_norm": 0.46484944224357605, "learning_rate": 4.5117845117845126e-06, "loss": 0.5019, "step": 268 }, { "epoch": 0.22655811341942728, "grad_norm": 0.42933914065361023, "learning_rate": 4.528619528619529e-06, "loss": 0.4889, "step": 269 }, { "epoch": 0.227400336889388, "grad_norm": 0.4943735897541046, "learning_rate": 4.5454545454545455e-06, "loss": 0.5078, "step": 270 }, { "epoch": 0.22824256035934867, "grad_norm": 0.5464941263198853, "learning_rate": 4.562289562289563e-06, "loss": 0.5301, "step": 271 }, { "epoch": 0.22908478382930939, "grad_norm": 0.41851603984832764, "learning_rate": 4.57912457912458e-06, "loss": 0.527, "step": 272 }, { "epoch": 0.22992700729927007, "grad_norm": 0.6074217557907104, "learning_rate": 4.595959595959597e-06, "loss": 0.5299, "step": 273 }, { "epoch": 0.23076923076923078, "grad_norm": 0.42911866307258606, "learning_rate": 4.612794612794613e-06, "loss": 0.4764, "step": 274 }, { "epoch": 0.23161145423919147, "grad_norm": 0.46192818880081177, "learning_rate": 4.62962962962963e-06, "loss": 0.5146, "step": 275 }, { "epoch": 0.23245367770915215, "grad_norm": 0.4823738932609558, "learning_rate": 4.646464646464647e-06, "loss": 0.5261, "step": 276 }, { "epoch": 0.23329590117911286, "grad_norm": 0.45401594042778015, "learning_rate": 4.663299663299663e-06, "loss": 0.517, "step": 277 }, { "epoch": 0.23413812464907355, "grad_norm": 0.44986918568611145, "learning_rate": 4.680134680134681e-06, "loss": 0.5496, "step": 278 }, { "epoch": 0.23498034811903426, "grad_norm": 0.4612451493740082, "learning_rate": 4.696969696969698e-06, "loss": 0.5221, "step": 279 }, { "epoch": 0.23582257158899494, "grad_norm": 0.45069706439971924, "learning_rate": 4.7138047138047145e-06, "loss": 0.4945, "step": 280 }, { "epoch": 0.23666479505895563, "grad_norm": 0.40829938650131226, "learning_rate": 4.730639730639731e-06, "loss": 0.5265, "step": 281 }, { "epoch": 0.23750701852891634, "grad_norm": 0.4309101700782776, "learning_rate": 4.747474747474748e-06, "loss": 0.507, "step": 282 }, { "epoch": 0.23834924199887703, "grad_norm": 0.44291427731513977, "learning_rate": 4.764309764309765e-06, "loss": 0.5237, "step": 283 }, { "epoch": 0.23919146546883774, "grad_norm": 0.4691574275493622, "learning_rate": 4.781144781144781e-06, "loss": 0.4958, "step": 284 }, { "epoch": 0.24003368893879842, "grad_norm": 0.45377105474472046, "learning_rate": 4.7979797979797985e-06, "loss": 0.545, "step": 285 }, { "epoch": 0.24087591240875914, "grad_norm": 0.40558335185050964, "learning_rate": 4.814814814814815e-06, "loss": 0.486, "step": 286 }, { "epoch": 0.24171813587871982, "grad_norm": 0.44560232758522034, "learning_rate": 4.831649831649832e-06, "loss": 0.5233, "step": 287 }, { "epoch": 0.2425603593486805, "grad_norm": 0.42544713616371155, "learning_rate": 4.848484848484849e-06, "loss": 0.5221, "step": 288 }, { "epoch": 0.24340258281864122, "grad_norm": 0.4425361752510071, "learning_rate": 4.865319865319866e-06, "loss": 0.4839, "step": 289 }, { "epoch": 0.2442448062886019, "grad_norm": 0.4349897503852844, "learning_rate": 4.8821548821548826e-06, "loss": 0.4889, "step": 290 }, { "epoch": 0.2450870297585626, "grad_norm": 0.4375971555709839, "learning_rate": 4.898989898989899e-06, "loss": 0.5298, "step": 291 }, { "epoch": 0.2459292532285233, "grad_norm": 0.4123791456222534, "learning_rate": 4.915824915824916e-06, "loss": 0.4792, "step": 292 }, { "epoch": 0.246771476698484, "grad_norm": 0.4386853873729706, "learning_rate": 4.932659932659933e-06, "loss": 0.5122, "step": 293 }, { "epoch": 0.2476137001684447, "grad_norm": 0.5190219283103943, "learning_rate": 4.94949494949495e-06, "loss": 0.5214, "step": 294 }, { "epoch": 0.24845592363840538, "grad_norm": 0.3920060694217682, "learning_rate": 4.966329966329967e-06, "loss": 0.4873, "step": 295 }, { "epoch": 0.2492981471083661, "grad_norm": 0.4809557795524597, "learning_rate": 4.983164983164984e-06, "loss": 0.5102, "step": 296 }, { "epoch": 0.2501403705783268, "grad_norm": 0.4378650486469269, "learning_rate": 5e-06, "loss": 0.5032, "step": 297 }, { "epoch": 0.2509825940482875, "grad_norm": 0.43049660325050354, "learning_rate": 5.016835016835018e-06, "loss": 0.5063, "step": 298 }, { "epoch": 0.2518248175182482, "grad_norm": 0.4294068217277527, "learning_rate": 5.033670033670034e-06, "loss": 0.5185, "step": 299 }, { "epoch": 0.25266704098820886, "grad_norm": 0.4621216356754303, "learning_rate": 5.0505050505050515e-06, "loss": 0.5062, "step": 300 }, { "epoch": 0.25350926445816957, "grad_norm": 0.45335647463798523, "learning_rate": 5.067340067340067e-06, "loss": 0.5276, "step": 301 }, { "epoch": 0.2543514879281303, "grad_norm": 0.4388349950313568, "learning_rate": 5.0841750841750845e-06, "loss": 0.4761, "step": 302 }, { "epoch": 0.25519371139809094, "grad_norm": 0.3999445140361786, "learning_rate": 5.101010101010101e-06, "loss": 0.5235, "step": 303 }, { "epoch": 0.25603593486805165, "grad_norm": 0.4347052574157715, "learning_rate": 5.117845117845118e-06, "loss": 0.5144, "step": 304 }, { "epoch": 0.25687815833801236, "grad_norm": 0.472493052482605, "learning_rate": 5.1346801346801356e-06, "loss": 0.5249, "step": 305 }, { "epoch": 0.2577203818079731, "grad_norm": 0.4408933222293854, "learning_rate": 5.151515151515152e-06, "loss": 0.4788, "step": 306 }, { "epoch": 0.25856260527793373, "grad_norm": 0.45961734652519226, "learning_rate": 5.168350168350169e-06, "loss": 0.5177, "step": 307 }, { "epoch": 0.25940482874789444, "grad_norm": 0.41620251536369324, "learning_rate": 5.185185185185185e-06, "loss": 0.4961, "step": 308 }, { "epoch": 0.26024705221785516, "grad_norm": 0.45012426376342773, "learning_rate": 5.202020202020202e-06, "loss": 0.5, "step": 309 }, { "epoch": 0.2610892756878158, "grad_norm": 0.4195323586463928, "learning_rate": 5.218855218855219e-06, "loss": 0.4988, "step": 310 }, { "epoch": 0.2619314991577765, "grad_norm": 0.4384330213069916, "learning_rate": 5.235690235690236e-06, "loss": 0.506, "step": 311 }, { "epoch": 0.26277372262773724, "grad_norm": 0.42956018447875977, "learning_rate": 5.252525252525253e-06, "loss": 0.525, "step": 312 }, { "epoch": 0.2636159460976979, "grad_norm": 0.39414551854133606, "learning_rate": 5.26936026936027e-06, "loss": 0.5193, "step": 313 }, { "epoch": 0.2644581695676586, "grad_norm": 0.42922139167785645, "learning_rate": 5.286195286195287e-06, "loss": 0.5321, "step": 314 }, { "epoch": 0.2653003930376193, "grad_norm": 0.41558343172073364, "learning_rate": 5.303030303030303e-06, "loss": 0.489, "step": 315 }, { "epoch": 0.26614261650758003, "grad_norm": 0.42797690629959106, "learning_rate": 5.31986531986532e-06, "loss": 0.5271, "step": 316 }, { "epoch": 0.2669848399775407, "grad_norm": 0.4213487207889557, "learning_rate": 5.336700336700337e-06, "loss": 0.4864, "step": 317 }, { "epoch": 0.2678270634475014, "grad_norm": 0.4255501627922058, "learning_rate": 5.353535353535354e-06, "loss": 0.5107, "step": 318 }, { "epoch": 0.2686692869174621, "grad_norm": 0.4558629095554352, "learning_rate": 5.370370370370371e-06, "loss": 0.53, "step": 319 }, { "epoch": 0.26951151038742277, "grad_norm": 0.4140864908695221, "learning_rate": 5.387205387205388e-06, "loss": 0.4917, "step": 320 }, { "epoch": 0.2703537338573835, "grad_norm": 0.4648992121219635, "learning_rate": 5.404040404040405e-06, "loss": 0.5118, "step": 321 }, { "epoch": 0.2711959573273442, "grad_norm": 0.41138648986816406, "learning_rate": 5.420875420875421e-06, "loss": 0.5078, "step": 322 }, { "epoch": 0.2720381807973049, "grad_norm": 0.4770265817642212, "learning_rate": 5.437710437710438e-06, "loss": 0.4807, "step": 323 }, { "epoch": 0.27288040426726556, "grad_norm": 0.4645610451698303, "learning_rate": 5.4545454545454545e-06, "loss": 0.4919, "step": 324 }, { "epoch": 0.2737226277372263, "grad_norm": 0.4085693359375, "learning_rate": 5.471380471380472e-06, "loss": 0.5009, "step": 325 }, { "epoch": 0.274564851207187, "grad_norm": 0.4103808104991913, "learning_rate": 5.488215488215489e-06, "loss": 0.4968, "step": 326 }, { "epoch": 0.27540707467714765, "grad_norm": 0.42407605051994324, "learning_rate": 5.5050505050505056e-06, "loss": 0.5054, "step": 327 }, { "epoch": 0.27624929814710836, "grad_norm": 0.42540448904037476, "learning_rate": 5.521885521885523e-06, "loss": 0.4947, "step": 328 }, { "epoch": 0.27709152161706907, "grad_norm": 0.4279448390007019, "learning_rate": 5.538720538720539e-06, "loss": 0.4854, "step": 329 }, { "epoch": 0.2779337450870298, "grad_norm": 0.4387977719306946, "learning_rate": 5.555555555555557e-06, "loss": 0.5155, "step": 330 }, { "epoch": 0.27877596855699044, "grad_norm": 0.402165025472641, "learning_rate": 5.572390572390572e-06, "loss": 0.518, "step": 331 }, { "epoch": 0.27961819202695115, "grad_norm": 0.46611127257347107, "learning_rate": 5.58922558922559e-06, "loss": 0.4783, "step": 332 }, { "epoch": 0.28046041549691186, "grad_norm": 0.44151929020881653, "learning_rate": 5.606060606060606e-06, "loss": 0.5373, "step": 333 }, { "epoch": 0.2813026389668725, "grad_norm": 0.5066059231758118, "learning_rate": 5.622895622895623e-06, "loss": 0.5195, "step": 334 }, { "epoch": 0.28214486243683323, "grad_norm": 0.560751736164093, "learning_rate": 5.639730639730641e-06, "loss": 0.5103, "step": 335 }, { "epoch": 0.28298708590679394, "grad_norm": 0.4968298375606537, "learning_rate": 5.656565656565657e-06, "loss": 0.5438, "step": 336 }, { "epoch": 0.28382930937675466, "grad_norm": 0.4976928234100342, "learning_rate": 5.6734006734006745e-06, "loss": 0.5137, "step": 337 }, { "epoch": 0.2846715328467153, "grad_norm": 0.4725048542022705, "learning_rate": 5.69023569023569e-06, "loss": 0.4757, "step": 338 }, { "epoch": 0.285513756316676, "grad_norm": 0.4671187102794647, "learning_rate": 5.7070707070707075e-06, "loss": 0.4904, "step": 339 }, { "epoch": 0.28635597978663674, "grad_norm": 0.4991739094257355, "learning_rate": 5.723905723905724e-06, "loss": 0.5237, "step": 340 }, { "epoch": 0.2871982032565974, "grad_norm": 0.4460315704345703, "learning_rate": 5.740740740740741e-06, "loss": 0.5011, "step": 341 }, { "epoch": 0.2880404267265581, "grad_norm": 0.47052547335624695, "learning_rate": 5.7575757575757586e-06, "loss": 0.4921, "step": 342 }, { "epoch": 0.2888826501965188, "grad_norm": 0.5069209933280945, "learning_rate": 5.774410774410775e-06, "loss": 0.5268, "step": 343 }, { "epoch": 0.28972487366647953, "grad_norm": 0.5080049633979797, "learning_rate": 5.791245791245792e-06, "loss": 0.5163, "step": 344 }, { "epoch": 0.2905670971364402, "grad_norm": 0.45466458797454834, "learning_rate": 5.808080808080808e-06, "loss": 0.5118, "step": 345 }, { "epoch": 0.2914093206064009, "grad_norm": 0.47711947560310364, "learning_rate": 5.824915824915825e-06, "loss": 0.4713, "step": 346 }, { "epoch": 0.2922515440763616, "grad_norm": 0.5068085193634033, "learning_rate": 5.841750841750842e-06, "loss": 0.4785, "step": 347 }, { "epoch": 0.29309376754632227, "grad_norm": 0.5156922936439514, "learning_rate": 5.858585858585859e-06, "loss": 0.4846, "step": 348 }, { "epoch": 0.293935991016283, "grad_norm": 0.5105670690536499, "learning_rate": 5.875420875420876e-06, "loss": 0.519, "step": 349 }, { "epoch": 0.2947782144862437, "grad_norm": 0.5012142062187195, "learning_rate": 5.892255892255893e-06, "loss": 0.4978, "step": 350 }, { "epoch": 0.2956204379562044, "grad_norm": 0.49114683270454407, "learning_rate": 5.90909090909091e-06, "loss": 0.4735, "step": 351 }, { "epoch": 0.29646266142616506, "grad_norm": 0.5335782766342163, "learning_rate": 5.925925925925926e-06, "loss": 0.4761, "step": 352 }, { "epoch": 0.2973048848961258, "grad_norm": 0.5186587572097778, "learning_rate": 5.942760942760943e-06, "loss": 0.5271, "step": 353 }, { "epoch": 0.2981471083660865, "grad_norm": 0.4791840612888336, "learning_rate": 5.95959595959596e-06, "loss": 0.5169, "step": 354 }, { "epoch": 0.29898933183604715, "grad_norm": 0.49082159996032715, "learning_rate": 5.976430976430977e-06, "loss": 0.4852, "step": 355 }, { "epoch": 0.29983155530600786, "grad_norm": 0.43412622809410095, "learning_rate": 5.993265993265994e-06, "loss": 0.4994, "step": 356 }, { "epoch": 0.30067377877596857, "grad_norm": 0.4774993062019348, "learning_rate": 6.010101010101011e-06, "loss": 0.5141, "step": 357 }, { "epoch": 0.3015160022459292, "grad_norm": 0.43106019496917725, "learning_rate": 6.026936026936028e-06, "loss": 0.5026, "step": 358 }, { "epoch": 0.30235822571588994, "grad_norm": 0.4472343325614929, "learning_rate": 6.043771043771044e-06, "loss": 0.5211, "step": 359 }, { "epoch": 0.30320044918585065, "grad_norm": 0.5119704008102417, "learning_rate": 6.060606060606061e-06, "loss": 0.4676, "step": 360 }, { "epoch": 0.30404267265581136, "grad_norm": 0.5184417366981506, "learning_rate": 6.0774410774410774e-06, "loss": 0.4909, "step": 361 }, { "epoch": 0.304884896125772, "grad_norm": 0.4644922912120819, "learning_rate": 6.094276094276095e-06, "loss": 0.5143, "step": 362 }, { "epoch": 0.30572711959573273, "grad_norm": 0.4254929721355438, "learning_rate": 6.111111111111112e-06, "loss": 0.4968, "step": 363 }, { "epoch": 0.30656934306569344, "grad_norm": 0.512502133846283, "learning_rate": 6.1279461279461286e-06, "loss": 0.5077, "step": 364 }, { "epoch": 0.3074115665356541, "grad_norm": 0.39353257417678833, "learning_rate": 6.144781144781146e-06, "loss": 0.5082, "step": 365 }, { "epoch": 0.3082537900056148, "grad_norm": 0.4202759861946106, "learning_rate": 6.1616161616161615e-06, "loss": 0.5032, "step": 366 }, { "epoch": 0.3090960134755755, "grad_norm": 0.4790390431880951, "learning_rate": 6.178451178451179e-06, "loss": 0.4924, "step": 367 }, { "epoch": 0.30993823694553624, "grad_norm": 0.4364878535270691, "learning_rate": 6.195286195286195e-06, "loss": 0.4927, "step": 368 }, { "epoch": 0.3107804604154969, "grad_norm": 0.45293012261390686, "learning_rate": 6.212121212121213e-06, "loss": 0.4943, "step": 369 }, { "epoch": 0.3116226838854576, "grad_norm": 0.4330050051212311, "learning_rate": 6.22895622895623e-06, "loss": 0.4979, "step": 370 }, { "epoch": 0.3124649073554183, "grad_norm": 0.4348853826522827, "learning_rate": 6.245791245791246e-06, "loss": 0.4806, "step": 371 }, { "epoch": 0.313307130825379, "grad_norm": 0.4625408351421356, "learning_rate": 6.262626262626264e-06, "loss": 0.5172, "step": 372 }, { "epoch": 0.3141493542953397, "grad_norm": 0.519190788269043, "learning_rate": 6.279461279461279e-06, "loss": 0.4918, "step": 373 }, { "epoch": 0.3149915777653004, "grad_norm": 0.4609246551990509, "learning_rate": 6.296296296296297e-06, "loss": 0.4991, "step": 374 }, { "epoch": 0.3158338012352611, "grad_norm": 0.4887892007827759, "learning_rate": 6.313131313131313e-06, "loss": 0.4535, "step": 375 }, { "epoch": 0.31667602470522177, "grad_norm": 0.5209500193595886, "learning_rate": 6.3299663299663304e-06, "loss": 0.5151, "step": 376 }, { "epoch": 0.3175182481751825, "grad_norm": 0.46867451071739197, "learning_rate": 6.346801346801348e-06, "loss": 0.4966, "step": 377 }, { "epoch": 0.3183604716451432, "grad_norm": 0.44218122959136963, "learning_rate": 6.363636363636364e-06, "loss": 0.505, "step": 378 }, { "epoch": 0.31920269511510385, "grad_norm": 0.684224545955658, "learning_rate": 6.3804713804713816e-06, "loss": 0.5018, "step": 379 }, { "epoch": 0.32004491858506456, "grad_norm": 0.45292574167251587, "learning_rate": 6.397306397306397e-06, "loss": 0.5038, "step": 380 }, { "epoch": 0.3208871420550253, "grad_norm": 0.503074586391449, "learning_rate": 6.4141414141414145e-06, "loss": 0.5216, "step": 381 }, { "epoch": 0.321729365524986, "grad_norm": 0.6159339547157288, "learning_rate": 6.430976430976431e-06, "loss": 0.5131, "step": 382 }, { "epoch": 0.32257158899494665, "grad_norm": 0.4706195592880249, "learning_rate": 6.447811447811448e-06, "loss": 0.5037, "step": 383 }, { "epoch": 0.32341381246490736, "grad_norm": 0.5323322415351868, "learning_rate": 6.464646464646466e-06, "loss": 0.5318, "step": 384 }, { "epoch": 0.32425603593486807, "grad_norm": 0.5616297125816345, "learning_rate": 6.481481481481482e-06, "loss": 0.4701, "step": 385 }, { "epoch": 0.3250982594048287, "grad_norm": 0.47448647022247314, "learning_rate": 6.498316498316499e-06, "loss": 0.5058, "step": 386 }, { "epoch": 0.32594048287478944, "grad_norm": 0.46082887053489685, "learning_rate": 6.515151515151516e-06, "loss": 0.5053, "step": 387 }, { "epoch": 0.32678270634475015, "grad_norm": 0.5207511782646179, "learning_rate": 6.531986531986533e-06, "loss": 0.5262, "step": 388 }, { "epoch": 0.32762492981471086, "grad_norm": 0.5015504956245422, "learning_rate": 6.548821548821549e-06, "loss": 0.4894, "step": 389 }, { "epoch": 0.3284671532846715, "grad_norm": 0.4676060974597931, "learning_rate": 6.565656565656566e-06, "loss": 0.4591, "step": 390 }, { "epoch": 0.32930937675463223, "grad_norm": 0.5197309255599976, "learning_rate": 6.582491582491583e-06, "loss": 0.4916, "step": 391 }, { "epoch": 0.33015160022459294, "grad_norm": 0.5237851738929749, "learning_rate": 6.5993265993266e-06, "loss": 0.5328, "step": 392 }, { "epoch": 0.3309938236945536, "grad_norm": 0.5311485528945923, "learning_rate": 6.616161616161617e-06, "loss": 0.5116, "step": 393 }, { "epoch": 0.3318360471645143, "grad_norm": 0.5559781193733215, "learning_rate": 6.632996632996634e-06, "loss": 0.4977, "step": 394 }, { "epoch": 0.332678270634475, "grad_norm": 0.6253042221069336, "learning_rate": 6.649831649831651e-06, "loss": 0.5018, "step": 395 }, { "epoch": 0.3335204941044357, "grad_norm": 0.503348171710968, "learning_rate": 6.666666666666667e-06, "loss": 0.4977, "step": 396 }, { "epoch": 0.3343627175743964, "grad_norm": 0.685606837272644, "learning_rate": 6.683501683501684e-06, "loss": 0.494, "step": 397 }, { "epoch": 0.3352049410443571, "grad_norm": 0.4982592463493347, "learning_rate": 6.7003367003367004e-06, "loss": 0.5247, "step": 398 }, { "epoch": 0.3360471645143178, "grad_norm": 0.6131967306137085, "learning_rate": 6.717171717171718e-06, "loss": 0.5204, "step": 399 }, { "epoch": 0.3368893879842785, "grad_norm": 0.5212045907974243, "learning_rate": 6.734006734006735e-06, "loss": 0.4977, "step": 400 }, { "epoch": 0.3377316114542392, "grad_norm": 0.4675288796424866, "learning_rate": 6.7508417508417515e-06, "loss": 0.453, "step": 401 }, { "epoch": 0.3385738349241999, "grad_norm": 0.5343837738037109, "learning_rate": 6.767676767676769e-06, "loss": 0.524, "step": 402 }, { "epoch": 0.33941605839416056, "grad_norm": 0.5351298451423645, "learning_rate": 6.7845117845117845e-06, "loss": 0.4964, "step": 403 }, { "epoch": 0.34025828186412127, "grad_norm": 0.46285006403923035, "learning_rate": 6.801346801346802e-06, "loss": 0.4626, "step": 404 }, { "epoch": 0.341100505334082, "grad_norm": 0.5020408034324646, "learning_rate": 6.818181818181818e-06, "loss": 0.5055, "step": 405 }, { "epoch": 0.3419427288040427, "grad_norm": 0.4540770947933197, "learning_rate": 6.835016835016836e-06, "loss": 0.5158, "step": 406 }, { "epoch": 0.34278495227400335, "grad_norm": 0.45801612734794617, "learning_rate": 6.851851851851853e-06, "loss": 0.5093, "step": 407 }, { "epoch": 0.34362717574396406, "grad_norm": 0.4555765986442566, "learning_rate": 6.868686868686869e-06, "loss": 0.4903, "step": 408 }, { "epoch": 0.3444693992139248, "grad_norm": 0.4921708106994629, "learning_rate": 6.885521885521887e-06, "loss": 0.5114, "step": 409 }, { "epoch": 0.34531162268388543, "grad_norm": 0.5010226964950562, "learning_rate": 6.902356902356902e-06, "loss": 0.4995, "step": 410 }, { "epoch": 0.34615384615384615, "grad_norm": 0.45662209391593933, "learning_rate": 6.91919191919192e-06, "loss": 0.511, "step": 411 }, { "epoch": 0.34699606962380686, "grad_norm": 0.47807177901268005, "learning_rate": 6.936026936026936e-06, "loss": 0.5033, "step": 412 }, { "epoch": 0.34783829309376757, "grad_norm": 0.47333812713623047, "learning_rate": 6.9528619528619534e-06, "loss": 0.4874, "step": 413 }, { "epoch": 0.3486805165637282, "grad_norm": 0.5075779557228088, "learning_rate": 6.969696969696971e-06, "loss": 0.4927, "step": 414 }, { "epoch": 0.34952274003368894, "grad_norm": 0.4549364745616913, "learning_rate": 6.986531986531987e-06, "loss": 0.4624, "step": 415 }, { "epoch": 0.35036496350364965, "grad_norm": 0.5265827178955078, "learning_rate": 7.0033670033670045e-06, "loss": 0.4996, "step": 416 }, { "epoch": 0.3512071869736103, "grad_norm": 0.42467236518859863, "learning_rate": 7.02020202020202e-06, "loss": 0.4803, "step": 417 }, { "epoch": 0.352049410443571, "grad_norm": 0.46576905250549316, "learning_rate": 7.0370370370370375e-06, "loss": 0.4991, "step": 418 }, { "epoch": 0.35289163391353173, "grad_norm": 0.5132504105567932, "learning_rate": 7.053872053872054e-06, "loss": 0.5115, "step": 419 }, { "epoch": 0.35373385738349244, "grad_norm": 0.4743048846721649, "learning_rate": 7.070707070707071e-06, "loss": 0.5136, "step": 420 }, { "epoch": 0.3545760808534531, "grad_norm": 0.5139487385749817, "learning_rate": 7.087542087542089e-06, "loss": 0.5012, "step": 421 }, { "epoch": 0.3554183043234138, "grad_norm": 0.4473351240158081, "learning_rate": 7.104377104377105e-06, "loss": 0.4737, "step": 422 }, { "epoch": 0.3562605277933745, "grad_norm": 0.5622629523277283, "learning_rate": 7.121212121212122e-06, "loss": 0.5159, "step": 423 }, { "epoch": 0.3571027512633352, "grad_norm": 0.4622264802455902, "learning_rate": 7.138047138047138e-06, "loss": 0.5251, "step": 424 }, { "epoch": 0.3579449747332959, "grad_norm": 0.5244413614273071, "learning_rate": 7.154882154882155e-06, "loss": 0.4813, "step": 425 }, { "epoch": 0.3587871982032566, "grad_norm": 0.5199175477027893, "learning_rate": 7.171717171717172e-06, "loss": 0.5009, "step": 426 }, { "epoch": 0.3596294216732173, "grad_norm": 0.4910876750946045, "learning_rate": 7.188552188552189e-06, "loss": 0.4913, "step": 427 }, { "epoch": 0.360471645143178, "grad_norm": 0.4522247910499573, "learning_rate": 7.2053872053872064e-06, "loss": 0.5151, "step": 428 }, { "epoch": 0.3613138686131387, "grad_norm": 0.5706003308296204, "learning_rate": 7.222222222222223e-06, "loss": 0.5047, "step": 429 }, { "epoch": 0.3621560920830994, "grad_norm": 0.45375022292137146, "learning_rate": 7.23905723905724e-06, "loss": 0.4861, "step": 430 }, { "epoch": 0.36299831555306006, "grad_norm": 0.5022145509719849, "learning_rate": 7.255892255892256e-06, "loss": 0.4991, "step": 431 }, { "epoch": 0.36384053902302077, "grad_norm": 0.48541751503944397, "learning_rate": 7.272727272727273e-06, "loss": 0.4849, "step": 432 }, { "epoch": 0.3646827624929815, "grad_norm": 0.5896349549293518, "learning_rate": 7.28956228956229e-06, "loss": 0.4987, "step": 433 }, { "epoch": 0.3655249859629422, "grad_norm": 0.5624788403511047, "learning_rate": 7.306397306397307e-06, "loss": 0.51, "step": 434 }, { "epoch": 0.36636720943290285, "grad_norm": 0.5901549458503723, "learning_rate": 7.323232323232324e-06, "loss": 0.4953, "step": 435 }, { "epoch": 0.36720943290286356, "grad_norm": 0.4812483787536621, "learning_rate": 7.340067340067341e-06, "loss": 0.5043, "step": 436 }, { "epoch": 0.3680516563728243, "grad_norm": 0.6220892667770386, "learning_rate": 7.356902356902358e-06, "loss": 0.5173, "step": 437 }, { "epoch": 0.36889387984278493, "grad_norm": 0.5254179835319519, "learning_rate": 7.373737373737374e-06, "loss": 0.4952, "step": 438 }, { "epoch": 0.36973610331274565, "grad_norm": 0.4904094338417053, "learning_rate": 7.390572390572391e-06, "loss": 0.4658, "step": 439 }, { "epoch": 0.37057832678270636, "grad_norm": 0.504805326461792, "learning_rate": 7.4074074074074075e-06, "loss": 0.5012, "step": 440 }, { "epoch": 0.371420550252667, "grad_norm": 0.533440887928009, "learning_rate": 7.424242424242425e-06, "loss": 0.5033, "step": 441 }, { "epoch": 0.3722627737226277, "grad_norm": 0.6042219996452332, "learning_rate": 7.441077441077442e-06, "loss": 0.5108, "step": 442 }, { "epoch": 0.37310499719258844, "grad_norm": 0.5207930207252502, "learning_rate": 7.457912457912459e-06, "loss": 0.5062, "step": 443 }, { "epoch": 0.37394722066254915, "grad_norm": 0.5438379645347595, "learning_rate": 7.474747474747476e-06, "loss": 0.5016, "step": 444 }, { "epoch": 0.3747894441325098, "grad_norm": 0.5808053612709045, "learning_rate": 7.491582491582492e-06, "loss": 0.502, "step": 445 }, { "epoch": 0.3756316676024705, "grad_norm": 0.4831872284412384, "learning_rate": 7.508417508417509e-06, "loss": 0.4872, "step": 446 }, { "epoch": 0.37647389107243123, "grad_norm": 0.5635850429534912, "learning_rate": 7.525252525252525e-06, "loss": 0.4841, "step": 447 }, { "epoch": 0.3773161145423919, "grad_norm": 0.5132287740707397, "learning_rate": 7.542087542087543e-06, "loss": 0.4973, "step": 448 }, { "epoch": 0.3781583380123526, "grad_norm": 0.5520679950714111, "learning_rate": 7.558922558922559e-06, "loss": 0.5122, "step": 449 }, { "epoch": 0.3790005614823133, "grad_norm": 0.6291957497596741, "learning_rate": 7.5757575757575764e-06, "loss": 0.5187, "step": 450 }, { "epoch": 0.379842784952274, "grad_norm": 0.520693302154541, "learning_rate": 7.592592592592594e-06, "loss": 0.4773, "step": 451 }, { "epoch": 0.3806850084222347, "grad_norm": 0.4528855085372925, "learning_rate": 7.60942760942761e-06, "loss": 0.4811, "step": 452 }, { "epoch": 0.3815272318921954, "grad_norm": 0.5607033371925354, "learning_rate": 7.6262626262626275e-06, "loss": 0.4983, "step": 453 }, { "epoch": 0.3823694553621561, "grad_norm": 0.4888259172439575, "learning_rate": 7.643097643097644e-06, "loss": 0.4934, "step": 454 }, { "epoch": 0.38321167883211676, "grad_norm": 0.5693116188049316, "learning_rate": 7.659932659932661e-06, "loss": 0.4852, "step": 455 }, { "epoch": 0.3840539023020775, "grad_norm": 0.49751928448677063, "learning_rate": 7.676767676767677e-06, "loss": 0.4657, "step": 456 }, { "epoch": 0.3848961257720382, "grad_norm": 0.522938072681427, "learning_rate": 7.693602693602694e-06, "loss": 0.4644, "step": 457 }, { "epoch": 0.3857383492419989, "grad_norm": 0.5266737937927246, "learning_rate": 7.710437710437712e-06, "loss": 0.5066, "step": 458 }, { "epoch": 0.38658057271195956, "grad_norm": 0.47939759492874146, "learning_rate": 7.727272727272727e-06, "loss": 0.5005, "step": 459 }, { "epoch": 0.38742279618192027, "grad_norm": 0.4762495160102844, "learning_rate": 7.744107744107745e-06, "loss": 0.5104, "step": 460 }, { "epoch": 0.388265019651881, "grad_norm": 0.4385337233543396, "learning_rate": 7.760942760942762e-06, "loss": 0.4738, "step": 461 }, { "epoch": 0.38910724312184164, "grad_norm": 0.4623595178127289, "learning_rate": 7.77777777777778e-06, "loss": 0.4508, "step": 462 }, { "epoch": 0.38994946659180235, "grad_norm": 0.45611491799354553, "learning_rate": 7.794612794612795e-06, "loss": 0.4945, "step": 463 }, { "epoch": 0.39079169006176306, "grad_norm": 0.4578178822994232, "learning_rate": 7.811447811447812e-06, "loss": 0.4843, "step": 464 }, { "epoch": 0.3916339135317238, "grad_norm": 0.44649121165275574, "learning_rate": 7.82828282828283e-06, "loss": 0.4927, "step": 465 }, { "epoch": 0.39247613700168443, "grad_norm": 0.5449602603912354, "learning_rate": 7.845117845117845e-06, "loss": 0.4734, "step": 466 }, { "epoch": 0.39331836047164515, "grad_norm": 0.4554929733276367, "learning_rate": 7.861952861952862e-06, "loss": 0.4999, "step": 467 }, { "epoch": 0.39416058394160586, "grad_norm": 0.4321384131908417, "learning_rate": 7.87878787878788e-06, "loss": 0.4924, "step": 468 }, { "epoch": 0.3950028074115665, "grad_norm": 0.47380703687667847, "learning_rate": 7.895622895622897e-06, "loss": 0.4925, "step": 469 }, { "epoch": 0.3958450308815272, "grad_norm": 0.4294160306453705, "learning_rate": 7.912457912457913e-06, "loss": 0.4782, "step": 470 }, { "epoch": 0.39668725435148794, "grad_norm": 0.4454585015773773, "learning_rate": 7.92929292929293e-06, "loss": 0.5038, "step": 471 }, { "epoch": 0.39752947782144865, "grad_norm": 0.41354677081108093, "learning_rate": 7.946127946127947e-06, "loss": 0.4918, "step": 472 }, { "epoch": 0.3983717012914093, "grad_norm": 0.4190719425678253, "learning_rate": 7.962962962962963e-06, "loss": 0.5079, "step": 473 }, { "epoch": 0.39921392476137, "grad_norm": 0.43857941031455994, "learning_rate": 7.97979797979798e-06, "loss": 0.4762, "step": 474 }, { "epoch": 0.40005614823133073, "grad_norm": 0.4401494562625885, "learning_rate": 7.996632996632998e-06, "loss": 0.4998, "step": 475 }, { "epoch": 0.4008983717012914, "grad_norm": 0.4199152886867523, "learning_rate": 8.013468013468015e-06, "loss": 0.5144, "step": 476 }, { "epoch": 0.4017405951712521, "grad_norm": 0.42967289686203003, "learning_rate": 8.03030303030303e-06, "loss": 0.4923, "step": 477 }, { "epoch": 0.4025828186412128, "grad_norm": 0.4239676296710968, "learning_rate": 8.047138047138048e-06, "loss": 0.4572, "step": 478 }, { "epoch": 0.40342504211117347, "grad_norm": 0.4643702805042267, "learning_rate": 8.063973063973065e-06, "loss": 0.4798, "step": 479 }, { "epoch": 0.4042672655811342, "grad_norm": 0.42832478880882263, "learning_rate": 8.08080808080808e-06, "loss": 0.5061, "step": 480 }, { "epoch": 0.4051094890510949, "grad_norm": 0.4320860207080841, "learning_rate": 8.097643097643098e-06, "loss": 0.459, "step": 481 }, { "epoch": 0.4059517125210556, "grad_norm": 0.4440138041973114, "learning_rate": 8.114478114478115e-06, "loss": 0.4995, "step": 482 }, { "epoch": 0.40679393599101626, "grad_norm": 0.491728812456131, "learning_rate": 8.131313131313133e-06, "loss": 0.4977, "step": 483 }, { "epoch": 0.407636159460977, "grad_norm": 0.43908974528312683, "learning_rate": 8.148148148148148e-06, "loss": 0.5076, "step": 484 }, { "epoch": 0.4084783829309377, "grad_norm": 0.5585023760795593, "learning_rate": 8.164983164983166e-06, "loss": 0.5085, "step": 485 }, { "epoch": 0.40932060640089835, "grad_norm": 0.4755890965461731, "learning_rate": 8.181818181818183e-06, "loss": 0.4808, "step": 486 }, { "epoch": 0.41016282987085906, "grad_norm": 0.46663787961006165, "learning_rate": 8.198653198653199e-06, "loss": 0.4914, "step": 487 }, { "epoch": 0.41100505334081977, "grad_norm": 0.5151968002319336, "learning_rate": 8.215488215488216e-06, "loss": 0.4939, "step": 488 }, { "epoch": 0.4118472768107805, "grad_norm": 0.4474145770072937, "learning_rate": 8.232323232323233e-06, "loss": 0.4679, "step": 489 }, { "epoch": 0.41268950028074114, "grad_norm": 0.5198603272438049, "learning_rate": 8.24915824915825e-06, "loss": 0.4804, "step": 490 }, { "epoch": 0.41353172375070185, "grad_norm": 0.5067389011383057, "learning_rate": 8.265993265993266e-06, "loss": 0.5038, "step": 491 }, { "epoch": 0.41437394722066256, "grad_norm": 0.4700777232646942, "learning_rate": 8.282828282828283e-06, "loss": 0.4734, "step": 492 }, { "epoch": 0.4152161706906232, "grad_norm": 0.5549312829971313, "learning_rate": 8.2996632996633e-06, "loss": 0.4857, "step": 493 }, { "epoch": 0.41605839416058393, "grad_norm": 0.4999919533729553, "learning_rate": 8.316498316498316e-06, "loss": 0.4608, "step": 494 }, { "epoch": 0.41690061763054465, "grad_norm": 0.4700174033641815, "learning_rate": 8.333333333333334e-06, "loss": 0.485, "step": 495 }, { "epoch": 0.41774284110050536, "grad_norm": 0.7112223505973816, "learning_rate": 8.350168350168351e-06, "loss": 0.5198, "step": 496 }, { "epoch": 0.418585064570466, "grad_norm": 0.5496985912322998, "learning_rate": 8.367003367003368e-06, "loss": 0.4993, "step": 497 }, { "epoch": 0.4194272880404267, "grad_norm": 0.6055114269256592, "learning_rate": 8.383838383838384e-06, "loss": 0.48, "step": 498 }, { "epoch": 0.42026951151038744, "grad_norm": 0.6956057548522949, "learning_rate": 8.400673400673401e-06, "loss": 0.511, "step": 499 }, { "epoch": 0.4211117349803481, "grad_norm": 0.49425026774406433, "learning_rate": 8.417508417508419e-06, "loss": 0.492, "step": 500 }, { "epoch": 0.4219539584503088, "grad_norm": 0.7275473475456238, "learning_rate": 8.434343434343434e-06, "loss": 0.4995, "step": 501 }, { "epoch": 0.4227961819202695, "grad_norm": 0.48956814408302307, "learning_rate": 8.451178451178452e-06, "loss": 0.4799, "step": 502 }, { "epoch": 0.42363840539023023, "grad_norm": 0.6456526517868042, "learning_rate": 8.468013468013469e-06, "loss": 0.4984, "step": 503 }, { "epoch": 0.4244806288601909, "grad_norm": 0.661300539970398, "learning_rate": 8.484848484848486e-06, "loss": 0.4838, "step": 504 }, { "epoch": 0.4253228523301516, "grad_norm": 0.58360755443573, "learning_rate": 8.501683501683502e-06, "loss": 0.4968, "step": 505 }, { "epoch": 0.4261650758001123, "grad_norm": 0.5267910957336426, "learning_rate": 8.518518518518519e-06, "loss": 0.4754, "step": 506 }, { "epoch": 0.42700729927007297, "grad_norm": 0.6866562962532043, "learning_rate": 8.535353535353535e-06, "loss": 0.5095, "step": 507 }, { "epoch": 0.4278495227400337, "grad_norm": 0.5155329704284668, "learning_rate": 8.552188552188552e-06, "loss": 0.4818, "step": 508 }, { "epoch": 0.4286917462099944, "grad_norm": 0.575142502784729, "learning_rate": 8.56902356902357e-06, "loss": 0.4747, "step": 509 }, { "epoch": 0.4295339696799551, "grad_norm": 0.6305699348449707, "learning_rate": 8.585858585858587e-06, "loss": 0.4732, "step": 510 }, { "epoch": 0.43037619314991576, "grad_norm": 0.424802303314209, "learning_rate": 8.602693602693604e-06, "loss": 0.4727, "step": 511 }, { "epoch": 0.4312184166198765, "grad_norm": 0.5849542617797852, "learning_rate": 8.61952861952862e-06, "loss": 0.4709, "step": 512 }, { "epoch": 0.4320606400898372, "grad_norm": 0.555602490901947, "learning_rate": 8.636363636363637e-06, "loss": 0.4978, "step": 513 }, { "epoch": 0.43290286355979785, "grad_norm": 0.4678948223590851, "learning_rate": 8.653198653198653e-06, "loss": 0.4897, "step": 514 }, { "epoch": 0.43374508702975856, "grad_norm": 0.5773173570632935, "learning_rate": 8.67003367003367e-06, "loss": 0.5068, "step": 515 }, { "epoch": 0.43458731049971927, "grad_norm": 0.4907878637313843, "learning_rate": 8.686868686868687e-06, "loss": 0.4839, "step": 516 }, { "epoch": 0.43542953396968, "grad_norm": 0.4886923134326935, "learning_rate": 8.703703703703705e-06, "loss": 0.4929, "step": 517 }, { "epoch": 0.43627175743964064, "grad_norm": 0.5006362199783325, "learning_rate": 8.720538720538722e-06, "loss": 0.4835, "step": 518 }, { "epoch": 0.43711398090960135, "grad_norm": 0.48082542419433594, "learning_rate": 8.737373737373738e-06, "loss": 0.4904, "step": 519 }, { "epoch": 0.43795620437956206, "grad_norm": 0.5444417595863342, "learning_rate": 8.754208754208755e-06, "loss": 0.4699, "step": 520 }, { "epoch": 0.4387984278495227, "grad_norm": 0.5369892120361328, "learning_rate": 8.77104377104377e-06, "loss": 0.5069, "step": 521 }, { "epoch": 0.43964065131948343, "grad_norm": 0.49131259322166443, "learning_rate": 8.787878787878788e-06, "loss": 0.503, "step": 522 }, { "epoch": 0.44048287478944415, "grad_norm": 0.4834917187690735, "learning_rate": 8.804713804713805e-06, "loss": 0.4777, "step": 523 }, { "epoch": 0.4413250982594048, "grad_norm": 0.5154290199279785, "learning_rate": 8.821548821548822e-06, "loss": 0.4722, "step": 524 }, { "epoch": 0.4421673217293655, "grad_norm": 0.4485786557197571, "learning_rate": 8.83838383838384e-06, "loss": 0.4684, "step": 525 }, { "epoch": 0.4430095451993262, "grad_norm": 0.5098860859870911, "learning_rate": 8.855218855218855e-06, "loss": 0.4598, "step": 526 }, { "epoch": 0.44385176866928694, "grad_norm": 0.47183758020401, "learning_rate": 8.872053872053873e-06, "loss": 0.4763, "step": 527 }, { "epoch": 0.4446939921392476, "grad_norm": 0.5132126808166504, "learning_rate": 8.888888888888888e-06, "loss": 0.4811, "step": 528 }, { "epoch": 0.4455362156092083, "grad_norm": 0.4398671090602875, "learning_rate": 8.905723905723906e-06, "loss": 0.4907, "step": 529 }, { "epoch": 0.446378439079169, "grad_norm": 0.5522358417510986, "learning_rate": 8.922558922558923e-06, "loss": 0.4605, "step": 530 }, { "epoch": 0.4472206625491297, "grad_norm": 0.48295727372169495, "learning_rate": 8.93939393939394e-06, "loss": 0.4815, "step": 531 }, { "epoch": 0.4480628860190904, "grad_norm": 0.5379626750946045, "learning_rate": 8.956228956228958e-06, "loss": 0.4812, "step": 532 }, { "epoch": 0.4489051094890511, "grad_norm": 0.4464685618877411, "learning_rate": 8.973063973063973e-06, "loss": 0.4781, "step": 533 }, { "epoch": 0.4497473329590118, "grad_norm": 0.4661094546318054, "learning_rate": 8.98989898989899e-06, "loss": 0.4812, "step": 534 }, { "epoch": 0.45058955642897247, "grad_norm": 0.490442156791687, "learning_rate": 9.006734006734008e-06, "loss": 0.4712, "step": 535 }, { "epoch": 0.4514317798989332, "grad_norm": 0.483855277299881, "learning_rate": 9.023569023569025e-06, "loss": 0.4843, "step": 536 }, { "epoch": 0.4522740033688939, "grad_norm": 0.4451216161251068, "learning_rate": 9.040404040404042e-06, "loss": 0.4972, "step": 537 }, { "epoch": 0.45311622683885455, "grad_norm": 0.4881470203399658, "learning_rate": 9.057239057239058e-06, "loss": 0.5157, "step": 538 }, { "epoch": 0.45395845030881526, "grad_norm": 0.5141044855117798, "learning_rate": 9.074074074074075e-06, "loss": 0.5126, "step": 539 }, { "epoch": 0.454800673778776, "grad_norm": 0.5175477862358093, "learning_rate": 9.090909090909091e-06, "loss": 0.4948, "step": 540 }, { "epoch": 0.4556428972487367, "grad_norm": 0.5628520846366882, "learning_rate": 9.107744107744108e-06, "loss": 0.486, "step": 541 }, { "epoch": 0.45648512071869735, "grad_norm": 0.499660849571228, "learning_rate": 9.124579124579126e-06, "loss": 0.481, "step": 542 }, { "epoch": 0.45732734418865806, "grad_norm": 0.5752672553062439, "learning_rate": 9.141414141414143e-06, "loss": 0.4786, "step": 543 }, { "epoch": 0.45816956765861877, "grad_norm": 0.49358314275741577, "learning_rate": 9.15824915824916e-06, "loss": 0.4995, "step": 544 }, { "epoch": 0.4590117911285794, "grad_norm": 0.4195723831653595, "learning_rate": 9.175084175084176e-06, "loss": 0.4424, "step": 545 }, { "epoch": 0.45985401459854014, "grad_norm": 0.5060628652572632, "learning_rate": 9.191919191919193e-06, "loss": 0.4859, "step": 546 }, { "epoch": 0.46069623806850085, "grad_norm": 0.4921729266643524, "learning_rate": 9.208754208754209e-06, "loss": 0.5044, "step": 547 }, { "epoch": 0.46153846153846156, "grad_norm": 0.46490243077278137, "learning_rate": 9.225589225589226e-06, "loss": 0.51, "step": 548 }, { "epoch": 0.4623806850084222, "grad_norm": 0.48533546924591064, "learning_rate": 9.242424242424244e-06, "loss": 0.4806, "step": 549 }, { "epoch": 0.46322290847838293, "grad_norm": 0.4604891240596771, "learning_rate": 9.25925925925926e-06, "loss": 0.4808, "step": 550 }, { "epoch": 0.46406513194834365, "grad_norm": 0.5316649675369263, "learning_rate": 9.276094276094278e-06, "loss": 0.4633, "step": 551 }, { "epoch": 0.4649073554183043, "grad_norm": 0.49797549843788147, "learning_rate": 9.292929292929294e-06, "loss": 0.4906, "step": 552 }, { "epoch": 0.465749578888265, "grad_norm": 0.5146768689155579, "learning_rate": 9.309764309764311e-06, "loss": 0.4786, "step": 553 }, { "epoch": 0.4665918023582257, "grad_norm": 0.5108935236930847, "learning_rate": 9.326599326599327e-06, "loss": 0.4754, "step": 554 }, { "epoch": 0.46743402582818644, "grad_norm": 0.5024922490119934, "learning_rate": 9.343434343434344e-06, "loss": 0.4885, "step": 555 }, { "epoch": 0.4682762492981471, "grad_norm": 0.6081242561340332, "learning_rate": 9.360269360269361e-06, "loss": 0.4814, "step": 556 }, { "epoch": 0.4691184727681078, "grad_norm": 0.47972041368484497, "learning_rate": 9.377104377104379e-06, "loss": 0.4597, "step": 557 }, { "epoch": 0.4699606962380685, "grad_norm": 0.6305183172225952, "learning_rate": 9.393939393939396e-06, "loss": 0.4748, "step": 558 }, { "epoch": 0.4708029197080292, "grad_norm": 0.45016196370124817, "learning_rate": 9.410774410774412e-06, "loss": 0.4671, "step": 559 }, { "epoch": 0.4716451431779899, "grad_norm": 0.5068502426147461, "learning_rate": 9.427609427609429e-06, "loss": 0.4785, "step": 560 }, { "epoch": 0.4724873666479506, "grad_norm": 0.5698970556259155, "learning_rate": 9.444444444444445e-06, "loss": 0.4724, "step": 561 }, { "epoch": 0.47332959011791126, "grad_norm": 0.4252326488494873, "learning_rate": 9.461279461279462e-06, "loss": 0.4936, "step": 562 }, { "epoch": 0.47417181358787197, "grad_norm": 0.5283361673355103, "learning_rate": 9.47811447811448e-06, "loss": 0.4755, "step": 563 }, { "epoch": 0.4750140370578327, "grad_norm": 0.5314534902572632, "learning_rate": 9.494949494949497e-06, "loss": 0.4827, "step": 564 }, { "epoch": 0.4758562605277934, "grad_norm": 0.45917460322380066, "learning_rate": 9.511784511784512e-06, "loss": 0.4736, "step": 565 }, { "epoch": 0.47669848399775405, "grad_norm": 0.516704797744751, "learning_rate": 9.52861952861953e-06, "loss": 0.4918, "step": 566 }, { "epoch": 0.47754070746771476, "grad_norm": 0.4795782268047333, "learning_rate": 9.545454545454547e-06, "loss": 0.4829, "step": 567 }, { "epoch": 0.4783829309376755, "grad_norm": 0.4878758192062378, "learning_rate": 9.562289562289562e-06, "loss": 0.4949, "step": 568 }, { "epoch": 0.47922515440763613, "grad_norm": 0.5236625075340271, "learning_rate": 9.57912457912458e-06, "loss": 0.4586, "step": 569 }, { "epoch": 0.48006737787759685, "grad_norm": 0.49027445912361145, "learning_rate": 9.595959595959597e-06, "loss": 0.4642, "step": 570 }, { "epoch": 0.48090960134755756, "grad_norm": 0.539688766002655, "learning_rate": 9.612794612794614e-06, "loss": 0.4878, "step": 571 }, { "epoch": 0.48175182481751827, "grad_norm": 0.49353957176208496, "learning_rate": 9.62962962962963e-06, "loss": 0.4812, "step": 572 }, { "epoch": 0.4825940482874789, "grad_norm": 0.5238080024719238, "learning_rate": 9.646464646464647e-06, "loss": 0.4883, "step": 573 }, { "epoch": 0.48343627175743964, "grad_norm": 0.5076724290847778, "learning_rate": 9.663299663299665e-06, "loss": 0.5009, "step": 574 }, { "epoch": 0.48427849522740035, "grad_norm": 0.5576752424240112, "learning_rate": 9.68013468013468e-06, "loss": 0.4869, "step": 575 }, { "epoch": 0.485120718697361, "grad_norm": 0.4999569356441498, "learning_rate": 9.696969696969698e-06, "loss": 0.4623, "step": 576 }, { "epoch": 0.4859629421673217, "grad_norm": 0.5969550609588623, "learning_rate": 9.713804713804715e-06, "loss": 0.5109, "step": 577 }, { "epoch": 0.48680516563728243, "grad_norm": 0.5151420831680298, "learning_rate": 9.730639730639732e-06, "loss": 0.482, "step": 578 }, { "epoch": 0.48764738910724315, "grad_norm": 0.46023136377334595, "learning_rate": 9.747474747474748e-06, "loss": 0.4555, "step": 579 }, { "epoch": 0.4884896125772038, "grad_norm": 0.5698410868644714, "learning_rate": 9.764309764309765e-06, "loss": 0.5068, "step": 580 }, { "epoch": 0.4893318360471645, "grad_norm": 0.5622819662094116, "learning_rate": 9.781144781144782e-06, "loss": 0.4937, "step": 581 }, { "epoch": 0.4901740595171252, "grad_norm": 0.5992958545684814, "learning_rate": 9.797979797979798e-06, "loss": 0.4958, "step": 582 }, { "epoch": 0.4910162829870859, "grad_norm": 0.516898512840271, "learning_rate": 9.814814814814815e-06, "loss": 0.4807, "step": 583 }, { "epoch": 0.4918585064570466, "grad_norm": 0.5199419260025024, "learning_rate": 9.831649831649833e-06, "loss": 0.482, "step": 584 }, { "epoch": 0.4927007299270073, "grad_norm": 0.5815016627311707, "learning_rate": 9.84848484848485e-06, "loss": 0.4614, "step": 585 }, { "epoch": 0.493542953396968, "grad_norm": 0.5574349164962769, "learning_rate": 9.865319865319866e-06, "loss": 0.4903, "step": 586 }, { "epoch": 0.4943851768669287, "grad_norm": 0.4916158616542816, "learning_rate": 9.882154882154883e-06, "loss": 0.4828, "step": 587 }, { "epoch": 0.4952274003368894, "grad_norm": 0.5901016592979431, "learning_rate": 9.8989898989899e-06, "loss": 0.478, "step": 588 }, { "epoch": 0.4960696238068501, "grad_norm": 0.6030367612838745, "learning_rate": 9.915824915824916e-06, "loss": 0.5082, "step": 589 }, { "epoch": 0.49691184727681076, "grad_norm": 0.584021806716919, "learning_rate": 9.932659932659933e-06, "loss": 0.4881, "step": 590 }, { "epoch": 0.49775407074677147, "grad_norm": 0.4569278359413147, "learning_rate": 9.94949494949495e-06, "loss": 0.4738, "step": 591 }, { "epoch": 0.4985962942167322, "grad_norm": 0.6027945280075073, "learning_rate": 9.966329966329968e-06, "loss": 0.4832, "step": 592 }, { "epoch": 0.4994385176866929, "grad_norm": 0.533332109451294, "learning_rate": 9.983164983164983e-06, "loss": 0.488, "step": 593 }, { "epoch": 0.5002807411566536, "grad_norm": 0.45026883482933044, "learning_rate": 1e-05, "loss": 0.4401, "step": 594 }, { "epoch": 0.5011229646266142, "grad_norm": 0.6000810265541077, "learning_rate": 9.999999135042866e-06, "loss": 0.4979, "step": 595 }, { "epoch": 0.501965188096575, "grad_norm": 0.44417840242385864, "learning_rate": 9.999996540171759e-06, "loss": 0.4917, "step": 596 }, { "epoch": 0.5028074115665356, "grad_norm": 0.5371363162994385, "learning_rate": 9.999992215387579e-06, "loss": 0.4733, "step": 597 }, { "epoch": 0.5036496350364964, "grad_norm": 0.5185532569885254, "learning_rate": 9.999986160691824e-06, "loss": 0.4577, "step": 598 }, { "epoch": 0.5044918585064571, "grad_norm": 0.47804689407348633, "learning_rate": 9.999978376086585e-06, "loss": 0.5046, "step": 599 }, { "epoch": 0.5053340819764177, "grad_norm": 0.5497897863388062, "learning_rate": 9.999968861574556e-06, "loss": 0.485, "step": 600 }, { "epoch": 0.5061763054463785, "grad_norm": 0.4786771833896637, "learning_rate": 9.999957617159032e-06, "loss": 0.5065, "step": 601 }, { "epoch": 0.5070185289163391, "grad_norm": 0.4893760681152344, "learning_rate": 9.9999446428439e-06, "loss": 0.501, "step": 602 }, { "epoch": 0.5078607523862998, "grad_norm": 0.5187221169471741, "learning_rate": 9.999929938633652e-06, "loss": 0.4769, "step": 603 }, { "epoch": 0.5087029758562606, "grad_norm": 0.47750741243362427, "learning_rate": 9.999913504533372e-06, "loss": 0.5003, "step": 604 }, { "epoch": 0.5095451993262212, "grad_norm": 0.5577571988105774, "learning_rate": 9.999895340548748e-06, "loss": 0.4937, "step": 605 }, { "epoch": 0.5103874227961819, "grad_norm": 0.4885021150112152, "learning_rate": 9.999875446686064e-06, "loss": 0.4896, "step": 606 }, { "epoch": 0.5112296462661426, "grad_norm": 0.47188201546669006, "learning_rate": 9.999853822952202e-06, "loss": 0.4752, "step": 607 }, { "epoch": 0.5120718697361033, "grad_norm": 0.513225257396698, "learning_rate": 9.999830469354645e-06, "loss": 0.4728, "step": 608 }, { "epoch": 0.512914093206064, "grad_norm": 0.4879067540168762, "learning_rate": 9.999805385901473e-06, "loss": 0.4879, "step": 609 }, { "epoch": 0.5137563166760247, "grad_norm": 0.6001608967781067, "learning_rate": 9.999778572601364e-06, "loss": 0.4814, "step": 610 }, { "epoch": 0.5145985401459854, "grad_norm": 0.4997578561306, "learning_rate": 9.999750029463596e-06, "loss": 0.4735, "step": 611 }, { "epoch": 0.5154407636159462, "grad_norm": 0.553519070148468, "learning_rate": 9.99971975649804e-06, "loss": 0.4623, "step": 612 }, { "epoch": 0.5162829870859068, "grad_norm": 0.5097311735153198, "learning_rate": 9.999687753715175e-06, "loss": 0.4714, "step": 613 }, { "epoch": 0.5171252105558675, "grad_norm": 0.5073575973510742, "learning_rate": 9.99965402112607e-06, "loss": 0.4973, "step": 614 }, { "epoch": 0.5179674340258282, "grad_norm": 0.5223052501678467, "learning_rate": 9.9996185587424e-06, "loss": 0.4896, "step": 615 }, { "epoch": 0.5188096574957889, "grad_norm": 0.4770972728729248, "learning_rate": 9.999581366576428e-06, "loss": 0.457, "step": 616 }, { "epoch": 0.5196518809657495, "grad_norm": 0.6010829210281372, "learning_rate": 9.999542444641028e-06, "loss": 0.5055, "step": 617 }, { "epoch": 0.5204941044357103, "grad_norm": 0.53952556848526, "learning_rate": 9.999501792949664e-06, "loss": 0.4861, "step": 618 }, { "epoch": 0.521336327905671, "grad_norm": 0.4939385950565338, "learning_rate": 9.9994594115164e-06, "loss": 0.4899, "step": 619 }, { "epoch": 0.5221785513756316, "grad_norm": 0.5609344840049744, "learning_rate": 9.9994153003559e-06, "loss": 0.4684, "step": 620 }, { "epoch": 0.5230207748455924, "grad_norm": 0.48643818497657776, "learning_rate": 9.999369459483425e-06, "loss": 0.4676, "step": 621 }, { "epoch": 0.523862998315553, "grad_norm": 0.523987889289856, "learning_rate": 9.999321888914837e-06, "loss": 0.4879, "step": 622 }, { "epoch": 0.5247052217855137, "grad_norm": 0.5215567350387573, "learning_rate": 9.999272588666593e-06, "loss": 0.4736, "step": 623 }, { "epoch": 0.5255474452554745, "grad_norm": 0.5453155040740967, "learning_rate": 9.999221558755751e-06, "loss": 0.4946, "step": 624 }, { "epoch": 0.5263896687254351, "grad_norm": 0.4562123715877533, "learning_rate": 9.999168799199967e-06, "loss": 0.4563, "step": 625 }, { "epoch": 0.5272318921953958, "grad_norm": 0.549990713596344, "learning_rate": 9.999114310017491e-06, "loss": 0.4738, "step": 626 }, { "epoch": 0.5280741156653566, "grad_norm": 0.5211537480354309, "learning_rate": 9.99905809122718e-06, "loss": 0.512, "step": 627 }, { "epoch": 0.5289163391353172, "grad_norm": 0.6031003594398499, "learning_rate": 9.999000142848483e-06, "loss": 0.508, "step": 628 }, { "epoch": 0.529758562605278, "grad_norm": 0.6156628727912903, "learning_rate": 9.998940464901448e-06, "loss": 0.4792, "step": 629 }, { "epoch": 0.5306007860752386, "grad_norm": 0.5990478992462158, "learning_rate": 9.998879057406726e-06, "loss": 0.4874, "step": 630 }, { "epoch": 0.5314430095451993, "grad_norm": 0.6115618944168091, "learning_rate": 9.998815920385558e-06, "loss": 0.4797, "step": 631 }, { "epoch": 0.5322852330151601, "grad_norm": 0.5512371063232422, "learning_rate": 9.99875105385979e-06, "loss": 0.4838, "step": 632 }, { "epoch": 0.5331274564851207, "grad_norm": 0.5999553799629211, "learning_rate": 9.998684457851868e-06, "loss": 0.4578, "step": 633 }, { "epoch": 0.5339696799550814, "grad_norm": 0.5091526508331299, "learning_rate": 9.99861613238483e-06, "loss": 0.4641, "step": 634 }, { "epoch": 0.5348119034250421, "grad_norm": 0.6253949999809265, "learning_rate": 9.998546077482317e-06, "loss": 0.5016, "step": 635 }, { "epoch": 0.5356541268950028, "grad_norm": 0.6216267943382263, "learning_rate": 9.998474293168564e-06, "loss": 0.4985, "step": 636 }, { "epoch": 0.5364963503649635, "grad_norm": 0.4993249177932739, "learning_rate": 9.99840077946841e-06, "loss": 0.4651, "step": 637 }, { "epoch": 0.5373385738349242, "grad_norm": 0.7353835105895996, "learning_rate": 9.998325536407287e-06, "loss": 0.4432, "step": 638 }, { "epoch": 0.5381807973048849, "grad_norm": 0.4696723222732544, "learning_rate": 9.998248564011231e-06, "loss": 0.4866, "step": 639 }, { "epoch": 0.5390230207748455, "grad_norm": 0.6496427059173584, "learning_rate": 9.998169862306873e-06, "loss": 0.4933, "step": 640 }, { "epoch": 0.5398652442448063, "grad_norm": 0.6164034605026245, "learning_rate": 9.998089431321438e-06, "loss": 0.4788, "step": 641 }, { "epoch": 0.540707467714767, "grad_norm": 0.46849820017814636, "learning_rate": 9.998007271082756e-06, "loss": 0.4751, "step": 642 }, { "epoch": 0.5415496911847277, "grad_norm": 0.6874107122421265, "learning_rate": 9.997923381619257e-06, "loss": 0.5115, "step": 643 }, { "epoch": 0.5423919146546884, "grad_norm": 0.5642508268356323, "learning_rate": 9.99783776295996e-06, "loss": 0.4732, "step": 644 }, { "epoch": 0.543234138124649, "grad_norm": 0.4731847643852234, "learning_rate": 9.997750415134489e-06, "loss": 0.4857, "step": 645 }, { "epoch": 0.5440763615946098, "grad_norm": 0.547943651676178, "learning_rate": 9.997661338173067e-06, "loss": 0.4815, "step": 646 }, { "epoch": 0.5449185850645705, "grad_norm": 0.47541576623916626, "learning_rate": 9.997570532106511e-06, "loss": 0.4653, "step": 647 }, { "epoch": 0.5457608085345311, "grad_norm": 0.5927355289459229, "learning_rate": 9.997477996966238e-06, "loss": 0.4879, "step": 648 }, { "epoch": 0.5466030320044919, "grad_norm": 0.4606771767139435, "learning_rate": 9.997383732784264e-06, "loss": 0.4896, "step": 649 }, { "epoch": 0.5474452554744526, "grad_norm": 0.5368162989616394, "learning_rate": 9.997287739593206e-06, "loss": 0.4639, "step": 650 }, { "epoch": 0.5482874789444132, "grad_norm": 0.5524506568908691, "learning_rate": 9.997190017426272e-06, "loss": 0.483, "step": 651 }, { "epoch": 0.549129702414374, "grad_norm": 0.465293824672699, "learning_rate": 9.997090566317273e-06, "loss": 0.4688, "step": 652 }, { "epoch": 0.5499719258843346, "grad_norm": 0.5541974306106567, "learning_rate": 9.996989386300617e-06, "loss": 0.4609, "step": 653 }, { "epoch": 0.5508141493542953, "grad_norm": 0.483320027589798, "learning_rate": 9.996886477411312e-06, "loss": 0.4407, "step": 654 }, { "epoch": 0.5516563728242561, "grad_norm": 0.4944576323032379, "learning_rate": 9.996781839684962e-06, "loss": 0.4768, "step": 655 }, { "epoch": 0.5524985962942167, "grad_norm": 0.5040367841720581, "learning_rate": 9.996675473157772e-06, "loss": 0.471, "step": 656 }, { "epoch": 0.5533408197641775, "grad_norm": 0.5272096395492554, "learning_rate": 9.996567377866537e-06, "loss": 0.4783, "step": 657 }, { "epoch": 0.5541830432341381, "grad_norm": 0.5243476033210754, "learning_rate": 9.996457553848661e-06, "loss": 0.4846, "step": 658 }, { "epoch": 0.5550252667040988, "grad_norm": 0.5559556484222412, "learning_rate": 9.996346001142141e-06, "loss": 0.4773, "step": 659 }, { "epoch": 0.5558674901740596, "grad_norm": 0.5864510536193848, "learning_rate": 9.996232719785573e-06, "loss": 0.4862, "step": 660 }, { "epoch": 0.5567097136440202, "grad_norm": 0.5358628630638123, "learning_rate": 9.996117709818147e-06, "loss": 0.5098, "step": 661 }, { "epoch": 0.5575519371139809, "grad_norm": 0.5562862157821655, "learning_rate": 9.996000971279657e-06, "loss": 0.485, "step": 662 }, { "epoch": 0.5583941605839416, "grad_norm": 0.5148352980613708, "learning_rate": 9.995882504210493e-06, "loss": 0.4504, "step": 663 }, { "epoch": 0.5592363840539023, "grad_norm": 0.5884944796562195, "learning_rate": 9.995762308651641e-06, "loss": 0.5001, "step": 664 }, { "epoch": 0.560078607523863, "grad_norm": 0.5128183364868164, "learning_rate": 9.995640384644687e-06, "loss": 0.5346, "step": 665 }, { "epoch": 0.5609208309938237, "grad_norm": 0.5138581991195679, "learning_rate": 9.995516732231816e-06, "loss": 0.4579, "step": 666 }, { "epoch": 0.5617630544637844, "grad_norm": 0.5538572072982788, "learning_rate": 9.995391351455806e-06, "loss": 0.4778, "step": 667 }, { "epoch": 0.562605277933745, "grad_norm": 0.46517857909202576, "learning_rate": 9.99526424236004e-06, "loss": 0.4626, "step": 668 }, { "epoch": 0.5634475014037058, "grad_norm": 0.527840256690979, "learning_rate": 9.995135404988495e-06, "loss": 0.4763, "step": 669 }, { "epoch": 0.5642897248736665, "grad_norm": 0.5023984909057617, "learning_rate": 9.995004839385745e-06, "loss": 0.5016, "step": 670 }, { "epoch": 0.5651319483436271, "grad_norm": 0.5717486143112183, "learning_rate": 9.994872545596966e-06, "loss": 0.4492, "step": 671 }, { "epoch": 0.5659741718135879, "grad_norm": 0.5153781771659851, "learning_rate": 9.994738523667928e-06, "loss": 0.45, "step": 672 }, { "epoch": 0.5668163952835485, "grad_norm": 0.48645874857902527, "learning_rate": 9.994602773645e-06, "loss": 0.4656, "step": 673 }, { "epoch": 0.5676586187535093, "grad_norm": 0.5264657735824585, "learning_rate": 9.994465295575149e-06, "loss": 0.4569, "step": 674 }, { "epoch": 0.56850084222347, "grad_norm": 0.5830297470092773, "learning_rate": 9.994326089505939e-06, "loss": 0.4847, "step": 675 }, { "epoch": 0.5693430656934306, "grad_norm": 0.5025862455368042, "learning_rate": 9.994185155485536e-06, "loss": 0.4654, "step": 676 }, { "epoch": 0.5701852891633914, "grad_norm": 0.4999510943889618, "learning_rate": 9.9940424935627e-06, "loss": 0.4535, "step": 677 }, { "epoch": 0.571027512633352, "grad_norm": 0.4995343089103699, "learning_rate": 9.993898103786787e-06, "loss": 0.4992, "step": 678 }, { "epoch": 0.5718697361033127, "grad_norm": 0.44628146290779114, "learning_rate": 9.993751986207755e-06, "loss": 0.4698, "step": 679 }, { "epoch": 0.5727119595732735, "grad_norm": 0.5068009495735168, "learning_rate": 9.993604140876158e-06, "loss": 0.4714, "step": 680 }, { "epoch": 0.5735541830432341, "grad_norm": 0.4635987877845764, "learning_rate": 9.99345456784315e-06, "loss": 0.4925, "step": 681 }, { "epoch": 0.5743964065131948, "grad_norm": 0.5062849521636963, "learning_rate": 9.99330326716048e-06, "loss": 0.4871, "step": 682 }, { "epoch": 0.5752386299831556, "grad_norm": 0.5013110041618347, "learning_rate": 9.993150238880492e-06, "loss": 0.4618, "step": 683 }, { "epoch": 0.5760808534531162, "grad_norm": 0.5050705671310425, "learning_rate": 9.992995483056134e-06, "loss": 0.4775, "step": 684 }, { "epoch": 0.5769230769230769, "grad_norm": 0.5259485244750977, "learning_rate": 9.992838999740949e-06, "loss": 0.4868, "step": 685 }, { "epoch": 0.5777653003930376, "grad_norm": 0.5781870484352112, "learning_rate": 9.992680788989075e-06, "loss": 0.4787, "step": 686 }, { "epoch": 0.5786075238629983, "grad_norm": 0.4620145857334137, "learning_rate": 9.992520850855252e-06, "loss": 0.4772, "step": 687 }, { "epoch": 0.5794497473329591, "grad_norm": 0.6015411019325256, "learning_rate": 9.992359185394818e-06, "loss": 0.4595, "step": 688 }, { "epoch": 0.5802919708029197, "grad_norm": 0.5113403797149658, "learning_rate": 9.992195792663703e-06, "loss": 0.4782, "step": 689 }, { "epoch": 0.5811341942728804, "grad_norm": 0.6137397885322571, "learning_rate": 9.992030672718443e-06, "loss": 0.4848, "step": 690 }, { "epoch": 0.5819764177428411, "grad_norm": 0.5435703992843628, "learning_rate": 9.99186382561616e-06, "loss": 0.4571, "step": 691 }, { "epoch": 0.5828186412128018, "grad_norm": 0.569404125213623, "learning_rate": 9.991695251414584e-06, "loss": 0.4412, "step": 692 }, { "epoch": 0.5836608646827625, "grad_norm": 0.549757182598114, "learning_rate": 9.991524950172038e-06, "loss": 0.4793, "step": 693 }, { "epoch": 0.5845030881527232, "grad_norm": 0.5944202542304993, "learning_rate": 9.991352921947444e-06, "loss": 0.4765, "step": 694 }, { "epoch": 0.5853453116226839, "grad_norm": 0.511986255645752, "learning_rate": 9.99117916680032e-06, "loss": 0.4974, "step": 695 }, { "epoch": 0.5861875350926445, "grad_norm": 0.46266019344329834, "learning_rate": 9.991003684790784e-06, "loss": 0.4661, "step": 696 }, { "epoch": 0.5870297585626053, "grad_norm": 0.5883674025535583, "learning_rate": 9.990826475979547e-06, "loss": 0.4659, "step": 697 }, { "epoch": 0.587871982032566, "grad_norm": 0.5423708558082581, "learning_rate": 9.990647540427923e-06, "loss": 0.4982, "step": 698 }, { "epoch": 0.5887142055025266, "grad_norm": 0.49799203872680664, "learning_rate": 9.990466878197818e-06, "loss": 0.4678, "step": 699 }, { "epoch": 0.5895564289724874, "grad_norm": 0.48099133372306824, "learning_rate": 9.990284489351738e-06, "loss": 0.4473, "step": 700 }, { "epoch": 0.590398652442448, "grad_norm": 0.488294780254364, "learning_rate": 9.99010037395279e-06, "loss": 0.4721, "step": 701 }, { "epoch": 0.5912408759124088, "grad_norm": 0.4466111958026886, "learning_rate": 9.989914532064673e-06, "loss": 0.4616, "step": 702 }, { "epoch": 0.5920830993823695, "grad_norm": 0.47376322746276855, "learning_rate": 9.989726963751683e-06, "loss": 0.4504, "step": 703 }, { "epoch": 0.5929253228523301, "grad_norm": 0.45125812292099, "learning_rate": 9.989537669078717e-06, "loss": 0.4521, "step": 704 }, { "epoch": 0.5937675463222909, "grad_norm": 0.49056822061538696, "learning_rate": 9.98934664811127e-06, "loss": 0.4725, "step": 705 }, { "epoch": 0.5946097697922516, "grad_norm": 0.49633243680000305, "learning_rate": 9.989153900915428e-06, "loss": 0.4767, "step": 706 }, { "epoch": 0.5954519932622122, "grad_norm": 0.4469139575958252, "learning_rate": 9.988959427557878e-06, "loss": 0.4746, "step": 707 }, { "epoch": 0.596294216732173, "grad_norm": 0.5396443605422974, "learning_rate": 9.988763228105909e-06, "loss": 0.4572, "step": 708 }, { "epoch": 0.5971364402021336, "grad_norm": 0.44772765040397644, "learning_rate": 9.988565302627398e-06, "loss": 0.4579, "step": 709 }, { "epoch": 0.5979786636720943, "grad_norm": 0.5307087302207947, "learning_rate": 9.988365651190827e-06, "loss": 0.4789, "step": 710 }, { "epoch": 0.5988208871420551, "grad_norm": 0.543757975101471, "learning_rate": 9.988164273865271e-06, "loss": 0.4885, "step": 711 }, { "epoch": 0.5996631106120157, "grad_norm": 0.5016005039215088, "learning_rate": 9.987961170720404e-06, "loss": 0.4821, "step": 712 }, { "epoch": 0.6005053340819764, "grad_norm": 0.549144983291626, "learning_rate": 9.987756341826493e-06, "loss": 0.4404, "step": 713 }, { "epoch": 0.6013475575519371, "grad_norm": 0.4627821147441864, "learning_rate": 9.98754978725441e-06, "loss": 0.4724, "step": 714 }, { "epoch": 0.6021897810218978, "grad_norm": 0.5006039142608643, "learning_rate": 9.987341507075614e-06, "loss": 0.4933, "step": 715 }, { "epoch": 0.6030320044918585, "grad_norm": 0.47527116537094116, "learning_rate": 9.98713150136217e-06, "loss": 0.482, "step": 716 }, { "epoch": 0.6038742279618192, "grad_norm": 0.5477534532546997, "learning_rate": 9.986919770186736e-06, "loss": 0.4784, "step": 717 }, { "epoch": 0.6047164514317799, "grad_norm": 0.5342768430709839, "learning_rate": 9.986706313622567e-06, "loss": 0.487, "step": 718 }, { "epoch": 0.6055586749017406, "grad_norm": 0.5340936183929443, "learning_rate": 9.986491131743517e-06, "loss": 0.4654, "step": 719 }, { "epoch": 0.6064008983717013, "grad_norm": 0.48899954557418823, "learning_rate": 9.98627422462403e-06, "loss": 0.4417, "step": 720 }, { "epoch": 0.607243121841662, "grad_norm": 0.6063597798347473, "learning_rate": 9.986055592339157e-06, "loss": 0.4828, "step": 721 }, { "epoch": 0.6080853453116227, "grad_norm": 0.46916455030441284, "learning_rate": 9.98583523496454e-06, "loss": 0.4967, "step": 722 }, { "epoch": 0.6089275687815834, "grad_norm": 0.5215853452682495, "learning_rate": 9.985613152576418e-06, "loss": 0.4699, "step": 723 }, { "epoch": 0.609769792251544, "grad_norm": 0.49355486035346985, "learning_rate": 9.985389345251628e-06, "loss": 0.4575, "step": 724 }, { "epoch": 0.6106120157215048, "grad_norm": 0.5172094106674194, "learning_rate": 9.985163813067605e-06, "loss": 0.4791, "step": 725 }, { "epoch": 0.6114542391914655, "grad_norm": 0.53579181432724, "learning_rate": 9.984936556102377e-06, "loss": 0.4411, "step": 726 }, { "epoch": 0.6122964626614261, "grad_norm": 0.4958087205886841, "learning_rate": 9.98470757443457e-06, "loss": 0.4652, "step": 727 }, { "epoch": 0.6131386861313869, "grad_norm": 0.5479157567024231, "learning_rate": 9.98447686814341e-06, "loss": 0.4839, "step": 728 }, { "epoch": 0.6139809096013475, "grad_norm": 0.5258472561836243, "learning_rate": 9.984244437308718e-06, "loss": 0.4717, "step": 729 }, { "epoch": 0.6148231330713082, "grad_norm": 0.4707808196544647, "learning_rate": 9.984010282010913e-06, "loss": 0.4673, "step": 730 }, { "epoch": 0.615665356541269, "grad_norm": 0.5531563758850098, "learning_rate": 9.983774402331004e-06, "loss": 0.4955, "step": 731 }, { "epoch": 0.6165075800112296, "grad_norm": 0.5130414366722107, "learning_rate": 9.983536798350601e-06, "loss": 0.4781, "step": 732 }, { "epoch": 0.6173498034811904, "grad_norm": 0.5124431848526001, "learning_rate": 9.983297470151915e-06, "loss": 0.4586, "step": 733 }, { "epoch": 0.618192026951151, "grad_norm": 0.48972389101982117, "learning_rate": 9.983056417817747e-06, "loss": 0.4682, "step": 734 }, { "epoch": 0.6190342504211117, "grad_norm": 0.6306508779525757, "learning_rate": 9.982813641431499e-06, "loss": 0.4701, "step": 735 }, { "epoch": 0.6198764738910725, "grad_norm": 0.5624862909317017, "learning_rate": 9.982569141077164e-06, "loss": 0.4597, "step": 736 }, { "epoch": 0.6207186973610331, "grad_norm": 0.6212544441223145, "learning_rate": 9.982322916839337e-06, "loss": 0.4752, "step": 737 }, { "epoch": 0.6215609208309938, "grad_norm": 0.5654715895652771, "learning_rate": 9.98207496880321e-06, "loss": 0.464, "step": 738 }, { "epoch": 0.6224031443009546, "grad_norm": 0.5480144023895264, "learning_rate": 9.981825297054563e-06, "loss": 0.4918, "step": 739 }, { "epoch": 0.6232453677709152, "grad_norm": 0.5197372436523438, "learning_rate": 9.981573901679783e-06, "loss": 0.4748, "step": 740 }, { "epoch": 0.6240875912408759, "grad_norm": 0.5325299501419067, "learning_rate": 9.981320782765847e-06, "loss": 0.4665, "step": 741 }, { "epoch": 0.6249298147108366, "grad_norm": 0.5034367442131042, "learning_rate": 9.981065940400328e-06, "loss": 0.4616, "step": 742 }, { "epoch": 0.6257720381807973, "grad_norm": 0.5595768690109253, "learning_rate": 9.9808093746714e-06, "loss": 0.4712, "step": 743 }, { "epoch": 0.626614261650758, "grad_norm": 0.634166955947876, "learning_rate": 9.980551085667828e-06, "loss": 0.4877, "step": 744 }, { "epoch": 0.6274564851207187, "grad_norm": 0.5138841271400452, "learning_rate": 9.980291073478975e-06, "loss": 0.4449, "step": 745 }, { "epoch": 0.6282987085906794, "grad_norm": 0.6347076892852783, "learning_rate": 9.980029338194806e-06, "loss": 0.4709, "step": 746 }, { "epoch": 0.62914093206064, "grad_norm": 0.541976809501648, "learning_rate": 9.97976587990587e-06, "loss": 0.4461, "step": 747 }, { "epoch": 0.6299831555306008, "grad_norm": 0.5878835916519165, "learning_rate": 9.979500698703324e-06, "loss": 0.4824, "step": 748 }, { "epoch": 0.6308253790005615, "grad_norm": 0.6248058676719666, "learning_rate": 9.979233794678914e-06, "loss": 0.496, "step": 749 }, { "epoch": 0.6316676024705222, "grad_norm": 0.49338746070861816, "learning_rate": 9.978965167924985e-06, "loss": 0.4742, "step": 750 }, { "epoch": 0.6325098259404829, "grad_norm": 0.568449854850769, "learning_rate": 9.978694818534476e-06, "loss": 0.4815, "step": 751 }, { "epoch": 0.6333520494104435, "grad_norm": 0.5267360806465149, "learning_rate": 9.978422746600924e-06, "loss": 0.4549, "step": 752 }, { "epoch": 0.6341942728804043, "grad_norm": 0.6119812726974487, "learning_rate": 9.978148952218462e-06, "loss": 0.4714, "step": 753 }, { "epoch": 0.635036496350365, "grad_norm": 0.4893544018268585, "learning_rate": 9.977873435481818e-06, "loss": 0.4515, "step": 754 }, { "epoch": 0.6358787198203256, "grad_norm": 0.4640081226825714, "learning_rate": 9.977596196486314e-06, "loss": 0.4752, "step": 755 }, { "epoch": 0.6367209432902864, "grad_norm": 0.4543865919113159, "learning_rate": 9.977317235327872e-06, "loss": 0.4537, "step": 756 }, { "epoch": 0.637563166760247, "grad_norm": 0.5625088214874268, "learning_rate": 9.977036552103008e-06, "loss": 0.4742, "step": 757 }, { "epoch": 0.6384053902302077, "grad_norm": 0.4558824598789215, "learning_rate": 9.976754146908834e-06, "loss": 0.4774, "step": 758 }, { "epoch": 0.6392476137001685, "grad_norm": 0.5407620668411255, "learning_rate": 9.976470019843054e-06, "loss": 0.4537, "step": 759 }, { "epoch": 0.6400898371701291, "grad_norm": 0.577828049659729, "learning_rate": 9.976184171003972e-06, "loss": 0.4727, "step": 760 }, { "epoch": 0.6409320606400898, "grad_norm": 0.5129311084747314, "learning_rate": 9.97589660049049e-06, "loss": 0.428, "step": 761 }, { "epoch": 0.6417742841100506, "grad_norm": 0.6351892352104187, "learning_rate": 9.975607308402101e-06, "loss": 0.4706, "step": 762 }, { "epoch": 0.6426165075800112, "grad_norm": 0.5532310605049133, "learning_rate": 9.975316294838896e-06, "loss": 0.4811, "step": 763 }, { "epoch": 0.643458731049972, "grad_norm": 0.6076250672340393, "learning_rate": 9.975023559901558e-06, "loss": 0.4337, "step": 764 }, { "epoch": 0.6443009545199326, "grad_norm": 0.48837536573410034, "learning_rate": 9.97472910369137e-06, "loss": 0.465, "step": 765 }, { "epoch": 0.6451431779898933, "grad_norm": 0.6053653359413147, "learning_rate": 9.974432926310206e-06, "loss": 0.4718, "step": 766 }, { "epoch": 0.6459854014598541, "grad_norm": 0.6060327887535095, "learning_rate": 9.974135027860544e-06, "loss": 0.4729, "step": 767 }, { "epoch": 0.6468276249298147, "grad_norm": 0.6306071877479553, "learning_rate": 9.973835408445447e-06, "loss": 0.4766, "step": 768 }, { "epoch": 0.6476698483997754, "grad_norm": 0.49708274006843567, "learning_rate": 9.97353406816858e-06, "loss": 0.4721, "step": 769 }, { "epoch": 0.6485120718697361, "grad_norm": 0.6989413499832153, "learning_rate": 9.9732310071342e-06, "loss": 0.4867, "step": 770 }, { "epoch": 0.6493542953396968, "grad_norm": 0.49209165573120117, "learning_rate": 9.972926225447164e-06, "loss": 0.4768, "step": 771 }, { "epoch": 0.6501965188096575, "grad_norm": 0.7153799533843994, "learning_rate": 9.97261972321292e-06, "loss": 0.4688, "step": 772 }, { "epoch": 0.6510387422796182, "grad_norm": 0.528149425983429, "learning_rate": 9.972311500537511e-06, "loss": 0.4787, "step": 773 }, { "epoch": 0.6518809657495789, "grad_norm": 0.5468606948852539, "learning_rate": 9.972001557527577e-06, "loss": 0.4691, "step": 774 }, { "epoch": 0.6527231892195395, "grad_norm": 0.6236921548843384, "learning_rate": 9.971689894290353e-06, "loss": 0.4579, "step": 775 }, { "epoch": 0.6535654126895003, "grad_norm": 0.5293242335319519, "learning_rate": 9.97137651093367e-06, "loss": 0.503, "step": 776 }, { "epoch": 0.654407636159461, "grad_norm": 0.6629259586334229, "learning_rate": 9.971061407565956e-06, "loss": 0.4634, "step": 777 }, { "epoch": 0.6552498596294217, "grad_norm": 0.5112481713294983, "learning_rate": 9.970744584296225e-06, "loss": 0.4512, "step": 778 }, { "epoch": 0.6560920830993824, "grad_norm": 0.7106290459632874, "learning_rate": 9.970426041234097e-06, "loss": 0.4735, "step": 779 }, { "epoch": 0.656934306569343, "grad_norm": 0.5655325055122375, "learning_rate": 9.97010577848978e-06, "loss": 0.4508, "step": 780 }, { "epoch": 0.6577765300393038, "grad_norm": 0.6577811241149902, "learning_rate": 9.969783796174085e-06, "loss": 0.4713, "step": 781 }, { "epoch": 0.6586187535092645, "grad_norm": 0.5484269261360168, "learning_rate": 9.969460094398404e-06, "loss": 0.4592, "step": 782 }, { "epoch": 0.6594609769792251, "grad_norm": 0.6416325569152832, "learning_rate": 9.969134673274737e-06, "loss": 0.4809, "step": 783 }, { "epoch": 0.6603032004491859, "grad_norm": 0.5590952038764954, "learning_rate": 9.968807532915674e-06, "loss": 0.4895, "step": 784 }, { "epoch": 0.6611454239191465, "grad_norm": 0.5752447247505188, "learning_rate": 9.9684786734344e-06, "loss": 0.4663, "step": 785 }, { "epoch": 0.6619876473891072, "grad_norm": 0.5101684927940369, "learning_rate": 9.968148094944694e-06, "loss": 0.4639, "step": 786 }, { "epoch": 0.662829870859068, "grad_norm": 0.6576343178749084, "learning_rate": 9.96781579756093e-06, "loss": 0.4474, "step": 787 }, { "epoch": 0.6636720943290286, "grad_norm": 0.5779542922973633, "learning_rate": 9.967481781398079e-06, "loss": 0.4961, "step": 788 }, { "epoch": 0.6645143177989893, "grad_norm": 0.5628913044929504, "learning_rate": 9.9671460465717e-06, "loss": 0.476, "step": 789 }, { "epoch": 0.66535654126895, "grad_norm": 0.5923757553100586, "learning_rate": 9.966808593197959e-06, "loss": 0.4433, "step": 790 }, { "epoch": 0.6661987647389107, "grad_norm": 0.4714546203613281, "learning_rate": 9.966469421393604e-06, "loss": 0.4962, "step": 791 }, { "epoch": 0.6670409882088714, "grad_norm": 0.6161687970161438, "learning_rate": 9.966128531275984e-06, "loss": 0.4897, "step": 792 }, { "epoch": 0.6678832116788321, "grad_norm": 0.4822297692298889, "learning_rate": 9.965785922963042e-06, "loss": 0.4526, "step": 793 }, { "epoch": 0.6687254351487928, "grad_norm": 0.5772736072540283, "learning_rate": 9.965441596573313e-06, "loss": 0.4764, "step": 794 }, { "epoch": 0.6695676586187536, "grad_norm": 0.49330729246139526, "learning_rate": 9.965095552225927e-06, "loss": 0.4682, "step": 795 }, { "epoch": 0.6704098820887142, "grad_norm": 0.6185431480407715, "learning_rate": 9.964747790040615e-06, "loss": 0.463, "step": 796 }, { "epoch": 0.6712521055586749, "grad_norm": 0.48180529475212097, "learning_rate": 9.964398310137688e-06, "loss": 0.4671, "step": 797 }, { "epoch": 0.6720943290286356, "grad_norm": 0.5641146898269653, "learning_rate": 9.964047112638069e-06, "loss": 0.4823, "step": 798 }, { "epoch": 0.6729365524985963, "grad_norm": 0.47388333082199097, "learning_rate": 9.96369419766326e-06, "loss": 0.4665, "step": 799 }, { "epoch": 0.673778775968557, "grad_norm": 0.4995035231113434, "learning_rate": 9.963339565335366e-06, "loss": 0.4474, "step": 800 }, { "epoch": 0.6746209994385177, "grad_norm": 0.5843474268913269, "learning_rate": 9.962983215777084e-06, "loss": 0.4779, "step": 801 }, { "epoch": 0.6754632229084784, "grad_norm": 0.47808849811553955, "learning_rate": 9.962625149111704e-06, "loss": 0.493, "step": 802 }, { "epoch": 0.676305446378439, "grad_norm": 0.6560859680175781, "learning_rate": 9.962265365463111e-06, "loss": 0.4799, "step": 803 }, { "epoch": 0.6771476698483998, "grad_norm": 0.57741379737854, "learning_rate": 9.961903864955783e-06, "loss": 0.4838, "step": 804 }, { "epoch": 0.6779898933183605, "grad_norm": 0.545911967754364, "learning_rate": 9.961540647714798e-06, "loss": 0.4534, "step": 805 }, { "epoch": 0.6788321167883211, "grad_norm": 0.726661205291748, "learning_rate": 9.961175713865816e-06, "loss": 0.4998, "step": 806 }, { "epoch": 0.6796743402582819, "grad_norm": 0.5084618926048279, "learning_rate": 9.9608090635351e-06, "loss": 0.4853, "step": 807 }, { "epoch": 0.6805165637282425, "grad_norm": 0.7210710644721985, "learning_rate": 9.960440696849508e-06, "loss": 0.4958, "step": 808 }, { "epoch": 0.6813587871982033, "grad_norm": 0.5819374918937683, "learning_rate": 9.960070613936486e-06, "loss": 0.4702, "step": 809 }, { "epoch": 0.682201010668164, "grad_norm": 0.4866502285003662, "learning_rate": 9.959698814924075e-06, "loss": 0.4718, "step": 810 }, { "epoch": 0.6830432341381246, "grad_norm": 0.6073138117790222, "learning_rate": 9.959325299940914e-06, "loss": 0.474, "step": 811 }, { "epoch": 0.6838854576080854, "grad_norm": 0.47220379114151, "learning_rate": 9.95895006911623e-06, "loss": 0.4647, "step": 812 }, { "epoch": 0.684727681078046, "grad_norm": 0.6630617380142212, "learning_rate": 9.958573122579851e-06, "loss": 0.4665, "step": 813 }, { "epoch": 0.6855699045480067, "grad_norm": 0.5277007818222046, "learning_rate": 9.958194460462188e-06, "loss": 0.4796, "step": 814 }, { "epoch": 0.6864121280179675, "grad_norm": 0.5527482032775879, "learning_rate": 9.957814082894256e-06, "loss": 0.4867, "step": 815 }, { "epoch": 0.6872543514879281, "grad_norm": 0.660800039768219, "learning_rate": 9.957431990007657e-06, "loss": 0.49, "step": 816 }, { "epoch": 0.6880965749578888, "grad_norm": 0.5465556979179382, "learning_rate": 9.957048181934589e-06, "loss": 0.4599, "step": 817 }, { "epoch": 0.6889387984278496, "grad_norm": 0.5417771339416504, "learning_rate": 9.956662658807842e-06, "loss": 0.4747, "step": 818 }, { "epoch": 0.6897810218978102, "grad_norm": 0.6430644392967224, "learning_rate": 9.956275420760804e-06, "loss": 0.4505, "step": 819 }, { "epoch": 0.6906232453677709, "grad_norm": 0.5078498721122742, "learning_rate": 9.955886467927449e-06, "loss": 0.4814, "step": 820 }, { "epoch": 0.6914654688377316, "grad_norm": 0.5456454753875732, "learning_rate": 9.95549580044235e-06, "loss": 0.4855, "step": 821 }, { "epoch": 0.6923076923076923, "grad_norm": 0.5019521713256836, "learning_rate": 9.955103418440672e-06, "loss": 0.4789, "step": 822 }, { "epoch": 0.6931499157776531, "grad_norm": 0.49573081731796265, "learning_rate": 9.954709322058169e-06, "loss": 0.4803, "step": 823 }, { "epoch": 0.6939921392476137, "grad_norm": 0.48820027709007263, "learning_rate": 9.954313511431193e-06, "loss": 0.4975, "step": 824 }, { "epoch": 0.6948343627175744, "grad_norm": 0.42622753977775574, "learning_rate": 9.953915986696689e-06, "loss": 0.4624, "step": 825 }, { "epoch": 0.6956765861875351, "grad_norm": 0.46420031785964966, "learning_rate": 9.953516747992194e-06, "loss": 0.4398, "step": 826 }, { "epoch": 0.6965188096574958, "grad_norm": 0.4468659460544586, "learning_rate": 9.953115795455836e-06, "loss": 0.4663, "step": 827 }, { "epoch": 0.6973610331274565, "grad_norm": 0.4022468030452728, "learning_rate": 9.95271312922634e-06, "loss": 0.4765, "step": 828 }, { "epoch": 0.6982032565974172, "grad_norm": 0.43352752923965454, "learning_rate": 9.952308749443018e-06, "loss": 0.4772, "step": 829 }, { "epoch": 0.6990454800673779, "grad_norm": 0.5102957487106323, "learning_rate": 9.951902656245782e-06, "loss": 0.4822, "step": 830 }, { "epoch": 0.6998877035373385, "grad_norm": 0.3991459310054779, "learning_rate": 9.951494849775131e-06, "loss": 0.4466, "step": 831 }, { "epoch": 0.7007299270072993, "grad_norm": 0.49316227436065674, "learning_rate": 9.951085330172161e-06, "loss": 0.46, "step": 832 }, { "epoch": 0.70157215047726, "grad_norm": 0.42071792483329773, "learning_rate": 9.950674097578558e-06, "loss": 0.4663, "step": 833 }, { "epoch": 0.7024143739472206, "grad_norm": 0.48815056681632996, "learning_rate": 9.9502611521366e-06, "loss": 0.4652, "step": 834 }, { "epoch": 0.7032565974171814, "grad_norm": 0.46437495946884155, "learning_rate": 9.949846493989159e-06, "loss": 0.4932, "step": 835 }, { "epoch": 0.704098820887142, "grad_norm": 0.42174094915390015, "learning_rate": 9.949430123279703e-06, "loss": 0.4352, "step": 836 }, { "epoch": 0.7049410443571027, "grad_norm": 0.4593966603279114, "learning_rate": 9.949012040152286e-06, "loss": 0.4562, "step": 837 }, { "epoch": 0.7057832678270635, "grad_norm": 0.4475049376487732, "learning_rate": 9.948592244751559e-06, "loss": 0.4675, "step": 838 }, { "epoch": 0.7066254912970241, "grad_norm": 0.44301751255989075, "learning_rate": 9.948170737222763e-06, "loss": 0.4681, "step": 839 }, { "epoch": 0.7074677147669849, "grad_norm": 0.48296383023262024, "learning_rate": 9.947747517711733e-06, "loss": 0.4562, "step": 840 }, { "epoch": 0.7083099382369455, "grad_norm": 0.4221026599407196, "learning_rate": 9.947322586364896e-06, "loss": 0.4526, "step": 841 }, { "epoch": 0.7091521617069062, "grad_norm": 0.520110011100769, "learning_rate": 9.94689594332927e-06, "loss": 0.472, "step": 842 }, { "epoch": 0.709994385176867, "grad_norm": 0.40589872002601624, "learning_rate": 9.946467588752466e-06, "loss": 0.4981, "step": 843 }, { "epoch": 0.7108366086468276, "grad_norm": 0.5101873278617859, "learning_rate": 9.94603752278269e-06, "loss": 0.4525, "step": 844 }, { "epoch": 0.7116788321167883, "grad_norm": 0.46556946635246277, "learning_rate": 9.945605745568734e-06, "loss": 0.4843, "step": 845 }, { "epoch": 0.712521055586749, "grad_norm": 0.4809977412223816, "learning_rate": 9.945172257259987e-06, "loss": 0.4676, "step": 846 }, { "epoch": 0.7133632790567097, "grad_norm": 0.4874621629714966, "learning_rate": 9.944737058006428e-06, "loss": 0.4821, "step": 847 }, { "epoch": 0.7142055025266704, "grad_norm": 0.4377692937850952, "learning_rate": 9.944300147958632e-06, "loss": 0.4974, "step": 848 }, { "epoch": 0.7150477259966311, "grad_norm": 0.43947675824165344, "learning_rate": 9.943861527267757e-06, "loss": 0.4743, "step": 849 }, { "epoch": 0.7158899494665918, "grad_norm": 0.495939701795578, "learning_rate": 9.94342119608556e-06, "loss": 0.457, "step": 850 }, { "epoch": 0.7167321729365524, "grad_norm": 0.43020787835121155, "learning_rate": 9.942979154564389e-06, "loss": 0.5016, "step": 851 }, { "epoch": 0.7175743964065132, "grad_norm": 0.4830726087093353, "learning_rate": 9.942535402857183e-06, "loss": 0.4863, "step": 852 }, { "epoch": 0.7184166198764739, "grad_norm": 0.4980354607105255, "learning_rate": 9.942089941117473e-06, "loss": 0.4813, "step": 853 }, { "epoch": 0.7192588433464346, "grad_norm": 0.44717633724212646, "learning_rate": 9.94164276949938e-06, "loss": 0.4431, "step": 854 }, { "epoch": 0.7201010668163953, "grad_norm": 0.47325465083122253, "learning_rate": 9.941193888157616e-06, "loss": 0.4754, "step": 855 }, { "epoch": 0.720943290286356, "grad_norm": 0.5001667141914368, "learning_rate": 9.940743297247489e-06, "loss": 0.4769, "step": 856 }, { "epoch": 0.7217855137563167, "grad_norm": 0.49592381715774536, "learning_rate": 9.940290996924895e-06, "loss": 0.4534, "step": 857 }, { "epoch": 0.7226277372262774, "grad_norm": 0.4938085973262787, "learning_rate": 9.939836987346321e-06, "loss": 0.4635, "step": 858 }, { "epoch": 0.723469960696238, "grad_norm": 0.5034784078598022, "learning_rate": 9.939381268668849e-06, "loss": 0.4787, "step": 859 }, { "epoch": 0.7243121841661988, "grad_norm": 0.49806955456733704, "learning_rate": 9.938923841050147e-06, "loss": 0.4657, "step": 860 }, { "epoch": 0.7251544076361595, "grad_norm": 0.4600309431552887, "learning_rate": 9.938464704648479e-06, "loss": 0.4513, "step": 861 }, { "epoch": 0.7259966311061201, "grad_norm": 0.48210766911506653, "learning_rate": 9.938003859622698e-06, "loss": 0.4773, "step": 862 }, { "epoch": 0.7268388545760809, "grad_norm": 0.43720772862434387, "learning_rate": 9.937541306132247e-06, "loss": 0.4657, "step": 863 }, { "epoch": 0.7276810780460415, "grad_norm": 0.4741774797439575, "learning_rate": 9.937077044337164e-06, "loss": 0.4697, "step": 864 }, { "epoch": 0.7285233015160022, "grad_norm": 0.5397223830223083, "learning_rate": 9.936611074398074e-06, "loss": 0.4574, "step": 865 }, { "epoch": 0.729365524985963, "grad_norm": 0.45597586035728455, "learning_rate": 9.936143396476194e-06, "loss": 0.4622, "step": 866 }, { "epoch": 0.7302077484559236, "grad_norm": 0.41884714365005493, "learning_rate": 9.935674010733337e-06, "loss": 0.444, "step": 867 }, { "epoch": 0.7310499719258844, "grad_norm": 0.4450487196445465, "learning_rate": 9.935202917331894e-06, "loss": 0.4562, "step": 868 }, { "epoch": 0.731892195395845, "grad_norm": 0.42634057998657227, "learning_rate": 9.934730116434864e-06, "loss": 0.4436, "step": 869 }, { "epoch": 0.7327344188658057, "grad_norm": 0.5128663182258606, "learning_rate": 9.934255608205822e-06, "loss": 0.4751, "step": 870 }, { "epoch": 0.7335766423357665, "grad_norm": 0.4837375283241272, "learning_rate": 9.933779392808945e-06, "loss": 0.4687, "step": 871 }, { "epoch": 0.7344188658057271, "grad_norm": 0.46048328280448914, "learning_rate": 9.93330147040899e-06, "loss": 0.4332, "step": 872 }, { "epoch": 0.7352610892756878, "grad_norm": 0.6326079368591309, "learning_rate": 9.932821841171311e-06, "loss": 0.4628, "step": 873 }, { "epoch": 0.7361033127456486, "grad_norm": 0.47041600942611694, "learning_rate": 9.932340505261856e-06, "loss": 0.4459, "step": 874 }, { "epoch": 0.7369455362156092, "grad_norm": 0.5281173586845398, "learning_rate": 9.931857462847152e-06, "loss": 0.4686, "step": 875 }, { "epoch": 0.7377877596855699, "grad_norm": 0.555915892124176, "learning_rate": 9.93137271409433e-06, "loss": 0.4492, "step": 876 }, { "epoch": 0.7386299831555306, "grad_norm": 0.5359073877334595, "learning_rate": 9.930886259171101e-06, "loss": 0.4841, "step": 877 }, { "epoch": 0.7394722066254913, "grad_norm": 0.5758495330810547, "learning_rate": 9.930398098245771e-06, "loss": 0.4677, "step": 878 }, { "epoch": 0.740314430095452, "grad_norm": 0.6444314122200012, "learning_rate": 9.929908231487235e-06, "loss": 0.4777, "step": 879 }, { "epoch": 0.7411566535654127, "grad_norm": 0.517943799495697, "learning_rate": 9.929416659064978e-06, "loss": 0.4636, "step": 880 }, { "epoch": 0.7419988770353734, "grad_norm": 0.7036448121070862, "learning_rate": 9.928923381149079e-06, "loss": 0.4855, "step": 881 }, { "epoch": 0.742841100505334, "grad_norm": 0.509889543056488, "learning_rate": 9.928428397910198e-06, "loss": 0.463, "step": 882 }, { "epoch": 0.7436833239752948, "grad_norm": 0.5095185041427612, "learning_rate": 9.927931709519595e-06, "loss": 0.4553, "step": 883 }, { "epoch": 0.7445255474452555, "grad_norm": 0.47011786699295044, "learning_rate": 9.927433316149114e-06, "loss": 0.4761, "step": 884 }, { "epoch": 0.7453677709152162, "grad_norm": 0.6273627877235413, "learning_rate": 9.926933217971191e-06, "loss": 0.4841, "step": 885 }, { "epoch": 0.7462099943851769, "grad_norm": 0.5003870129585266, "learning_rate": 9.926431415158852e-06, "loss": 0.487, "step": 886 }, { "epoch": 0.7470522178551375, "grad_norm": 0.5939120054244995, "learning_rate": 9.925927907885713e-06, "loss": 0.4785, "step": 887 }, { "epoch": 0.7478944413250983, "grad_norm": 0.56283038854599, "learning_rate": 9.925422696325976e-06, "loss": 0.4781, "step": 888 }, { "epoch": 0.748736664795059, "grad_norm": 0.6758758425712585, "learning_rate": 9.924915780654436e-06, "loss": 0.5112, "step": 889 }, { "epoch": 0.7495788882650196, "grad_norm": 0.47413796186447144, "learning_rate": 9.92440716104648e-06, "loss": 0.451, "step": 890 }, { "epoch": 0.7504211117349804, "grad_norm": 0.606082022190094, "learning_rate": 9.923896837678079e-06, "loss": 0.45, "step": 891 }, { "epoch": 0.751263335204941, "grad_norm": 0.47417351603507996, "learning_rate": 9.923384810725795e-06, "loss": 0.4837, "step": 892 }, { "epoch": 0.7521055586749017, "grad_norm": 0.5353036522865295, "learning_rate": 9.922871080366786e-06, "loss": 0.4591, "step": 893 }, { "epoch": 0.7529477821448625, "grad_norm": 0.46756482124328613, "learning_rate": 9.922355646778789e-06, "loss": 0.4532, "step": 894 }, { "epoch": 0.7537900056148231, "grad_norm": 0.4891708791255951, "learning_rate": 9.921838510140135e-06, "loss": 0.4553, "step": 895 }, { "epoch": 0.7546322290847838, "grad_norm": 0.5232227444648743, "learning_rate": 9.921319670629748e-06, "loss": 0.4565, "step": 896 }, { "epoch": 0.7554744525547445, "grad_norm": 0.6023972630500793, "learning_rate": 9.920799128427134e-06, "loss": 0.4616, "step": 897 }, { "epoch": 0.7563166760247052, "grad_norm": 0.47977980971336365, "learning_rate": 9.920276883712394e-06, "loss": 0.457, "step": 898 }, { "epoch": 0.757158899494666, "grad_norm": 0.4814787805080414, "learning_rate": 9.919752936666216e-06, "loss": 0.4565, "step": 899 }, { "epoch": 0.7580011229646266, "grad_norm": 0.5237877368927002, "learning_rate": 9.919227287469874e-06, "loss": 0.4453, "step": 900 }, { "epoch": 0.7588433464345873, "grad_norm": 0.4573395252227783, "learning_rate": 9.918699936305235e-06, "loss": 0.4949, "step": 901 }, { "epoch": 0.759685569904548, "grad_norm": 0.5195721983909607, "learning_rate": 9.918170883354756e-06, "loss": 0.4535, "step": 902 }, { "epoch": 0.7605277933745087, "grad_norm": 0.5270134210586548, "learning_rate": 9.917640128801476e-06, "loss": 0.4658, "step": 903 }, { "epoch": 0.7613700168444694, "grad_norm": 0.49942880868911743, "learning_rate": 9.917107672829029e-06, "loss": 0.4907, "step": 904 }, { "epoch": 0.7622122403144301, "grad_norm": 0.598584771156311, "learning_rate": 9.916573515621636e-06, "loss": 0.4582, "step": 905 }, { "epoch": 0.7630544637843908, "grad_norm": 0.4719258248806, "learning_rate": 9.916037657364106e-06, "loss": 0.4617, "step": 906 }, { "epoch": 0.7638966872543514, "grad_norm": 0.515297532081604, "learning_rate": 9.915500098241836e-06, "loss": 0.4751, "step": 907 }, { "epoch": 0.7647389107243122, "grad_norm": 0.49206236004829407, "learning_rate": 9.914960838440812e-06, "loss": 0.4854, "step": 908 }, { "epoch": 0.7655811341942729, "grad_norm": 0.5045447945594788, "learning_rate": 9.914419878147611e-06, "loss": 0.4929, "step": 909 }, { "epoch": 0.7664233576642335, "grad_norm": 0.46059924364089966, "learning_rate": 9.913877217549395e-06, "loss": 0.4344, "step": 910 }, { "epoch": 0.7672655811341943, "grad_norm": 0.4695572853088379, "learning_rate": 9.913332856833913e-06, "loss": 0.4667, "step": 911 }, { "epoch": 0.768107804604155, "grad_norm": 0.5009487271308899, "learning_rate": 9.912786796189506e-06, "loss": 0.4845, "step": 912 }, { "epoch": 0.7689500280741156, "grad_norm": 0.5248982906341553, "learning_rate": 9.912239035805104e-06, "loss": 0.4504, "step": 913 }, { "epoch": 0.7697922515440764, "grad_norm": 0.6364567279815674, "learning_rate": 9.911689575870218e-06, "loss": 0.4653, "step": 914 }, { "epoch": 0.770634475014037, "grad_norm": 0.5651689171791077, "learning_rate": 9.911138416574955e-06, "loss": 0.4646, "step": 915 }, { "epoch": 0.7714766984839978, "grad_norm": 0.5725415945053101, "learning_rate": 9.910585558110006e-06, "loss": 0.4701, "step": 916 }, { "epoch": 0.7723189219539585, "grad_norm": 0.5373462438583374, "learning_rate": 9.910031000666652e-06, "loss": 0.4861, "step": 917 }, { "epoch": 0.7731611454239191, "grad_norm": 0.45060834288597107, "learning_rate": 9.909474744436759e-06, "loss": 0.4458, "step": 918 }, { "epoch": 0.7740033688938799, "grad_norm": 0.4726708233356476, "learning_rate": 9.90891678961278e-06, "loss": 0.4397, "step": 919 }, { "epoch": 0.7748455923638405, "grad_norm": 0.44616591930389404, "learning_rate": 9.908357136387758e-06, "loss": 0.4464, "step": 920 }, { "epoch": 0.7756878158338012, "grad_norm": 0.5173125863075256, "learning_rate": 9.907795784955327e-06, "loss": 0.4605, "step": 921 }, { "epoch": 0.776530039303762, "grad_norm": 0.5849637985229492, "learning_rate": 9.907232735509704e-06, "loss": 0.4697, "step": 922 }, { "epoch": 0.7773722627737226, "grad_norm": 0.47747382521629333, "learning_rate": 9.906667988245694e-06, "loss": 0.4884, "step": 923 }, { "epoch": 0.7782144862436833, "grad_norm": 0.5743599534034729, "learning_rate": 9.906101543358687e-06, "loss": 0.5087, "step": 924 }, { "epoch": 0.779056709713644, "grad_norm": 0.4951550364494324, "learning_rate": 9.905533401044666e-06, "loss": 0.4327, "step": 925 }, { "epoch": 0.7798989331836047, "grad_norm": 0.5402474403381348, "learning_rate": 9.9049635615002e-06, "loss": 0.4869, "step": 926 }, { "epoch": 0.7807411566535654, "grad_norm": 0.5782483220100403, "learning_rate": 9.90439202492244e-06, "loss": 0.454, "step": 927 }, { "epoch": 0.7815833801235261, "grad_norm": 0.5131528973579407, "learning_rate": 9.90381879150913e-06, "loss": 0.4511, "step": 928 }, { "epoch": 0.7824256035934868, "grad_norm": 0.4716830551624298, "learning_rate": 9.9032438614586e-06, "loss": 0.4644, "step": 929 }, { "epoch": 0.7832678270634476, "grad_norm": 0.5034114122390747, "learning_rate": 9.902667234969764e-06, "loss": 0.4587, "step": 930 }, { "epoch": 0.7841100505334082, "grad_norm": 0.483680784702301, "learning_rate": 9.902088912242124e-06, "loss": 0.4858, "step": 931 }, { "epoch": 0.7849522740033689, "grad_norm": 0.5788254141807556, "learning_rate": 9.901508893475774e-06, "loss": 0.4805, "step": 932 }, { "epoch": 0.7857944974733296, "grad_norm": 0.4786970019340515, "learning_rate": 9.900927178871387e-06, "loss": 0.4614, "step": 933 }, { "epoch": 0.7866367209432903, "grad_norm": 0.43719249963760376, "learning_rate": 9.900343768630226e-06, "loss": 0.4664, "step": 934 }, { "epoch": 0.787478944413251, "grad_norm": 0.5963888764381409, "learning_rate": 9.899758662954143e-06, "loss": 0.4828, "step": 935 }, { "epoch": 0.7883211678832117, "grad_norm": 0.4644656181335449, "learning_rate": 9.899171862045572e-06, "loss": 0.465, "step": 936 }, { "epoch": 0.7891633913531724, "grad_norm": 0.4614083766937256, "learning_rate": 9.898583366107539e-06, "loss": 0.4776, "step": 937 }, { "epoch": 0.790005614823133, "grad_norm": 0.5263941884040833, "learning_rate": 9.897993175343652e-06, "loss": 0.4659, "step": 938 }, { "epoch": 0.7908478382930938, "grad_norm": 0.5272285342216492, "learning_rate": 9.897401289958105e-06, "loss": 0.4605, "step": 939 }, { "epoch": 0.7916900617630545, "grad_norm": 0.4992671310901642, "learning_rate": 9.896807710155683e-06, "loss": 0.4388, "step": 940 }, { "epoch": 0.7925322852330151, "grad_norm": 0.5701236724853516, "learning_rate": 9.896212436141755e-06, "loss": 0.4353, "step": 941 }, { "epoch": 0.7933745087029759, "grad_norm": 0.4643474817276001, "learning_rate": 9.895615468122272e-06, "loss": 0.4678, "step": 942 }, { "epoch": 0.7942167321729365, "grad_norm": 0.5412276983261108, "learning_rate": 9.89501680630378e-06, "loss": 0.4658, "step": 943 }, { "epoch": 0.7950589556428973, "grad_norm": 0.535186231136322, "learning_rate": 9.8944164508934e-06, "loss": 0.4674, "step": 944 }, { "epoch": 0.795901179112858, "grad_norm": 0.47683751583099365, "learning_rate": 9.893814402098847e-06, "loss": 0.4636, "step": 945 }, { "epoch": 0.7967434025828186, "grad_norm": 0.5672494173049927, "learning_rate": 9.893210660128423e-06, "loss": 0.494, "step": 946 }, { "epoch": 0.7975856260527794, "grad_norm": 0.4505089819431305, "learning_rate": 9.892605225191005e-06, "loss": 0.461, "step": 947 }, { "epoch": 0.79842784952274, "grad_norm": 0.47686219215393066, "learning_rate": 9.891998097496071e-06, "loss": 0.4476, "step": 948 }, { "epoch": 0.7992700729927007, "grad_norm": 0.5206319093704224, "learning_rate": 9.891389277253672e-06, "loss": 0.4544, "step": 949 }, { "epoch": 0.8001122964626615, "grad_norm": 0.4414949119091034, "learning_rate": 9.89077876467445e-06, "loss": 0.4644, "step": 950 }, { "epoch": 0.8009545199326221, "grad_norm": 0.43398547172546387, "learning_rate": 9.890166559969632e-06, "loss": 0.459, "step": 951 }, { "epoch": 0.8017967434025828, "grad_norm": 0.5424755811691284, "learning_rate": 9.88955266335103e-06, "loss": 0.4673, "step": 952 }, { "epoch": 0.8026389668725435, "grad_norm": 0.4726047217845917, "learning_rate": 9.888937075031045e-06, "loss": 0.4671, "step": 953 }, { "epoch": 0.8034811903425042, "grad_norm": 0.48992475867271423, "learning_rate": 9.888319795222654e-06, "loss": 0.4625, "step": 954 }, { "epoch": 0.8043234138124649, "grad_norm": 0.4926284849643707, "learning_rate": 9.887700824139432e-06, "loss": 0.4666, "step": 955 }, { "epoch": 0.8051656372824256, "grad_norm": 0.49296411871910095, "learning_rate": 9.887080161995526e-06, "loss": 0.4908, "step": 956 }, { "epoch": 0.8060078607523863, "grad_norm": 0.4417799413204193, "learning_rate": 9.886457809005681e-06, "loss": 0.4716, "step": 957 }, { "epoch": 0.8068500842223469, "grad_norm": 0.525761067867279, "learning_rate": 9.885833765385213e-06, "loss": 0.4612, "step": 958 }, { "epoch": 0.8076923076923077, "grad_norm": 0.47954583168029785, "learning_rate": 9.885208031350038e-06, "loss": 0.4855, "step": 959 }, { "epoch": 0.8085345311622684, "grad_norm": 0.4634093940258026, "learning_rate": 9.884580607116642e-06, "loss": 0.4387, "step": 960 }, { "epoch": 0.8093767546322291, "grad_norm": 0.575893223285675, "learning_rate": 9.883951492902109e-06, "loss": 0.4893, "step": 961 }, { "epoch": 0.8102189781021898, "grad_norm": 0.44635009765625, "learning_rate": 9.883320688924099e-06, "loss": 0.4695, "step": 962 }, { "epoch": 0.8110612015721504, "grad_norm": 0.5045120716094971, "learning_rate": 9.882688195400858e-06, "loss": 0.4599, "step": 963 }, { "epoch": 0.8119034250421112, "grad_norm": 0.48984596133232117, "learning_rate": 9.882054012551221e-06, "loss": 0.5102, "step": 964 }, { "epoch": 0.8127456485120719, "grad_norm": 0.4571196138858795, "learning_rate": 9.881418140594604e-06, "loss": 0.4433, "step": 965 }, { "epoch": 0.8135878719820325, "grad_norm": 0.5153687000274658, "learning_rate": 9.880780579751005e-06, "loss": 0.4492, "step": 966 }, { "epoch": 0.8144300954519933, "grad_norm": 0.49311670660972595, "learning_rate": 9.880141330241012e-06, "loss": 0.4686, "step": 967 }, { "epoch": 0.815272318921954, "grad_norm": 0.40269705653190613, "learning_rate": 9.879500392285792e-06, "loss": 0.4672, "step": 968 }, { "epoch": 0.8161145423919146, "grad_norm": 0.47277525067329407, "learning_rate": 9.878857766107101e-06, "loss": 0.456, "step": 969 }, { "epoch": 0.8169567658618754, "grad_norm": 0.5428763628005981, "learning_rate": 9.878213451927275e-06, "loss": 0.4771, "step": 970 }, { "epoch": 0.817798989331836, "grad_norm": 0.5426615476608276, "learning_rate": 9.877567449969236e-06, "loss": 0.4851, "step": 971 }, { "epoch": 0.8186412128017967, "grad_norm": 0.49001550674438477, "learning_rate": 9.876919760456492e-06, "loss": 0.4574, "step": 972 }, { "epoch": 0.8194834362717575, "grad_norm": 0.6090008616447449, "learning_rate": 9.876270383613126e-06, "loss": 0.4731, "step": 973 }, { "epoch": 0.8203256597417181, "grad_norm": 0.5504104495048523, "learning_rate": 9.875619319663818e-06, "loss": 0.4798, "step": 974 }, { "epoch": 0.8211678832116789, "grad_norm": 0.47188931703567505, "learning_rate": 9.874966568833822e-06, "loss": 0.4336, "step": 975 }, { "epoch": 0.8220101066816395, "grad_norm": 0.5309103727340698, "learning_rate": 9.87431213134898e-06, "loss": 0.478, "step": 976 }, { "epoch": 0.8228523301516002, "grad_norm": 0.4661477208137512, "learning_rate": 9.873656007435714e-06, "loss": 0.4603, "step": 977 }, { "epoch": 0.823694553621561, "grad_norm": 0.5204543471336365, "learning_rate": 9.872998197321033e-06, "loss": 0.4771, "step": 978 }, { "epoch": 0.8245367770915216, "grad_norm": 0.43733739852905273, "learning_rate": 9.872338701232527e-06, "loss": 0.4593, "step": 979 }, { "epoch": 0.8253790005614823, "grad_norm": 0.4715561866760254, "learning_rate": 9.871677519398372e-06, "loss": 0.4548, "step": 980 }, { "epoch": 0.826221224031443, "grad_norm": 0.49889642000198364, "learning_rate": 9.871014652047324e-06, "loss": 0.4301, "step": 981 }, { "epoch": 0.8270634475014037, "grad_norm": 0.5677552819252014, "learning_rate": 9.870350099408725e-06, "loss": 0.4783, "step": 982 }, { "epoch": 0.8279056709713644, "grad_norm": 0.4806675910949707, "learning_rate": 9.869683861712497e-06, "loss": 0.4743, "step": 983 }, { "epoch": 0.8287478944413251, "grad_norm": 0.4818889796733856, "learning_rate": 9.86901593918915e-06, "loss": 0.4511, "step": 984 }, { "epoch": 0.8295901179112858, "grad_norm": 0.42305275797843933, "learning_rate": 9.868346332069771e-06, "loss": 0.4331, "step": 985 }, { "epoch": 0.8304323413812464, "grad_norm": 0.4770194888114929, "learning_rate": 9.867675040586035e-06, "loss": 0.4667, "step": 986 }, { "epoch": 0.8312745648512072, "grad_norm": 0.46162793040275574, "learning_rate": 9.867002064970193e-06, "loss": 0.4704, "step": 987 }, { "epoch": 0.8321167883211679, "grad_norm": 0.48674070835113525, "learning_rate": 9.866327405455088e-06, "loss": 0.4525, "step": 988 }, { "epoch": 0.8329590117911286, "grad_norm": 0.41060981154441833, "learning_rate": 9.865651062274137e-06, "loss": 0.4445, "step": 989 }, { "epoch": 0.8338012352610893, "grad_norm": 0.44868072867393494, "learning_rate": 9.864973035661345e-06, "loss": 0.4842, "step": 990 }, { "epoch": 0.83464345873105, "grad_norm": 0.48845505714416504, "learning_rate": 9.864293325851297e-06, "loss": 0.4825, "step": 991 }, { "epoch": 0.8354856822010107, "grad_norm": 0.4033015966415405, "learning_rate": 9.863611933079162e-06, "loss": 0.4577, "step": 992 }, { "epoch": 0.8363279056709714, "grad_norm": 0.42306795716285706, "learning_rate": 9.862928857580688e-06, "loss": 0.4505, "step": 993 }, { "epoch": 0.837170129140932, "grad_norm": 0.4473508596420288, "learning_rate": 9.862244099592208e-06, "loss": 0.4917, "step": 994 }, { "epoch": 0.8380123526108928, "grad_norm": 0.4112057387828827, "learning_rate": 9.861557659350639e-06, "loss": 0.4786, "step": 995 }, { "epoch": 0.8388545760808535, "grad_norm": 0.5104999542236328, "learning_rate": 9.860869537093474e-06, "loss": 0.4684, "step": 996 }, { "epoch": 0.8396967995508141, "grad_norm": 0.46732455492019653, "learning_rate": 9.860179733058796e-06, "loss": 0.4414, "step": 997 }, { "epoch": 0.8405390230207749, "grad_norm": 0.5455535650253296, "learning_rate": 9.85948824748526e-06, "loss": 0.445, "step": 998 }, { "epoch": 0.8413812464907355, "grad_norm": 0.6222993731498718, "learning_rate": 9.858795080612113e-06, "loss": 0.4757, "step": 999 }, { "epoch": 0.8422234699606962, "grad_norm": 0.5075825452804565, "learning_rate": 9.858100232679176e-06, "loss": 0.4574, "step": 1000 }, { "epoch": 0.843065693430657, "grad_norm": 0.5645561814308167, "learning_rate": 9.857403703926853e-06, "loss": 0.4688, "step": 1001 }, { "epoch": 0.8439079169006176, "grad_norm": 0.6279970407485962, "learning_rate": 9.856705494596135e-06, "loss": 0.4684, "step": 1002 }, { "epoch": 0.8447501403705783, "grad_norm": 0.47694310545921326, "learning_rate": 9.856005604928588e-06, "loss": 0.4625, "step": 1003 }, { "epoch": 0.845592363840539, "grad_norm": 0.6660037040710449, "learning_rate": 9.855304035166361e-06, "loss": 0.4532, "step": 1004 }, { "epoch": 0.8464345873104997, "grad_norm": 0.4602040946483612, "learning_rate": 9.85460078555219e-06, "loss": 0.4548, "step": 1005 }, { "epoch": 0.8472768107804605, "grad_norm": 0.5448400974273682, "learning_rate": 9.853895856329381e-06, "loss": 0.464, "step": 1006 }, { "epoch": 0.8481190342504211, "grad_norm": 0.46972760558128357, "learning_rate": 9.853189247741832e-06, "loss": 0.4476, "step": 1007 }, { "epoch": 0.8489612577203818, "grad_norm": 0.430246502161026, "learning_rate": 9.852480960034015e-06, "loss": 0.4759, "step": 1008 }, { "epoch": 0.8498034811903425, "grad_norm": 0.4795200824737549, "learning_rate": 9.851770993450987e-06, "loss": 0.4724, "step": 1009 }, { "epoch": 0.8506457046603032, "grad_norm": 0.4719507396221161, "learning_rate": 9.851059348238381e-06, "loss": 0.4699, "step": 1010 }, { "epoch": 0.8514879281302639, "grad_norm": 0.49389660358428955, "learning_rate": 9.850346024642418e-06, "loss": 0.4827, "step": 1011 }, { "epoch": 0.8523301516002246, "grad_norm": 0.44759175181388855, "learning_rate": 9.849631022909893e-06, "loss": 0.4708, "step": 1012 }, { "epoch": 0.8531723750701853, "grad_norm": 0.46587780117988586, "learning_rate": 9.848914343288188e-06, "loss": 0.4707, "step": 1013 }, { "epoch": 0.8540145985401459, "grad_norm": 0.49081918597221375, "learning_rate": 9.848195986025258e-06, "loss": 0.4742, "step": 1014 }, { "epoch": 0.8548568220101067, "grad_norm": 0.4355715215206146, "learning_rate": 9.847475951369642e-06, "loss": 0.4417, "step": 1015 }, { "epoch": 0.8556990454800674, "grad_norm": 0.4987232983112335, "learning_rate": 9.846754239570465e-06, "loss": 0.452, "step": 1016 }, { "epoch": 0.856541268950028, "grad_norm": 0.46841534972190857, "learning_rate": 9.846030850877419e-06, "loss": 0.4714, "step": 1017 }, { "epoch": 0.8573834924199888, "grad_norm": 0.498003751039505, "learning_rate": 9.84530578554079e-06, "loss": 0.5067, "step": 1018 }, { "epoch": 0.8582257158899494, "grad_norm": 0.43070635199546814, "learning_rate": 9.844579043811437e-06, "loss": 0.474, "step": 1019 }, { "epoch": 0.8590679393599102, "grad_norm": 0.5455447435379028, "learning_rate": 9.8438506259408e-06, "loss": 0.4579, "step": 1020 }, { "epoch": 0.8599101628298709, "grad_norm": 0.49477139115333557, "learning_rate": 9.843120532180896e-06, "loss": 0.5024, "step": 1021 }, { "epoch": 0.8607523862998315, "grad_norm": 0.4833908677101135, "learning_rate": 9.842388762784331e-06, "loss": 0.4498, "step": 1022 }, { "epoch": 0.8615946097697923, "grad_norm": 0.5051225423812866, "learning_rate": 9.84165531800428e-06, "loss": 0.4435, "step": 1023 }, { "epoch": 0.862436833239753, "grad_norm": 0.4352179765701294, "learning_rate": 9.840920198094503e-06, "loss": 0.4779, "step": 1024 }, { "epoch": 0.8632790567097136, "grad_norm": 0.47995612025260925, "learning_rate": 9.84018340330934e-06, "loss": 0.4505, "step": 1025 }, { "epoch": 0.8641212801796744, "grad_norm": 0.4928273856639862, "learning_rate": 9.839444933903711e-06, "loss": 0.4654, "step": 1026 }, { "epoch": 0.864963503649635, "grad_norm": 0.5084526538848877, "learning_rate": 9.838704790133108e-06, "loss": 0.4604, "step": 1027 }, { "epoch": 0.8658057271195957, "grad_norm": 0.44980600476264954, "learning_rate": 9.837962972253613e-06, "loss": 0.4627, "step": 1028 }, { "epoch": 0.8666479505895565, "grad_norm": 0.5275464653968811, "learning_rate": 9.83721948052188e-06, "loss": 0.4734, "step": 1029 }, { "epoch": 0.8674901740595171, "grad_norm": 0.39917150139808655, "learning_rate": 9.836474315195148e-06, "loss": 0.4521, "step": 1030 }, { "epoch": 0.8683323975294778, "grad_norm": 0.504328727722168, "learning_rate": 9.835727476531228e-06, "loss": 0.4746, "step": 1031 }, { "epoch": 0.8691746209994385, "grad_norm": 0.447465717792511, "learning_rate": 9.834978964788512e-06, "loss": 0.4679, "step": 1032 }, { "epoch": 0.8700168444693992, "grad_norm": 0.5234307646751404, "learning_rate": 9.834228780225976e-06, "loss": 0.5061, "step": 1033 }, { "epoch": 0.87085906793936, "grad_norm": 0.43534836173057556, "learning_rate": 9.83347692310317e-06, "loss": 0.4831, "step": 1034 }, { "epoch": 0.8717012914093206, "grad_norm": 0.5181679725646973, "learning_rate": 9.832723393680222e-06, "loss": 0.4571, "step": 1035 }, { "epoch": 0.8725435148792813, "grad_norm": 0.4691329300403595, "learning_rate": 9.831968192217841e-06, "loss": 0.4664, "step": 1036 }, { "epoch": 0.873385738349242, "grad_norm": 0.5027342438697815, "learning_rate": 9.831211318977316e-06, "loss": 0.4564, "step": 1037 }, { "epoch": 0.8742279618192027, "grad_norm": 0.4205072522163391, "learning_rate": 9.830452774220507e-06, "loss": 0.4534, "step": 1038 }, { "epoch": 0.8750701852891634, "grad_norm": 0.4738374650478363, "learning_rate": 9.829692558209864e-06, "loss": 0.4352, "step": 1039 }, { "epoch": 0.8759124087591241, "grad_norm": 0.505933940410614, "learning_rate": 9.828930671208403e-06, "loss": 0.4533, "step": 1040 }, { "epoch": 0.8767546322290848, "grad_norm": 0.4729403555393219, "learning_rate": 9.828167113479728e-06, "loss": 0.459, "step": 1041 }, { "epoch": 0.8775968556990454, "grad_norm": 0.4866616725921631, "learning_rate": 9.827401885288014e-06, "loss": 0.4479, "step": 1042 }, { "epoch": 0.8784390791690062, "grad_norm": 0.5079085826873779, "learning_rate": 9.826634986898019e-06, "loss": 0.4628, "step": 1043 }, { "epoch": 0.8792813026389669, "grad_norm": 0.5567856431007385, "learning_rate": 9.825866418575074e-06, "loss": 0.4622, "step": 1044 }, { "epoch": 0.8801235261089275, "grad_norm": 0.5741145610809326, "learning_rate": 9.825096180585093e-06, "loss": 0.4433, "step": 1045 }, { "epoch": 0.8809657495788883, "grad_norm": 0.5637805461883545, "learning_rate": 9.824324273194564e-06, "loss": 0.4763, "step": 1046 }, { "epoch": 0.881807973048849, "grad_norm": 0.45682552456855774, "learning_rate": 9.823550696670554e-06, "loss": 0.4569, "step": 1047 }, { "epoch": 0.8826501965188096, "grad_norm": 0.5551769137382507, "learning_rate": 9.822775451280708e-06, "loss": 0.4492, "step": 1048 }, { "epoch": 0.8834924199887704, "grad_norm": 0.49221324920654297, "learning_rate": 9.821998537293246e-06, "loss": 0.4671, "step": 1049 }, { "epoch": 0.884334643458731, "grad_norm": 0.5118264555931091, "learning_rate": 9.821219954976968e-06, "loss": 0.469, "step": 1050 }, { "epoch": 0.8851768669286918, "grad_norm": 0.4409869313240051, "learning_rate": 9.82043970460125e-06, "loss": 0.4614, "step": 1051 }, { "epoch": 0.8860190903986525, "grad_norm": 0.5592017769813538, "learning_rate": 9.819657786436045e-06, "loss": 0.4537, "step": 1052 }, { "epoch": 0.8868613138686131, "grad_norm": 0.508579671382904, "learning_rate": 9.818874200751883e-06, "loss": 0.4563, "step": 1053 }, { "epoch": 0.8877035373385739, "grad_norm": 0.5337381362915039, "learning_rate": 9.818088947819872e-06, "loss": 0.4977, "step": 1054 }, { "epoch": 0.8885457608085345, "grad_norm": 0.6358832716941833, "learning_rate": 9.817302027911694e-06, "loss": 0.4648, "step": 1055 }, { "epoch": 0.8893879842784952, "grad_norm": 0.5693139433860779, "learning_rate": 9.816513441299614e-06, "loss": 0.4825, "step": 1056 }, { "epoch": 0.890230207748456, "grad_norm": 0.4904390871524811, "learning_rate": 9.815723188256465e-06, "loss": 0.4383, "step": 1057 }, { "epoch": 0.8910724312184166, "grad_norm": 0.5208525061607361, "learning_rate": 9.814931269055665e-06, "loss": 0.4531, "step": 1058 }, { "epoch": 0.8919146546883773, "grad_norm": 0.5323155522346497, "learning_rate": 9.814137683971201e-06, "loss": 0.4562, "step": 1059 }, { "epoch": 0.892756878158338, "grad_norm": 0.5167281031608582, "learning_rate": 9.813342433277642e-06, "loss": 0.4823, "step": 1060 }, { "epoch": 0.8935991016282987, "grad_norm": 0.5660203695297241, "learning_rate": 9.81254551725013e-06, "loss": 0.4705, "step": 1061 }, { "epoch": 0.8944413250982594, "grad_norm": 0.4889480173587799, "learning_rate": 9.811746936164383e-06, "loss": 0.4617, "step": 1062 }, { "epoch": 0.8952835485682201, "grad_norm": 0.49995627999305725, "learning_rate": 9.810946690296699e-06, "loss": 0.4501, "step": 1063 }, { "epoch": 0.8961257720381808, "grad_norm": 0.5357601046562195, "learning_rate": 9.81014477992395e-06, "loss": 0.4374, "step": 1064 }, { "epoch": 0.8969679955081415, "grad_norm": 0.4474814236164093, "learning_rate": 9.809341205323578e-06, "loss": 0.4563, "step": 1065 }, { "epoch": 0.8978102189781022, "grad_norm": 0.5200715661048889, "learning_rate": 9.808535966773611e-06, "loss": 0.4549, "step": 1066 }, { "epoch": 0.8986524424480629, "grad_norm": 0.5115195512771606, "learning_rate": 9.807729064552647e-06, "loss": 0.4322, "step": 1067 }, { "epoch": 0.8994946659180236, "grad_norm": 0.5456582307815552, "learning_rate": 9.80692049893986e-06, "loss": 0.4435, "step": 1068 }, { "epoch": 0.9003368893879843, "grad_norm": 0.4813361167907715, "learning_rate": 9.806110270215e-06, "loss": 0.468, "step": 1069 }, { "epoch": 0.9011791128579449, "grad_norm": 0.5605647563934326, "learning_rate": 9.80529837865839e-06, "loss": 0.4668, "step": 1070 }, { "epoch": 0.9020213363279057, "grad_norm": 0.49479940533638, "learning_rate": 9.804484824550933e-06, "loss": 0.4359, "step": 1071 }, { "epoch": 0.9028635597978664, "grad_norm": 0.5246286988258362, "learning_rate": 9.803669608174102e-06, "loss": 0.4697, "step": 1072 }, { "epoch": 0.903705783267827, "grad_norm": 0.5778121948242188, "learning_rate": 9.802852729809953e-06, "loss": 0.4677, "step": 1073 }, { "epoch": 0.9045480067377878, "grad_norm": 0.4872368574142456, "learning_rate": 9.802034189741108e-06, "loss": 0.4964, "step": 1074 }, { "epoch": 0.9053902302077484, "grad_norm": 0.465804785490036, "learning_rate": 9.801213988250769e-06, "loss": 0.4788, "step": 1075 }, { "epoch": 0.9062324536777091, "grad_norm": 0.5263156890869141, "learning_rate": 9.80039212562271e-06, "loss": 0.4608, "step": 1076 }, { "epoch": 0.9070746771476699, "grad_norm": 0.5131971836090088, "learning_rate": 9.799568602141283e-06, "loss": 0.4572, "step": 1077 }, { "epoch": 0.9079169006176305, "grad_norm": 0.454851359128952, "learning_rate": 9.798743418091411e-06, "loss": 0.4795, "step": 1078 }, { "epoch": 0.9087591240875912, "grad_norm": 0.5148426294326782, "learning_rate": 9.797916573758599e-06, "loss": 0.456, "step": 1079 }, { "epoch": 0.909601347557552, "grad_norm": 0.4767214059829712, "learning_rate": 9.797088069428914e-06, "loss": 0.4622, "step": 1080 }, { "epoch": 0.9104435710275126, "grad_norm": 0.49247437715530396, "learning_rate": 9.79625790538901e-06, "loss": 0.4486, "step": 1081 }, { "epoch": 0.9112857944974734, "grad_norm": 0.4621976912021637, "learning_rate": 9.795426081926106e-06, "loss": 0.4326, "step": 1082 }, { "epoch": 0.912128017967434, "grad_norm": 0.5015374422073364, "learning_rate": 9.794592599328e-06, "loss": 0.4577, "step": 1083 }, { "epoch": 0.9129702414373947, "grad_norm": 0.5998482704162598, "learning_rate": 9.793757457883062e-06, "loss": 0.451, "step": 1084 }, { "epoch": 0.9138124649073555, "grad_norm": 0.5711812973022461, "learning_rate": 9.792920657880236e-06, "loss": 0.4633, "step": 1085 }, { "epoch": 0.9146546883773161, "grad_norm": 0.5440461039543152, "learning_rate": 9.792082199609043e-06, "loss": 0.4527, "step": 1086 }, { "epoch": 0.9154969118472768, "grad_norm": 0.5066556334495544, "learning_rate": 9.791242083359574e-06, "loss": 0.4457, "step": 1087 }, { "epoch": 0.9163391353172375, "grad_norm": 0.5630993843078613, "learning_rate": 9.790400309422493e-06, "loss": 0.4702, "step": 1088 }, { "epoch": 0.9171813587871982, "grad_norm": 0.47043001651763916, "learning_rate": 9.789556878089041e-06, "loss": 0.4558, "step": 1089 }, { "epoch": 0.9180235822571589, "grad_norm": 0.4855842888355255, "learning_rate": 9.78871178965103e-06, "loss": 0.447, "step": 1090 }, { "epoch": 0.9188658057271196, "grad_norm": 0.5265313386917114, "learning_rate": 9.787865044400848e-06, "loss": 0.4659, "step": 1091 }, { "epoch": 0.9197080291970803, "grad_norm": 0.4883732497692108, "learning_rate": 9.787016642631453e-06, "loss": 0.477, "step": 1092 }, { "epoch": 0.9205502526670409, "grad_norm": 0.5430927872657776, "learning_rate": 9.786166584636377e-06, "loss": 0.4801, "step": 1093 }, { "epoch": 0.9213924761370017, "grad_norm": 0.5566699504852295, "learning_rate": 9.785314870709726e-06, "loss": 0.4689, "step": 1094 }, { "epoch": 0.9222346996069624, "grad_norm": 0.5029814839363098, "learning_rate": 9.784461501146178e-06, "loss": 0.4839, "step": 1095 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5338709950447083, "learning_rate": 9.783606476240985e-06, "loss": 0.4675, "step": 1096 }, { "epoch": 0.9239191465468838, "grad_norm": 0.5104808807373047, "learning_rate": 9.78274979628997e-06, "loss": 0.4577, "step": 1097 }, { "epoch": 0.9247613700168444, "grad_norm": 0.5607502460479736, "learning_rate": 9.781891461589531e-06, "loss": 0.4581, "step": 1098 }, { "epoch": 0.9256035934868052, "grad_norm": 0.4793992340564728, "learning_rate": 9.781031472436636e-06, "loss": 0.4583, "step": 1099 }, { "epoch": 0.9264458169567659, "grad_norm": 0.4793117344379425, "learning_rate": 9.780169829128827e-06, "loss": 0.4554, "step": 1100 }, { "epoch": 0.9272880404267265, "grad_norm": 0.5163607001304626, "learning_rate": 9.779306531964217e-06, "loss": 0.4459, "step": 1101 }, { "epoch": 0.9281302638966873, "grad_norm": 0.5086690187454224, "learning_rate": 9.778441581241493e-06, "loss": 0.4888, "step": 1102 }, { "epoch": 0.928972487366648, "grad_norm": 0.4563394784927368, "learning_rate": 9.77757497725991e-06, "loss": 0.433, "step": 1103 }, { "epoch": 0.9298147108366086, "grad_norm": 0.4931967854499817, "learning_rate": 9.776706720319306e-06, "loss": 0.4642, "step": 1104 }, { "epoch": 0.9306569343065694, "grad_norm": 0.47397568821907043, "learning_rate": 9.775836810720075e-06, "loss": 0.4394, "step": 1105 }, { "epoch": 0.93149915777653, "grad_norm": 0.47663673758506775, "learning_rate": 9.774965248763193e-06, "loss": 0.4802, "step": 1106 }, { "epoch": 0.9323413812464907, "grad_norm": 0.466875821352005, "learning_rate": 9.774092034750206e-06, "loss": 0.4535, "step": 1107 }, { "epoch": 0.9331836047164515, "grad_norm": 0.5120683312416077, "learning_rate": 9.773217168983234e-06, "loss": 0.4607, "step": 1108 }, { "epoch": 0.9340258281864121, "grad_norm": 0.4187644124031067, "learning_rate": 9.77234065176496e-06, "loss": 0.443, "step": 1109 }, { "epoch": 0.9348680516563729, "grad_norm": 0.5385982990264893, "learning_rate": 9.771462483398648e-06, "loss": 0.4846, "step": 1110 }, { "epoch": 0.9357102751263335, "grad_norm": 0.5422916412353516, "learning_rate": 9.770582664188126e-06, "loss": 0.4923, "step": 1111 }, { "epoch": 0.9365524985962942, "grad_norm": 0.47451841831207275, "learning_rate": 9.769701194437799e-06, "loss": 0.4604, "step": 1112 }, { "epoch": 0.937394722066255, "grad_norm": 0.5698213577270508, "learning_rate": 9.768818074452642e-06, "loss": 0.4587, "step": 1113 }, { "epoch": 0.9382369455362156, "grad_norm": 0.4502493143081665, "learning_rate": 9.767933304538196e-06, "loss": 0.4442, "step": 1114 }, { "epoch": 0.9390791690061763, "grad_norm": 0.5210639834403992, "learning_rate": 9.767046885000575e-06, "loss": 0.4491, "step": 1115 }, { "epoch": 0.939921392476137, "grad_norm": 0.6040498614311218, "learning_rate": 9.76615881614647e-06, "loss": 0.4849, "step": 1116 }, { "epoch": 0.9407636159460977, "grad_norm": 0.44690412282943726, "learning_rate": 9.765269098283132e-06, "loss": 0.4766, "step": 1117 }, { "epoch": 0.9416058394160584, "grad_norm": 0.5467404723167419, "learning_rate": 9.764377731718393e-06, "loss": 0.4278, "step": 1118 }, { "epoch": 0.9424480628860191, "grad_norm": 0.5923998951911926, "learning_rate": 9.763484716760649e-06, "loss": 0.4589, "step": 1119 }, { "epoch": 0.9432902863559798, "grad_norm": 0.5820344090461731, "learning_rate": 9.762590053718866e-06, "loss": 0.4776, "step": 1120 }, { "epoch": 0.9441325098259404, "grad_norm": 0.542753279209137, "learning_rate": 9.761693742902584e-06, "loss": 0.4523, "step": 1121 }, { "epoch": 0.9449747332959012, "grad_norm": 0.5885534882545471, "learning_rate": 9.76079578462191e-06, "loss": 0.447, "step": 1122 }, { "epoch": 0.9458169567658619, "grad_norm": 0.53839510679245, "learning_rate": 9.759896179187523e-06, "loss": 0.4816, "step": 1123 }, { "epoch": 0.9466591802358225, "grad_norm": 0.6585684418678284, "learning_rate": 9.758994926910671e-06, "loss": 0.4525, "step": 1124 }, { "epoch": 0.9475014037057833, "grad_norm": 0.6508302092552185, "learning_rate": 9.758092028103173e-06, "loss": 0.4707, "step": 1125 }, { "epoch": 0.9483436271757439, "grad_norm": 0.5071263909339905, "learning_rate": 9.757187483077413e-06, "loss": 0.4542, "step": 1126 }, { "epoch": 0.9491858506457047, "grad_norm": 0.5404795408248901, "learning_rate": 9.75628129214635e-06, "loss": 0.447, "step": 1127 }, { "epoch": 0.9500280741156654, "grad_norm": 0.5309326648712158, "learning_rate": 9.755373455623513e-06, "loss": 0.4601, "step": 1128 }, { "epoch": 0.950870297585626, "grad_norm": 0.4837949872016907, "learning_rate": 9.754463973822993e-06, "loss": 0.4622, "step": 1129 }, { "epoch": 0.9517125210555868, "grad_norm": 0.5418076515197754, "learning_rate": 9.753552847059459e-06, "loss": 0.4384, "step": 1130 }, { "epoch": 0.9525547445255474, "grad_norm": 0.4526481628417969, "learning_rate": 9.752640075648145e-06, "loss": 0.4832, "step": 1131 }, { "epoch": 0.9533969679955081, "grad_norm": 0.4950000047683716, "learning_rate": 9.751725659904853e-06, "loss": 0.4612, "step": 1132 }, { "epoch": 0.9542391914654689, "grad_norm": 0.5519930124282837, "learning_rate": 9.750809600145955e-06, "loss": 0.4232, "step": 1133 }, { "epoch": 0.9550814149354295, "grad_norm": 0.48338988423347473, "learning_rate": 9.749891896688392e-06, "loss": 0.4471, "step": 1134 }, { "epoch": 0.9559236384053902, "grad_norm": 0.5226576328277588, "learning_rate": 9.748972549849674e-06, "loss": 0.4434, "step": 1135 }, { "epoch": 0.956765861875351, "grad_norm": 0.4434978663921356, "learning_rate": 9.748051559947879e-06, "loss": 0.4684, "step": 1136 }, { "epoch": 0.9576080853453116, "grad_norm": 0.6407195329666138, "learning_rate": 9.747128927301654e-06, "loss": 0.4719, "step": 1137 }, { "epoch": 0.9584503088152723, "grad_norm": 0.467170774936676, "learning_rate": 9.746204652230215e-06, "loss": 0.4618, "step": 1138 }, { "epoch": 0.959292532285233, "grad_norm": 0.553452730178833, "learning_rate": 9.745278735053345e-06, "loss": 0.4833, "step": 1139 }, { "epoch": 0.9601347557551937, "grad_norm": 0.506420373916626, "learning_rate": 9.744351176091394e-06, "loss": 0.4835, "step": 1140 }, { "epoch": 0.9609769792251545, "grad_norm": 0.5042441487312317, "learning_rate": 9.743421975665281e-06, "loss": 0.4379, "step": 1141 }, { "epoch": 0.9618192026951151, "grad_norm": 0.5174967646598816, "learning_rate": 9.742491134096497e-06, "loss": 0.4356, "step": 1142 }, { "epoch": 0.9626614261650758, "grad_norm": 0.41739439964294434, "learning_rate": 9.741558651707092e-06, "loss": 0.4239, "step": 1143 }, { "epoch": 0.9635036496350365, "grad_norm": 0.5525065660476685, "learning_rate": 9.740624528819696e-06, "loss": 0.4723, "step": 1144 }, { "epoch": 0.9643458731049972, "grad_norm": 0.5253375172615051, "learning_rate": 9.739688765757493e-06, "loss": 0.4462, "step": 1145 }, { "epoch": 0.9651880965749579, "grad_norm": 0.5363443493843079, "learning_rate": 9.738751362844244e-06, "loss": 0.4362, "step": 1146 }, { "epoch": 0.9660303200449186, "grad_norm": 0.5195977091789246, "learning_rate": 9.737812320404271e-06, "loss": 0.4469, "step": 1147 }, { "epoch": 0.9668725435148793, "grad_norm": 0.48481205105781555, "learning_rate": 9.736871638762473e-06, "loss": 0.4599, "step": 1148 }, { "epoch": 0.9677147669848399, "grad_norm": 0.4530254006385803, "learning_rate": 9.735929318244306e-06, "loss": 0.4594, "step": 1149 }, { "epoch": 0.9685569904548007, "grad_norm": 0.4565606415271759, "learning_rate": 9.734985359175795e-06, "loss": 0.4579, "step": 1150 }, { "epoch": 0.9693992139247614, "grad_norm": 0.5946236252784729, "learning_rate": 9.734039761883536e-06, "loss": 0.4588, "step": 1151 }, { "epoch": 0.970241437394722, "grad_norm": 0.48041626811027527, "learning_rate": 9.733092526694687e-06, "loss": 0.4388, "step": 1152 }, { "epoch": 0.9710836608646828, "grad_norm": 0.5837200880050659, "learning_rate": 9.73214365393698e-06, "loss": 0.4388, "step": 1153 }, { "epoch": 0.9719258843346434, "grad_norm": 0.45728570222854614, "learning_rate": 9.731193143938704e-06, "loss": 0.4444, "step": 1154 }, { "epoch": 0.9727681078046042, "grad_norm": 0.49400603771209717, "learning_rate": 9.730240997028721e-06, "loss": 0.4269, "step": 1155 }, { "epoch": 0.9736103312745649, "grad_norm": 0.5450953841209412, "learning_rate": 9.729287213536458e-06, "loss": 0.4704, "step": 1156 }, { "epoch": 0.9744525547445255, "grad_norm": 0.4544368386268616, "learning_rate": 9.728331793791908e-06, "loss": 0.4822, "step": 1157 }, { "epoch": 0.9752947782144863, "grad_norm": 0.5892021656036377, "learning_rate": 9.727374738125628e-06, "loss": 0.4672, "step": 1158 }, { "epoch": 0.976137001684447, "grad_norm": 0.457746297121048, "learning_rate": 9.726416046868743e-06, "loss": 0.45, "step": 1159 }, { "epoch": 0.9769792251544076, "grad_norm": 0.51177978515625, "learning_rate": 9.725455720352945e-06, "loss": 0.4782, "step": 1160 }, { "epoch": 0.9778214486243684, "grad_norm": 0.47894999384880066, "learning_rate": 9.724493758910491e-06, "loss": 0.4559, "step": 1161 }, { "epoch": 0.978663672094329, "grad_norm": 0.46975401043891907, "learning_rate": 9.723530162874202e-06, "loss": 0.4509, "step": 1162 }, { "epoch": 0.9795058955642897, "grad_norm": 0.5324963331222534, "learning_rate": 9.722564932577465e-06, "loss": 0.4765, "step": 1163 }, { "epoch": 0.9803481190342505, "grad_norm": 0.5399520993232727, "learning_rate": 9.721598068354234e-06, "loss": 0.4568, "step": 1164 }, { "epoch": 0.9811903425042111, "grad_norm": 0.4360414147377014, "learning_rate": 9.720629570539029e-06, "loss": 0.4618, "step": 1165 }, { "epoch": 0.9820325659741718, "grad_norm": 0.5042654871940613, "learning_rate": 9.719659439466931e-06, "loss": 0.4476, "step": 1166 }, { "epoch": 0.9828747894441325, "grad_norm": 0.4801131784915924, "learning_rate": 9.71868767547359e-06, "loss": 0.4524, "step": 1167 }, { "epoch": 0.9837170129140932, "grad_norm": 0.49291130900382996, "learning_rate": 9.71771427889522e-06, "loss": 0.4513, "step": 1168 }, { "epoch": 0.9845592363840538, "grad_norm": 0.4933064877986908, "learning_rate": 9.716739250068598e-06, "loss": 0.4443, "step": 1169 }, { "epoch": 0.9854014598540146, "grad_norm": 0.5357077717781067, "learning_rate": 9.71576258933107e-06, "loss": 0.4781, "step": 1170 }, { "epoch": 0.9862436833239753, "grad_norm": 0.5210580825805664, "learning_rate": 9.714784297020541e-06, "loss": 0.4849, "step": 1171 }, { "epoch": 0.987085906793936, "grad_norm": 0.4982260763645172, "learning_rate": 9.713804373475484e-06, "loss": 0.4612, "step": 1172 }, { "epoch": 0.9879281302638967, "grad_norm": 0.5087005496025085, "learning_rate": 9.712822819034939e-06, "loss": 0.4764, "step": 1173 }, { "epoch": 0.9887703537338574, "grad_norm": 0.47384798526763916, "learning_rate": 9.711839634038502e-06, "loss": 0.476, "step": 1174 }, { "epoch": 0.9896125772038181, "grad_norm": 0.4493638575077057, "learning_rate": 9.710854818826341e-06, "loss": 0.4467, "step": 1175 }, { "epoch": 0.9904548006737788, "grad_norm": 0.5004761219024658, "learning_rate": 9.709868373739184e-06, "loss": 0.4378, "step": 1176 }, { "epoch": 0.9912970241437394, "grad_norm": 0.5071452260017395, "learning_rate": 9.708880299118326e-06, "loss": 0.456, "step": 1177 }, { "epoch": 0.9921392476137002, "grad_norm": 0.45394134521484375, "learning_rate": 9.707890595305621e-06, "loss": 0.4598, "step": 1178 }, { "epoch": 0.9929814710836609, "grad_norm": 0.4973567724227905, "learning_rate": 9.706899262643493e-06, "loss": 0.4773, "step": 1179 }, { "epoch": 0.9938236945536215, "grad_norm": 0.4536394476890564, "learning_rate": 9.705906301474922e-06, "loss": 0.4704, "step": 1180 }, { "epoch": 0.9946659180235823, "grad_norm": 0.4466317594051361, "learning_rate": 9.70491171214346e-06, "loss": 0.4474, "step": 1181 }, { "epoch": 0.9955081414935429, "grad_norm": 0.4195629060268402, "learning_rate": 9.703915494993215e-06, "loss": 0.4287, "step": 1182 }, { "epoch": 0.9963503649635036, "grad_norm": 0.5238949656486511, "learning_rate": 9.702917650368861e-06, "loss": 0.4734, "step": 1183 }, { "epoch": 0.9971925884334644, "grad_norm": 0.4340732991695404, "learning_rate": 9.701918178615637e-06, "loss": 0.4792, "step": 1184 }, { "epoch": 0.998034811903425, "grad_norm": 0.45991605520248413, "learning_rate": 9.700917080079342e-06, "loss": 0.4431, "step": 1185 }, { "epoch": 0.9988770353733858, "grad_norm": 0.5338580012321472, "learning_rate": 9.699914355106337e-06, "loss": 0.469, "step": 1186 }, { "epoch": 0.9997192588433464, "grad_norm": 0.45689693093299866, "learning_rate": 9.698910004043551e-06, "loss": 0.4349, "step": 1187 }, { "epoch": 1.000561482313307, "grad_norm": 0.8191329836845398, "learning_rate": 9.697904027238472e-06, "loss": 0.7107, "step": 1188 }, { "epoch": 1.0014037057832679, "grad_norm": 0.5428515076637268, "learning_rate": 9.696896425039147e-06, "loss": 0.4389, "step": 1189 }, { "epoch": 1.0022459292532284, "grad_norm": 0.5251046419143677, "learning_rate": 9.695887197794193e-06, "loss": 0.4122, "step": 1190 }, { "epoch": 1.0030881527231892, "grad_norm": 0.5156293511390686, "learning_rate": 9.694876345852784e-06, "loss": 0.4344, "step": 1191 }, { "epoch": 1.00393037619315, "grad_norm": 0.5264970064163208, "learning_rate": 9.693863869564656e-06, "loss": 0.4566, "step": 1192 }, { "epoch": 1.0047725996631107, "grad_norm": 0.4438953399658203, "learning_rate": 9.69284976928011e-06, "loss": 0.3724, "step": 1193 }, { "epoch": 1.0056148231330713, "grad_norm": 0.5444672107696533, "learning_rate": 9.691834045350007e-06, "loss": 0.4363, "step": 1194 }, { "epoch": 1.006457046603032, "grad_norm": 0.46607303619384766, "learning_rate": 9.69081669812577e-06, "loss": 0.4491, "step": 1195 }, { "epoch": 1.0072992700729928, "grad_norm": 0.6608579754829407, "learning_rate": 9.689797727959387e-06, "loss": 0.4185, "step": 1196 }, { "epoch": 1.0081414935429533, "grad_norm": 0.5215677618980408, "learning_rate": 9.688777135203397e-06, "loss": 0.44, "step": 1197 }, { "epoch": 1.0089837170129141, "grad_norm": 0.5729716420173645, "learning_rate": 9.687754920210915e-06, "loss": 0.3959, "step": 1198 }, { "epoch": 1.0098259404828749, "grad_norm": 0.5465942025184631, "learning_rate": 9.686731083335604e-06, "loss": 0.417, "step": 1199 }, { "epoch": 1.0106681639528354, "grad_norm": 0.5511727333068848, "learning_rate": 9.685705624931698e-06, "loss": 0.4189, "step": 1200 }, { "epoch": 1.0115103874227962, "grad_norm": 0.5469289422035217, "learning_rate": 9.684678545353985e-06, "loss": 0.4751, "step": 1201 }, { "epoch": 1.012352610892757, "grad_norm": 0.4392673969268799, "learning_rate": 9.683649844957819e-06, "loss": 0.3999, "step": 1202 }, { "epoch": 1.0131948343627175, "grad_norm": 0.7031635046005249, "learning_rate": 9.682619524099113e-06, "loss": 0.4574, "step": 1203 }, { "epoch": 1.0140370578326783, "grad_norm": 0.445004403591156, "learning_rate": 9.681587583134339e-06, "loss": 0.435, "step": 1204 }, { "epoch": 1.014879281302639, "grad_norm": 0.6174226403236389, "learning_rate": 9.680554022420531e-06, "loss": 0.4294, "step": 1205 }, { "epoch": 1.0157215047725996, "grad_norm": 0.47825944423675537, "learning_rate": 9.679518842315284e-06, "loss": 0.4168, "step": 1206 }, { "epoch": 1.0165637282425604, "grad_norm": 0.5130305290222168, "learning_rate": 9.678482043176752e-06, "loss": 0.3949, "step": 1207 }, { "epoch": 1.0174059517125211, "grad_norm": 0.532730758190155, "learning_rate": 9.67744362536365e-06, "loss": 0.4278, "step": 1208 }, { "epoch": 1.0182481751824817, "grad_norm": 0.5435667037963867, "learning_rate": 9.676403589235252e-06, "loss": 0.4123, "step": 1209 }, { "epoch": 1.0190903986524424, "grad_norm": 0.5348758697509766, "learning_rate": 9.675361935151395e-06, "loss": 0.4113, "step": 1210 }, { "epoch": 1.0199326221224032, "grad_norm": 0.5058181285858154, "learning_rate": 9.674318663472472e-06, "loss": 0.473, "step": 1211 }, { "epoch": 1.0207748455923638, "grad_norm": 0.504833459854126, "learning_rate": 9.673273774559435e-06, "loss": 0.3932, "step": 1212 }, { "epoch": 1.0216170690623245, "grad_norm": 0.522858202457428, "learning_rate": 9.672227268773802e-06, "loss": 0.4662, "step": 1213 }, { "epoch": 1.0224592925322853, "grad_norm": 0.48088929057121277, "learning_rate": 9.671179146477642e-06, "loss": 0.4255, "step": 1214 }, { "epoch": 1.0233015160022458, "grad_norm": 0.5142372846603394, "learning_rate": 9.670129408033589e-06, "loss": 0.4746, "step": 1215 }, { "epoch": 1.0241437394722066, "grad_norm": 0.43512871861457825, "learning_rate": 9.669078053804834e-06, "loss": 0.3998, "step": 1216 }, { "epoch": 1.0249859629421674, "grad_norm": 0.5050188899040222, "learning_rate": 9.66802508415513e-06, "loss": 0.4646, "step": 1217 }, { "epoch": 1.025828186412128, "grad_norm": 0.4166233539581299, "learning_rate": 9.666970499448783e-06, "loss": 0.3908, "step": 1218 }, { "epoch": 1.0266704098820887, "grad_norm": 0.46852126717567444, "learning_rate": 9.665914300050663e-06, "loss": 0.4432, "step": 1219 }, { "epoch": 1.0275126333520495, "grad_norm": 0.4415505528450012, "learning_rate": 9.664856486326197e-06, "loss": 0.4295, "step": 1220 }, { "epoch": 1.02835485682201, "grad_norm": 0.4543525278568268, "learning_rate": 9.66379705864137e-06, "loss": 0.4089, "step": 1221 }, { "epoch": 1.0291970802919708, "grad_norm": 0.4988309144973755, "learning_rate": 9.662736017362725e-06, "loss": 0.42, "step": 1222 }, { "epoch": 1.0300393037619315, "grad_norm": 0.50452721118927, "learning_rate": 9.661673362857367e-06, "loss": 0.4442, "step": 1223 }, { "epoch": 1.0308815272318923, "grad_norm": 0.616641104221344, "learning_rate": 9.660609095492953e-06, "loss": 0.4816, "step": 1224 }, { "epoch": 1.0317237507018528, "grad_norm": 0.4556799530982971, "learning_rate": 9.6595432156377e-06, "loss": 0.4293, "step": 1225 }, { "epoch": 1.0325659741718136, "grad_norm": 0.6814255118370056, "learning_rate": 9.65847572366039e-06, "loss": 0.4413, "step": 1226 }, { "epoch": 1.0334081976417744, "grad_norm": 0.4272933304309845, "learning_rate": 9.657406619930351e-06, "loss": 0.4308, "step": 1227 }, { "epoch": 1.034250421111735, "grad_norm": 0.5714081525802612, "learning_rate": 9.656335904817479e-06, "loss": 0.4507, "step": 1228 }, { "epoch": 1.0350926445816957, "grad_norm": 0.5789410471916199, "learning_rate": 9.65526357869222e-06, "loss": 0.4416, "step": 1229 }, { "epoch": 1.0359348680516565, "grad_norm": 0.4514707326889038, "learning_rate": 9.654189641925582e-06, "loss": 0.4487, "step": 1230 }, { "epoch": 1.036777091521617, "grad_norm": 0.4721042215824127, "learning_rate": 9.653114094889128e-06, "loss": 0.4154, "step": 1231 }, { "epoch": 1.0376193149915778, "grad_norm": 0.4756459593772888, "learning_rate": 9.652036937954979e-06, "loss": 0.4141, "step": 1232 }, { "epoch": 1.0384615384615385, "grad_norm": 0.4830354154109955, "learning_rate": 9.650958171495813e-06, "loss": 0.4772, "step": 1233 }, { "epoch": 1.039303761931499, "grad_norm": 0.4378911256790161, "learning_rate": 9.649877795884865e-06, "loss": 0.4178, "step": 1234 }, { "epoch": 1.0401459854014599, "grad_norm": 0.527038037776947, "learning_rate": 9.648795811495925e-06, "loss": 0.4402, "step": 1235 }, { "epoch": 1.0409882088714206, "grad_norm": 0.46200090646743774, "learning_rate": 9.647712218703345e-06, "loss": 0.4022, "step": 1236 }, { "epoch": 1.0418304323413812, "grad_norm": 0.5417786836624146, "learning_rate": 9.646627017882023e-06, "loss": 0.4408, "step": 1237 }, { "epoch": 1.042672655811342, "grad_norm": 0.4952846169471741, "learning_rate": 9.645540209407426e-06, "loss": 0.4114, "step": 1238 }, { "epoch": 1.0435148792813027, "grad_norm": 0.5089791417121887, "learning_rate": 9.644451793655567e-06, "loss": 0.4868, "step": 1239 }, { "epoch": 1.0443571027512633, "grad_norm": 0.43424633145332336, "learning_rate": 9.64336177100302e-06, "loss": 0.4063, "step": 1240 }, { "epoch": 1.045199326221224, "grad_norm": 0.5828599333763123, "learning_rate": 9.64227014182692e-06, "loss": 0.448, "step": 1241 }, { "epoch": 1.0460415496911848, "grad_norm": 0.48580554127693176, "learning_rate": 9.641176906504943e-06, "loss": 0.4434, "step": 1242 }, { "epoch": 1.0468837731611453, "grad_norm": 0.5283637046813965, "learning_rate": 9.640082065415333e-06, "loss": 0.3826, "step": 1243 }, { "epoch": 1.047725996631106, "grad_norm": 0.6389682292938232, "learning_rate": 9.63898561893689e-06, "loss": 0.4582, "step": 1244 }, { "epoch": 1.0485682201010669, "grad_norm": 0.4765123426914215, "learning_rate": 9.63788756744896e-06, "loss": 0.4347, "step": 1245 }, { "epoch": 1.0494104435710274, "grad_norm": 0.7300904989242554, "learning_rate": 9.636787911331452e-06, "loss": 0.4493, "step": 1246 }, { "epoch": 1.0502526670409882, "grad_norm": 0.49095314741134644, "learning_rate": 9.63568665096483e-06, "loss": 0.4364, "step": 1247 }, { "epoch": 1.051094890510949, "grad_norm": 0.5879659652709961, "learning_rate": 9.63458378673011e-06, "loss": 0.4528, "step": 1248 }, { "epoch": 1.0519371139809095, "grad_norm": 0.5385510325431824, "learning_rate": 9.633479319008862e-06, "loss": 0.434, "step": 1249 }, { "epoch": 1.0527793374508703, "grad_norm": 0.5122434496879578, "learning_rate": 9.632373248183217e-06, "loss": 0.3969, "step": 1250 }, { "epoch": 1.053621560920831, "grad_norm": 0.565267026424408, "learning_rate": 9.631265574635855e-06, "loss": 0.4379, "step": 1251 }, { "epoch": 1.0544637843907916, "grad_norm": 0.5680955648422241, "learning_rate": 9.63015629875001e-06, "loss": 0.4715, "step": 1252 }, { "epoch": 1.0553060078607523, "grad_norm": 0.42849525809288025, "learning_rate": 9.629045420909476e-06, "loss": 0.4195, "step": 1253 }, { "epoch": 1.0561482313307131, "grad_norm": 0.5465871095657349, "learning_rate": 9.627932941498594e-06, "loss": 0.4074, "step": 1254 }, { "epoch": 1.0569904548006739, "grad_norm": 0.49935638904571533, "learning_rate": 9.626818860902265e-06, "loss": 0.4763, "step": 1255 }, { "epoch": 1.0578326782706344, "grad_norm": 0.4615282416343689, "learning_rate": 9.62570317950594e-06, "loss": 0.4137, "step": 1256 }, { "epoch": 1.0586749017405952, "grad_norm": 0.5720377564430237, "learning_rate": 9.624585897695628e-06, "loss": 0.4376, "step": 1257 }, { "epoch": 1.059517125210556, "grad_norm": 0.5091979503631592, "learning_rate": 9.623467015857888e-06, "loss": 0.4742, "step": 1258 }, { "epoch": 1.0603593486805165, "grad_norm": 0.4695676267147064, "learning_rate": 9.622346534379834e-06, "loss": 0.4104, "step": 1259 }, { "epoch": 1.0612015721504773, "grad_norm": 0.5443032383918762, "learning_rate": 9.621224453649133e-06, "loss": 0.4381, "step": 1260 }, { "epoch": 1.062043795620438, "grad_norm": 0.490431547164917, "learning_rate": 9.620100774054006e-06, "loss": 0.4272, "step": 1261 }, { "epoch": 1.0628860190903986, "grad_norm": 0.4549565613269806, "learning_rate": 9.618975495983228e-06, "loss": 0.418, "step": 1262 }, { "epoch": 1.0637282425603594, "grad_norm": 0.49733585119247437, "learning_rate": 9.617848619826125e-06, "loss": 0.4356, "step": 1263 }, { "epoch": 1.0645704660303201, "grad_norm": 0.5667881965637207, "learning_rate": 9.616720145972576e-06, "loss": 0.4374, "step": 1264 }, { "epoch": 1.0654126895002807, "grad_norm": 0.46249035000801086, "learning_rate": 9.615590074813015e-06, "loss": 0.3975, "step": 1265 }, { "epoch": 1.0662549129702414, "grad_norm": 0.5633223652839661, "learning_rate": 9.614458406738427e-06, "loss": 0.4301, "step": 1266 }, { "epoch": 1.0670971364402022, "grad_norm": 0.5028617978096008, "learning_rate": 9.61332514214035e-06, "loss": 0.4506, "step": 1267 }, { "epoch": 1.0679393599101628, "grad_norm": 0.4664643704891205, "learning_rate": 9.612190281410873e-06, "loss": 0.4323, "step": 1268 }, { "epoch": 1.0687815833801235, "grad_norm": 0.5650041699409485, "learning_rate": 9.611053824942638e-06, "loss": 0.4366, "step": 1269 }, { "epoch": 1.0696238068500843, "grad_norm": 0.49832168221473694, "learning_rate": 9.609915773128842e-06, "loss": 0.471, "step": 1270 }, { "epoch": 1.0704660303200448, "grad_norm": 0.4530488848686218, "learning_rate": 9.608776126363227e-06, "loss": 0.3758, "step": 1271 }, { "epoch": 1.0713082537900056, "grad_norm": 0.46219685673713684, "learning_rate": 9.607634885040096e-06, "loss": 0.4336, "step": 1272 }, { "epoch": 1.0721504772599664, "grad_norm": 0.4902309477329254, "learning_rate": 9.606492049554297e-06, "loss": 0.4478, "step": 1273 }, { "epoch": 1.072992700729927, "grad_norm": 0.38795357942581177, "learning_rate": 9.605347620301231e-06, "loss": 0.4144, "step": 1274 }, { "epoch": 1.0738349241998877, "grad_norm": 0.4210546016693115, "learning_rate": 9.604201597676852e-06, "loss": 0.4265, "step": 1275 }, { "epoch": 1.0746771476698485, "grad_norm": 0.5192456841468811, "learning_rate": 9.603053982077662e-06, "loss": 0.4601, "step": 1276 }, { "epoch": 1.075519371139809, "grad_norm": 0.4593535363674164, "learning_rate": 9.601904773900719e-06, "loss": 0.4332, "step": 1277 }, { "epoch": 1.0763615946097698, "grad_norm": 0.4413716197013855, "learning_rate": 9.600753973543628e-06, "loss": 0.4291, "step": 1278 }, { "epoch": 1.0772038180797305, "grad_norm": 0.4190436899662018, "learning_rate": 9.599601581404546e-06, "loss": 0.4476, "step": 1279 }, { "epoch": 1.078046041549691, "grad_norm": 0.39402490854263306, "learning_rate": 9.598447597882181e-06, "loss": 0.4209, "step": 1280 }, { "epoch": 1.0788882650196518, "grad_norm": 0.4548156261444092, "learning_rate": 9.597292023375792e-06, "loss": 0.4177, "step": 1281 }, { "epoch": 1.0797304884896126, "grad_norm": 0.47732728719711304, "learning_rate": 9.59613485828519e-06, "loss": 0.4425, "step": 1282 }, { "epoch": 1.0805727119595732, "grad_norm": 0.4119926989078522, "learning_rate": 9.59497610301073e-06, "loss": 0.3949, "step": 1283 }, { "epoch": 1.081414935429534, "grad_norm": 0.4402223825454712, "learning_rate": 9.593815757953322e-06, "loss": 0.441, "step": 1284 }, { "epoch": 1.0822571588994947, "grad_norm": 0.4330277144908905, "learning_rate": 9.592653823514429e-06, "loss": 0.4283, "step": 1285 }, { "epoch": 1.0830993823694555, "grad_norm": 0.4392620921134949, "learning_rate": 9.591490300096057e-06, "loss": 0.4316, "step": 1286 }, { "epoch": 1.083941605839416, "grad_norm": 0.5163503289222717, "learning_rate": 9.590325188100768e-06, "loss": 0.4161, "step": 1287 }, { "epoch": 1.0847838293093768, "grad_norm": 0.5136776566505432, "learning_rate": 9.589158487931667e-06, "loss": 0.4573, "step": 1288 }, { "epoch": 1.0856260527793375, "grad_norm": 0.5370907187461853, "learning_rate": 9.587990199992417e-06, "loss": 0.4409, "step": 1289 }, { "epoch": 1.086468276249298, "grad_norm": 0.6567516326904297, "learning_rate": 9.586820324687223e-06, "loss": 0.4349, "step": 1290 }, { "epoch": 1.0873104997192589, "grad_norm": 0.5307185053825378, "learning_rate": 9.58564886242084e-06, "loss": 0.4645, "step": 1291 }, { "epoch": 1.0881527231892196, "grad_norm": 0.5629045367240906, "learning_rate": 9.584475813598578e-06, "loss": 0.4026, "step": 1292 }, { "epoch": 1.0889949466591802, "grad_norm": 0.47548866271972656, "learning_rate": 9.583301178626289e-06, "loss": 0.4378, "step": 1293 }, { "epoch": 1.089837170129141, "grad_norm": 0.6291982531547546, "learning_rate": 9.582124957910377e-06, "loss": 0.3998, "step": 1294 }, { "epoch": 1.0906793935991017, "grad_norm": 0.4591591954231262, "learning_rate": 9.580947151857793e-06, "loss": 0.4424, "step": 1295 }, { "epoch": 1.0915216170690623, "grad_norm": 0.6326950788497925, "learning_rate": 9.57976776087604e-06, "loss": 0.4216, "step": 1296 }, { "epoch": 1.092363840539023, "grad_norm": 0.5887095332145691, "learning_rate": 9.578586785373167e-06, "loss": 0.4216, "step": 1297 }, { "epoch": 1.0932060640089838, "grad_norm": 0.5387338399887085, "learning_rate": 9.577404225757769e-06, "loss": 0.488, "step": 1298 }, { "epoch": 1.0940482874789443, "grad_norm": 0.5055761337280273, "learning_rate": 9.576220082438992e-06, "loss": 0.3665, "step": 1299 }, { "epoch": 1.094890510948905, "grad_norm": 0.5401684045791626, "learning_rate": 9.575034355826532e-06, "loss": 0.442, "step": 1300 }, { "epoch": 1.0957327344188659, "grad_norm": 0.5300390720367432, "learning_rate": 9.573847046330627e-06, "loss": 0.4625, "step": 1301 }, { "epoch": 1.0965749578888264, "grad_norm": 0.5140044689178467, "learning_rate": 9.572658154362067e-06, "loss": 0.4238, "step": 1302 }, { "epoch": 1.0974171813587872, "grad_norm": 0.5266305804252625, "learning_rate": 9.571467680332189e-06, "loss": 0.3951, "step": 1303 }, { "epoch": 1.098259404828748, "grad_norm": 0.42349129915237427, "learning_rate": 9.570275624652874e-06, "loss": 0.4369, "step": 1304 }, { "epoch": 1.0991016282987085, "grad_norm": 0.5630435943603516, "learning_rate": 9.569081987736556e-06, "loss": 0.446, "step": 1305 }, { "epoch": 1.0999438517686693, "grad_norm": 0.5384799242019653, "learning_rate": 9.56788676999621e-06, "loss": 0.4047, "step": 1306 }, { "epoch": 1.10078607523863, "grad_norm": 0.503005862236023, "learning_rate": 9.566689971845364e-06, "loss": 0.4433, "step": 1307 }, { "epoch": 1.1016282987085906, "grad_norm": 0.5054175853729248, "learning_rate": 9.565491593698087e-06, "loss": 0.4117, "step": 1308 }, { "epoch": 1.1024705221785513, "grad_norm": 0.6179901957511902, "learning_rate": 9.564291635968998e-06, "loss": 0.4925, "step": 1309 }, { "epoch": 1.1033127456485121, "grad_norm": 0.4606417119503021, "learning_rate": 9.56309009907326e-06, "loss": 0.4374, "step": 1310 }, { "epoch": 1.1041549691184729, "grad_norm": 0.548001766204834, "learning_rate": 9.56188698342659e-06, "loss": 0.3988, "step": 1311 }, { "epoch": 1.1049971925884334, "grad_norm": 0.5044772624969482, "learning_rate": 9.560682289445238e-06, "loss": 0.4784, "step": 1312 }, { "epoch": 1.1058394160583942, "grad_norm": 0.4362800419330597, "learning_rate": 9.559476017546013e-06, "loss": 0.4053, "step": 1313 }, { "epoch": 1.1066816395283547, "grad_norm": 0.5626078844070435, "learning_rate": 9.558268168146262e-06, "loss": 0.4824, "step": 1314 }, { "epoch": 1.1075238629983155, "grad_norm": 0.4736645221710205, "learning_rate": 9.55705874166388e-06, "loss": 0.4252, "step": 1315 }, { "epoch": 1.1083660864682763, "grad_norm": 0.47635021805763245, "learning_rate": 9.55584773851731e-06, "loss": 0.4153, "step": 1316 }, { "epoch": 1.109208309938237, "grad_norm": 0.4807952344417572, "learning_rate": 9.554635159125533e-06, "loss": 0.4024, "step": 1317 }, { "epoch": 1.1100505334081976, "grad_norm": 0.41211891174316406, "learning_rate": 9.553421003908088e-06, "loss": 0.4054, "step": 1318 }, { "epoch": 1.1108927568781584, "grad_norm": 0.4828183948993683, "learning_rate": 9.552205273285047e-06, "loss": 0.4214, "step": 1319 }, { "epoch": 1.1117349803481191, "grad_norm": 0.4794031083583832, "learning_rate": 9.550987967677034e-06, "loss": 0.4328, "step": 1320 }, { "epoch": 1.1125772038180797, "grad_norm": 0.42873385548591614, "learning_rate": 9.549769087505213e-06, "loss": 0.4264, "step": 1321 }, { "epoch": 1.1134194272880404, "grad_norm": 0.48716577887535095, "learning_rate": 9.548548633191299e-06, "loss": 0.4025, "step": 1322 }, { "epoch": 1.1142616507580012, "grad_norm": 0.5338295102119446, "learning_rate": 9.547326605157548e-06, "loss": 0.4667, "step": 1323 }, { "epoch": 1.1151038742279618, "grad_norm": 0.4397370219230652, "learning_rate": 9.54610300382676e-06, "loss": 0.4362, "step": 1324 }, { "epoch": 1.1159460976979225, "grad_norm": 0.3872171938419342, "learning_rate": 9.544877829622276e-06, "loss": 0.4146, "step": 1325 }, { "epoch": 1.1167883211678833, "grad_norm": 0.48833733797073364, "learning_rate": 9.543651082967993e-06, "loss": 0.4547, "step": 1326 }, { "epoch": 1.1176305446378438, "grad_norm": 0.40408065915107727, "learning_rate": 9.542422764288339e-06, "loss": 0.4134, "step": 1327 }, { "epoch": 1.1184727681078046, "grad_norm": 0.45353075861930847, "learning_rate": 9.541192874008293e-06, "loss": 0.4546, "step": 1328 }, { "epoch": 1.1193149915777654, "grad_norm": 0.42965278029441833, "learning_rate": 9.539961412553375e-06, "loss": 0.4491, "step": 1329 }, { "epoch": 1.120157215047726, "grad_norm": 0.4404903054237366, "learning_rate": 9.53872838034965e-06, "loss": 0.4154, "step": 1330 }, { "epoch": 1.1209994385176867, "grad_norm": 0.44053342938423157, "learning_rate": 9.537493777823727e-06, "loss": 0.427, "step": 1331 }, { "epoch": 1.1218416619876475, "grad_norm": 0.37729063630104065, "learning_rate": 9.536257605402755e-06, "loss": 0.4114, "step": 1332 }, { "epoch": 1.122683885457608, "grad_norm": 0.5068271160125732, "learning_rate": 9.535019863514433e-06, "loss": 0.459, "step": 1333 }, { "epoch": 1.1235261089275688, "grad_norm": 0.448238730430603, "learning_rate": 9.53378055258699e-06, "loss": 0.4553, "step": 1334 }, { "epoch": 1.1243683323975295, "grad_norm": 0.42638564109802246, "learning_rate": 9.532539673049216e-06, "loss": 0.4354, "step": 1335 }, { "epoch": 1.12521055586749, "grad_norm": 0.4561554193496704, "learning_rate": 9.531297225330429e-06, "loss": 0.4077, "step": 1336 }, { "epoch": 1.1260527793374508, "grad_norm": 0.44876930117607117, "learning_rate": 9.530053209860495e-06, "loss": 0.3989, "step": 1337 }, { "epoch": 1.1268950028074116, "grad_norm": 0.5059868693351746, "learning_rate": 9.528807627069821e-06, "loss": 0.4784, "step": 1338 }, { "epoch": 1.1277372262773722, "grad_norm": 0.4401852786540985, "learning_rate": 9.527560477389359e-06, "loss": 0.4237, "step": 1339 }, { "epoch": 1.128579449747333, "grad_norm": 0.4868219494819641, "learning_rate": 9.526311761250603e-06, "loss": 0.4397, "step": 1340 }, { "epoch": 1.1294216732172937, "grad_norm": 0.44572484493255615, "learning_rate": 9.525061479085585e-06, "loss": 0.4328, "step": 1341 }, { "epoch": 1.1302638966872545, "grad_norm": 0.5002520680427551, "learning_rate": 9.52380963132688e-06, "loss": 0.4282, "step": 1342 }, { "epoch": 1.131106120157215, "grad_norm": 0.4971974492073059, "learning_rate": 9.522556218407609e-06, "loss": 0.4465, "step": 1343 }, { "epoch": 1.1319483436271758, "grad_norm": 0.4097907543182373, "learning_rate": 9.521301240761428e-06, "loss": 0.3916, "step": 1344 }, { "epoch": 1.1327905670971363, "grad_norm": 0.5595524907112122, "learning_rate": 9.52004469882254e-06, "loss": 0.4283, "step": 1345 }, { "epoch": 1.133632790567097, "grad_norm": 0.49993282556533813, "learning_rate": 9.518786593025689e-06, "loss": 0.466, "step": 1346 }, { "epoch": 1.1344750140370579, "grad_norm": 0.4962349236011505, "learning_rate": 9.517526923806153e-06, "loss": 0.4579, "step": 1347 }, { "epoch": 1.1353172375070186, "grad_norm": 0.5540683269500732, "learning_rate": 9.516265691599759e-06, "loss": 0.4369, "step": 1348 }, { "epoch": 1.1361594609769792, "grad_norm": 0.4795381724834442, "learning_rate": 9.51500289684287e-06, "loss": 0.4224, "step": 1349 }, { "epoch": 1.13700168444694, "grad_norm": 0.5390043258666992, "learning_rate": 9.513738539972395e-06, "loss": 0.4825, "step": 1350 }, { "epoch": 1.1378439079169007, "grad_norm": 0.5235761404037476, "learning_rate": 9.512472621425777e-06, "loss": 0.4302, "step": 1351 }, { "epoch": 1.1386861313868613, "grad_norm": 0.43177443742752075, "learning_rate": 9.511205141641002e-06, "loss": 0.4384, "step": 1352 }, { "epoch": 1.139528354856822, "grad_norm": 0.45005884766578674, "learning_rate": 9.509936101056596e-06, "loss": 0.4637, "step": 1353 }, { "epoch": 1.1403705783267828, "grad_norm": 0.46435850858688354, "learning_rate": 9.508665500111629e-06, "loss": 0.409, "step": 1354 }, { "epoch": 1.1412128017967433, "grad_norm": 0.4604218304157257, "learning_rate": 9.5073933392457e-06, "loss": 0.4083, "step": 1355 }, { "epoch": 1.142055025266704, "grad_norm": 0.5547798871994019, "learning_rate": 9.50611961889896e-06, "loss": 0.4124, "step": 1356 }, { "epoch": 1.1428972487366649, "grad_norm": 0.4328976273536682, "learning_rate": 9.504844339512096e-06, "loss": 0.4204, "step": 1357 }, { "epoch": 1.1437394722066254, "grad_norm": 0.5193594098091125, "learning_rate": 9.503567501526329e-06, "loss": 0.4341, "step": 1358 }, { "epoch": 1.1445816956765862, "grad_norm": 0.4981316030025482, "learning_rate": 9.502289105383425e-06, "loss": 0.424, "step": 1359 }, { "epoch": 1.145423919146547, "grad_norm": 0.42908287048339844, "learning_rate": 9.501009151525686e-06, "loss": 0.4163, "step": 1360 }, { "epoch": 1.1462661426165075, "grad_norm": 0.4876088500022888, "learning_rate": 9.499727640395953e-06, "loss": 0.4377, "step": 1361 }, { "epoch": 1.1471083660864683, "grad_norm": 0.453823059797287, "learning_rate": 9.49844457243761e-06, "loss": 0.4488, "step": 1362 }, { "epoch": 1.147950589556429, "grad_norm": 0.4534013271331787, "learning_rate": 9.497159948094575e-06, "loss": 0.4402, "step": 1363 }, { "epoch": 1.1487928130263896, "grad_norm": 0.4050256609916687, "learning_rate": 9.495873767811304e-06, "loss": 0.3779, "step": 1364 }, { "epoch": 1.1496350364963503, "grad_norm": 0.49813634157180786, "learning_rate": 9.494586032032798e-06, "loss": 0.4633, "step": 1365 }, { "epoch": 1.1504772599663111, "grad_norm": 0.43007758259773254, "learning_rate": 9.493296741204587e-06, "loss": 0.4118, "step": 1366 }, { "epoch": 1.1513194834362717, "grad_norm": 0.46501150727272034, "learning_rate": 9.492005895772745e-06, "loss": 0.4563, "step": 1367 }, { "epoch": 1.1521617069062324, "grad_norm": 0.5125041604042053, "learning_rate": 9.490713496183884e-06, "loss": 0.4428, "step": 1368 }, { "epoch": 1.1530039303761932, "grad_norm": 0.4124051034450531, "learning_rate": 9.48941954288515e-06, "loss": 0.4295, "step": 1369 }, { "epoch": 1.1538461538461537, "grad_norm": 0.46403422951698303, "learning_rate": 9.48812403632423e-06, "loss": 0.4258, "step": 1370 }, { "epoch": 1.1546883773161145, "grad_norm": 0.4316796064376831, "learning_rate": 9.486826976949344e-06, "loss": 0.4329, "step": 1371 }, { "epoch": 1.1555306007860753, "grad_norm": 0.4467044770717621, "learning_rate": 9.485528365209257e-06, "loss": 0.4544, "step": 1372 }, { "epoch": 1.156372824256036, "grad_norm": 0.5264897346496582, "learning_rate": 9.484228201553265e-06, "loss": 0.4487, "step": 1373 }, { "epoch": 1.1572150477259966, "grad_norm": 0.5303399562835693, "learning_rate": 9.482926486431199e-06, "loss": 0.3907, "step": 1374 }, { "epoch": 1.1580572711959574, "grad_norm": 0.5192993879318237, "learning_rate": 9.481623220293434e-06, "loss": 0.4033, "step": 1375 }, { "epoch": 1.158899494665918, "grad_norm": 0.5532940030097961, "learning_rate": 9.480318403590878e-06, "loss": 0.4673, "step": 1376 }, { "epoch": 1.1597417181358787, "grad_norm": 0.5712370872497559, "learning_rate": 9.479012036774972e-06, "loss": 0.4405, "step": 1377 }, { "epoch": 1.1605839416058394, "grad_norm": 0.5300625562667847, "learning_rate": 9.477704120297698e-06, "loss": 0.4573, "step": 1378 }, { "epoch": 1.1614261650758002, "grad_norm": 0.4342583119869232, "learning_rate": 9.476394654611571e-06, "loss": 0.4671, "step": 1379 }, { "epoch": 1.1622683885457608, "grad_norm": 0.5755478143692017, "learning_rate": 9.475083640169648e-06, "loss": 0.4229, "step": 1380 }, { "epoch": 1.1631106120157215, "grad_norm": 0.4207504987716675, "learning_rate": 9.473771077425516e-06, "loss": 0.4028, "step": 1381 }, { "epoch": 1.1639528354856823, "grad_norm": 0.4332772493362427, "learning_rate": 9.472456966833295e-06, "loss": 0.4662, "step": 1382 }, { "epoch": 1.1647950589556428, "grad_norm": 0.41623443365097046, "learning_rate": 9.471141308847649e-06, "loss": 0.4172, "step": 1383 }, { "epoch": 1.1656372824256036, "grad_norm": 0.4746728241443634, "learning_rate": 9.469824103923773e-06, "loss": 0.4494, "step": 1384 }, { "epoch": 1.1664795058955644, "grad_norm": 0.4462672472000122, "learning_rate": 9.468505352517394e-06, "loss": 0.4039, "step": 1385 }, { "epoch": 1.167321729365525, "grad_norm": 0.4356045126914978, "learning_rate": 9.467185055084782e-06, "loss": 0.4482, "step": 1386 }, { "epoch": 1.1681639528354857, "grad_norm": 0.3922847509384155, "learning_rate": 9.465863212082734e-06, "loss": 0.3894, "step": 1387 }, { "epoch": 1.1690061763054465, "grad_norm": 0.5469494462013245, "learning_rate": 9.464539823968586e-06, "loss": 0.4356, "step": 1388 }, { "epoch": 1.169848399775407, "grad_norm": 0.4529193639755249, "learning_rate": 9.463214891200207e-06, "loss": 0.3912, "step": 1389 }, { "epoch": 1.1706906232453678, "grad_norm": 0.4815426170825958, "learning_rate": 9.461888414236002e-06, "loss": 0.4168, "step": 1390 }, { "epoch": 1.1715328467153285, "grad_norm": 0.4664521813392639, "learning_rate": 9.46056039353491e-06, "loss": 0.4071, "step": 1391 }, { "epoch": 1.172375070185289, "grad_norm": 0.5599628686904907, "learning_rate": 9.459230829556402e-06, "loss": 0.4555, "step": 1392 }, { "epoch": 1.1732172936552498, "grad_norm": 0.5618027448654175, "learning_rate": 9.457899722760483e-06, "loss": 0.3978, "step": 1393 }, { "epoch": 1.1740595171252106, "grad_norm": 0.44468697905540466, "learning_rate": 9.456567073607697e-06, "loss": 0.4569, "step": 1394 }, { "epoch": 1.1749017405951712, "grad_norm": 0.5402093529701233, "learning_rate": 9.455232882559113e-06, "loss": 0.4348, "step": 1395 }, { "epoch": 1.175743964065132, "grad_norm": 0.5720494985580444, "learning_rate": 9.453897150076343e-06, "loss": 0.4582, "step": 1396 }, { "epoch": 1.1765861875350927, "grad_norm": 0.4830198287963867, "learning_rate": 9.452559876621524e-06, "loss": 0.4494, "step": 1397 }, { "epoch": 1.1774284110050532, "grad_norm": 0.5316833257675171, "learning_rate": 9.451221062657332e-06, "loss": 0.4223, "step": 1398 }, { "epoch": 1.178270634475014, "grad_norm": 0.44094136357307434, "learning_rate": 9.449880708646972e-06, "loss": 0.4319, "step": 1399 }, { "epoch": 1.1791128579449748, "grad_norm": 0.5846485495567322, "learning_rate": 9.448538815054182e-06, "loss": 0.4315, "step": 1400 }, { "epoch": 1.1799550814149353, "grad_norm": 0.4723174273967743, "learning_rate": 9.44719538234324e-06, "loss": 0.4532, "step": 1401 }, { "epoch": 1.180797304884896, "grad_norm": 0.4285380244255066, "learning_rate": 9.445850410978945e-06, "loss": 0.3989, "step": 1402 }, { "epoch": 1.1816395283548569, "grad_norm": 0.5413432717323303, "learning_rate": 9.444503901426636e-06, "loss": 0.458, "step": 1403 }, { "epoch": 1.1824817518248176, "grad_norm": 0.49491626024246216, "learning_rate": 9.443155854152181e-06, "loss": 0.4635, "step": 1404 }, { "epoch": 1.1833239752947782, "grad_norm": 0.44795137643814087, "learning_rate": 9.441806269621984e-06, "loss": 0.4272, "step": 1405 }, { "epoch": 1.184166198764739, "grad_norm": 0.44519078731536865, "learning_rate": 9.440455148302978e-06, "loss": 0.4575, "step": 1406 }, { "epoch": 1.1850084222346995, "grad_norm": 0.5618880987167358, "learning_rate": 9.439102490662625e-06, "loss": 0.4332, "step": 1407 }, { "epoch": 1.1858506457046603, "grad_norm": 0.37515947222709656, "learning_rate": 9.437748297168923e-06, "loss": 0.3966, "step": 1408 }, { "epoch": 1.186692869174621, "grad_norm": 0.5433194637298584, "learning_rate": 9.4363925682904e-06, "loss": 0.4322, "step": 1409 }, { "epoch": 1.1875350926445818, "grad_norm": 0.4607211947441101, "learning_rate": 9.435035304496111e-06, "loss": 0.4933, "step": 1410 }, { "epoch": 1.1883773161145423, "grad_norm": 0.4070110023021698, "learning_rate": 9.433676506255653e-06, "loss": 0.3965, "step": 1411 }, { "epoch": 1.189219539584503, "grad_norm": 0.49857422709465027, "learning_rate": 9.432316174039143e-06, "loss": 0.4393, "step": 1412 }, { "epoch": 1.1900617630544639, "grad_norm": 0.4364931285381317, "learning_rate": 9.430954308317232e-06, "loss": 0.4233, "step": 1413 }, { "epoch": 1.1909039865244244, "grad_norm": 0.5577917695045471, "learning_rate": 9.429590909561104e-06, "loss": 0.4417, "step": 1414 }, { "epoch": 1.1917462099943852, "grad_norm": 0.44747334718704224, "learning_rate": 9.42822597824247e-06, "loss": 0.4237, "step": 1415 }, { "epoch": 1.192588433464346, "grad_norm": 0.585789144039154, "learning_rate": 9.426859514833573e-06, "loss": 0.4484, "step": 1416 }, { "epoch": 1.1934306569343065, "grad_norm": 0.5086108446121216, "learning_rate": 9.425491519807188e-06, "loss": 0.4612, "step": 1417 }, { "epoch": 1.1942728804042673, "grad_norm": 0.478468120098114, "learning_rate": 9.424121993636614e-06, "loss": 0.3855, "step": 1418 }, { "epoch": 1.195115103874228, "grad_norm": 0.5926589965820312, "learning_rate": 9.422750936795689e-06, "loss": 0.4389, "step": 1419 }, { "epoch": 1.1959573273441886, "grad_norm": 0.49389249086380005, "learning_rate": 9.42137834975877e-06, "loss": 0.4322, "step": 1420 }, { "epoch": 1.1967995508141493, "grad_norm": 0.5806136131286621, "learning_rate": 9.42000423300075e-06, "loss": 0.4572, "step": 1421 }, { "epoch": 1.1976417742841101, "grad_norm": 0.42879927158355713, "learning_rate": 9.418628586997051e-06, "loss": 0.4253, "step": 1422 }, { "epoch": 1.1984839977540707, "grad_norm": 0.5117987394332886, "learning_rate": 9.417251412223623e-06, "loss": 0.4133, "step": 1423 }, { "epoch": 1.1993262212240314, "grad_norm": 0.4817526340484619, "learning_rate": 9.415872709156945e-06, "loss": 0.3987, "step": 1424 }, { "epoch": 1.2001684446939922, "grad_norm": 0.5416394472122192, "learning_rate": 9.414492478274022e-06, "loss": 0.4453, "step": 1425 }, { "epoch": 1.2010106681639527, "grad_norm": 0.5028197169303894, "learning_rate": 9.413110720052394e-06, "loss": 0.4115, "step": 1426 }, { "epoch": 1.2018528916339135, "grad_norm": 0.5515120625495911, "learning_rate": 9.411727434970121e-06, "loss": 0.4434, "step": 1427 }, { "epoch": 1.2026951151038743, "grad_norm": 0.49365147948265076, "learning_rate": 9.410342623505801e-06, "loss": 0.485, "step": 1428 }, { "epoch": 1.203537338573835, "grad_norm": 0.4848344027996063, "learning_rate": 9.408956286138553e-06, "loss": 0.3612, "step": 1429 }, { "epoch": 1.2043795620437956, "grad_norm": 0.5056187510490417, "learning_rate": 9.407568423348023e-06, "loss": 0.4027, "step": 1430 }, { "epoch": 1.2052217855137564, "grad_norm": 0.5096155405044556, "learning_rate": 9.406179035614394e-06, "loss": 0.4554, "step": 1431 }, { "epoch": 1.206064008983717, "grad_norm": 0.44587090611457825, "learning_rate": 9.404788123418364e-06, "loss": 0.4374, "step": 1432 }, { "epoch": 1.2069062324536777, "grad_norm": 0.4679119288921356, "learning_rate": 9.403395687241168e-06, "loss": 0.4342, "step": 1433 }, { "epoch": 1.2077484559236384, "grad_norm": 0.4562222361564636, "learning_rate": 9.402001727564564e-06, "loss": 0.413, "step": 1434 }, { "epoch": 1.2085906793935992, "grad_norm": 0.5381638407707214, "learning_rate": 9.400606244870841e-06, "loss": 0.4528, "step": 1435 }, { "epoch": 1.2094329028635598, "grad_norm": 0.4582306444644928, "learning_rate": 9.399209239642806e-06, "loss": 0.423, "step": 1436 }, { "epoch": 1.2102751263335205, "grad_norm": 0.48180460929870605, "learning_rate": 9.397810712363805e-06, "loss": 0.4256, "step": 1437 }, { "epoch": 1.211117349803481, "grad_norm": 0.5387073159217834, "learning_rate": 9.396410663517702e-06, "loss": 0.4065, "step": 1438 }, { "epoch": 1.2119595732734418, "grad_norm": 0.45201632380485535, "learning_rate": 9.39500909358889e-06, "loss": 0.409, "step": 1439 }, { "epoch": 1.2128017967434026, "grad_norm": 0.4696480333805084, "learning_rate": 9.393606003062287e-06, "loss": 0.4473, "step": 1440 }, { "epoch": 1.2136440202133634, "grad_norm": 0.4960535168647766, "learning_rate": 9.392201392423342e-06, "loss": 0.4211, "step": 1441 }, { "epoch": 1.214486243683324, "grad_norm": 0.4431801736354828, "learning_rate": 9.39079526215802e-06, "loss": 0.411, "step": 1442 }, { "epoch": 1.2153284671532847, "grad_norm": 0.5112121105194092, "learning_rate": 9.389387612752822e-06, "loss": 0.4079, "step": 1443 }, { "epoch": 1.2161706906232455, "grad_norm": 0.5141167640686035, "learning_rate": 9.387978444694773e-06, "loss": 0.4407, "step": 1444 }, { "epoch": 1.217012914093206, "grad_norm": 0.4934862554073334, "learning_rate": 9.386567758471416e-06, "loss": 0.4634, "step": 1445 }, { "epoch": 1.2178551375631668, "grad_norm": 0.5329363942146301, "learning_rate": 9.385155554570826e-06, "loss": 0.4153, "step": 1446 }, { "epoch": 1.2186973610331275, "grad_norm": 0.4921463429927826, "learning_rate": 9.383741833481603e-06, "loss": 0.4506, "step": 1447 }, { "epoch": 1.219539584503088, "grad_norm": 0.4571564793586731, "learning_rate": 9.382326595692869e-06, "loss": 0.3942, "step": 1448 }, { "epoch": 1.2203818079730488, "grad_norm": 0.5339667201042175, "learning_rate": 9.380909841694271e-06, "loss": 0.4322, "step": 1449 }, { "epoch": 1.2212240314430096, "grad_norm": 0.5822999477386475, "learning_rate": 9.379491571975983e-06, "loss": 0.4696, "step": 1450 }, { "epoch": 1.2220662549129702, "grad_norm": 0.49499544501304626, "learning_rate": 9.378071787028701e-06, "loss": 0.4299, "step": 1451 }, { "epoch": 1.222908478382931, "grad_norm": 0.582647442817688, "learning_rate": 9.376650487343646e-06, "loss": 0.4285, "step": 1452 }, { "epoch": 1.2237507018528917, "grad_norm": 0.442227303981781, "learning_rate": 9.375227673412566e-06, "loss": 0.4218, "step": 1453 }, { "epoch": 1.2245929253228522, "grad_norm": 0.47219318151474, "learning_rate": 9.373803345727727e-06, "loss": 0.4228, "step": 1454 }, { "epoch": 1.225435148792813, "grad_norm": 0.5541886687278748, "learning_rate": 9.372377504781925e-06, "loss": 0.4087, "step": 1455 }, { "epoch": 1.2262773722627738, "grad_norm": 0.43503913283348083, "learning_rate": 9.370950151068474e-06, "loss": 0.396, "step": 1456 }, { "epoch": 1.2271195957327343, "grad_norm": 0.47749853134155273, "learning_rate": 9.369521285081213e-06, "loss": 0.4594, "step": 1457 }, { "epoch": 1.227961819202695, "grad_norm": 0.4804784953594208, "learning_rate": 9.368090907314509e-06, "loss": 0.465, "step": 1458 }, { "epoch": 1.2288040426726559, "grad_norm": 0.47708025574684143, "learning_rate": 9.366659018263244e-06, "loss": 0.4272, "step": 1459 }, { "epoch": 1.2296462661426166, "grad_norm": 0.4732118844985962, "learning_rate": 9.36522561842283e-06, "loss": 0.3841, "step": 1460 }, { "epoch": 1.2304884896125772, "grad_norm": 0.5576649308204651, "learning_rate": 9.3637907082892e-06, "loss": 0.476, "step": 1461 }, { "epoch": 1.231330713082538, "grad_norm": 0.5701187252998352, "learning_rate": 9.362354288358804e-06, "loss": 0.4813, "step": 1462 }, { "epoch": 1.2321729365524985, "grad_norm": 0.4351637363433838, "learning_rate": 9.36091635912862e-06, "loss": 0.3738, "step": 1463 }, { "epoch": 1.2330151600224593, "grad_norm": 0.7020814418792725, "learning_rate": 9.35947692109615e-06, "loss": 0.4584, "step": 1464 }, { "epoch": 1.23385738349242, "grad_norm": 0.5135090351104736, "learning_rate": 9.35803597475941e-06, "loss": 0.4104, "step": 1465 }, { "epoch": 1.2346996069623808, "grad_norm": 0.49154019355773926, "learning_rate": 9.356593520616948e-06, "loss": 0.433, "step": 1466 }, { "epoch": 1.2355418304323413, "grad_norm": 0.5765358805656433, "learning_rate": 9.355149559167825e-06, "loss": 0.4583, "step": 1467 }, { "epoch": 1.236384053902302, "grad_norm": 0.4128934144973755, "learning_rate": 9.353704090911627e-06, "loss": 0.4204, "step": 1468 }, { "epoch": 1.2372262773722627, "grad_norm": 0.6043089628219604, "learning_rate": 9.35225711634846e-06, "loss": 0.4911, "step": 1469 }, { "epoch": 1.2380685008422234, "grad_norm": 0.5171332359313965, "learning_rate": 9.350808635978956e-06, "loss": 0.4182, "step": 1470 }, { "epoch": 1.2389107243121842, "grad_norm": 0.49609053134918213, "learning_rate": 9.349358650304264e-06, "loss": 0.4205, "step": 1471 }, { "epoch": 1.239752947782145, "grad_norm": 0.5156107544898987, "learning_rate": 9.34790715982605e-06, "loss": 0.4318, "step": 1472 }, { "epoch": 1.2405951712521055, "grad_norm": 0.5783904194831848, "learning_rate": 9.34645416504651e-06, "loss": 0.4462, "step": 1473 }, { "epoch": 1.2414373947220663, "grad_norm": 0.41050252318382263, "learning_rate": 9.34499966646835e-06, "loss": 0.3873, "step": 1474 }, { "epoch": 1.242279618192027, "grad_norm": 0.4632067382335663, "learning_rate": 9.343543664594807e-06, "loss": 0.4352, "step": 1475 }, { "epoch": 1.2431218416619876, "grad_norm": 0.597320020198822, "learning_rate": 9.342086159929629e-06, "loss": 0.4266, "step": 1476 }, { "epoch": 1.2439640651319483, "grad_norm": 0.42370837926864624, "learning_rate": 9.340627152977092e-06, "loss": 0.4314, "step": 1477 }, { "epoch": 1.2448062886019091, "grad_norm": 0.5238741636276245, "learning_rate": 9.33916664424198e-06, "loss": 0.4094, "step": 1478 }, { "epoch": 1.2456485120718697, "grad_norm": 0.5168343186378479, "learning_rate": 9.33770463422961e-06, "loss": 0.4251, "step": 1479 }, { "epoch": 1.2464907355418304, "grad_norm": 0.5402882099151611, "learning_rate": 9.336241123445809e-06, "loss": 0.4624, "step": 1480 }, { "epoch": 1.2473329590117912, "grad_norm": 0.4778558015823364, "learning_rate": 9.334776112396929e-06, "loss": 0.4655, "step": 1481 }, { "epoch": 1.2481751824817517, "grad_norm": 0.4822431206703186, "learning_rate": 9.333309601589837e-06, "loss": 0.3945, "step": 1482 }, { "epoch": 1.2490174059517125, "grad_norm": 0.46297818422317505, "learning_rate": 9.331841591531923e-06, "loss": 0.4175, "step": 1483 }, { "epoch": 1.2498596294216733, "grad_norm": 0.48883920907974243, "learning_rate": 9.33037208273109e-06, "loss": 0.4093, "step": 1484 }, { "epoch": 1.250701852891634, "grad_norm": 0.4616517722606659, "learning_rate": 9.328901075695766e-06, "loss": 0.4142, "step": 1485 }, { "epoch": 1.2515440763615946, "grad_norm": 0.412856787443161, "learning_rate": 9.327428570934894e-06, "loss": 0.4338, "step": 1486 }, { "epoch": 1.2523862998315554, "grad_norm": 0.5075788497924805, "learning_rate": 9.32595456895793e-06, "loss": 0.4392, "step": 1487 }, { "epoch": 1.253228523301516, "grad_norm": 0.41523411870002747, "learning_rate": 9.324479070274862e-06, "loss": 0.3951, "step": 1488 }, { "epoch": 1.2540707467714767, "grad_norm": 0.5246259570121765, "learning_rate": 9.323002075396182e-06, "loss": 0.4757, "step": 1489 }, { "epoch": 1.2549129702414374, "grad_norm": 0.38355544209480286, "learning_rate": 9.321523584832906e-06, "loss": 0.4235, "step": 1490 }, { "epoch": 1.2557551937113982, "grad_norm": 0.43053680658340454, "learning_rate": 9.320043599096564e-06, "loss": 0.4267, "step": 1491 }, { "epoch": 1.2565974171813588, "grad_norm": 0.4424111545085907, "learning_rate": 9.31856211869921e-06, "loss": 0.4021, "step": 1492 }, { "epoch": 1.2574396406513195, "grad_norm": 0.44594448804855347, "learning_rate": 9.317079144153407e-06, "loss": 0.4479, "step": 1493 }, { "epoch": 1.25828186412128, "grad_norm": 0.4398069679737091, "learning_rate": 9.315594675972241e-06, "loss": 0.4357, "step": 1494 }, { "epoch": 1.2591240875912408, "grad_norm": 0.42084088921546936, "learning_rate": 9.314108714669312e-06, "loss": 0.3861, "step": 1495 }, { "epoch": 1.2599663110612016, "grad_norm": 0.4598964750766754, "learning_rate": 9.312621260758737e-06, "loss": 0.4622, "step": 1496 }, { "epoch": 1.2608085345311624, "grad_norm": 0.5394331216812134, "learning_rate": 9.31113231475515e-06, "loss": 0.4326, "step": 1497 }, { "epoch": 1.261650758001123, "grad_norm": 0.4102165400981903, "learning_rate": 9.3096418771737e-06, "loss": 0.4137, "step": 1498 }, { "epoch": 1.2624929814710837, "grad_norm": 0.4407392144203186, "learning_rate": 9.308149948530055e-06, "loss": 0.4246, "step": 1499 }, { "epoch": 1.2633352049410442, "grad_norm": 0.4673311114311218, "learning_rate": 9.306656529340392e-06, "loss": 0.4522, "step": 1500 }, { "epoch": 1.264177428411005, "grad_norm": 0.4583197832107544, "learning_rate": 9.305161620121415e-06, "loss": 0.4207, "step": 1501 }, { "epoch": 1.2650196518809658, "grad_norm": 0.4613693654537201, "learning_rate": 9.30366522139033e-06, "loss": 0.4157, "step": 1502 }, { "epoch": 1.2658618753509265, "grad_norm": 0.45566025376319885, "learning_rate": 9.30216733366487e-06, "loss": 0.4399, "step": 1503 }, { "epoch": 1.266704098820887, "grad_norm": 0.39440760016441345, "learning_rate": 9.300667957463279e-06, "loss": 0.426, "step": 1504 }, { "epoch": 1.2675463222908478, "grad_norm": 0.531906008720398, "learning_rate": 9.299167093304312e-06, "loss": 0.4209, "step": 1505 }, { "epoch": 1.2683885457608086, "grad_norm": 0.4977714419364929, "learning_rate": 9.297664741707244e-06, "loss": 0.4168, "step": 1506 }, { "epoch": 1.2692307692307692, "grad_norm": 0.47790104150772095, "learning_rate": 9.296160903191862e-06, "loss": 0.4056, "step": 1507 }, { "epoch": 1.27007299270073, "grad_norm": 0.5144535899162292, "learning_rate": 9.294655578278471e-06, "loss": 0.4305, "step": 1508 }, { "epoch": 1.2709152161706907, "grad_norm": 0.45334023237228394, "learning_rate": 9.293148767487885e-06, "loss": 0.4583, "step": 1509 }, { "epoch": 1.2717574396406512, "grad_norm": 0.49512168765068054, "learning_rate": 9.291640471341435e-06, "loss": 0.4235, "step": 1510 }, { "epoch": 1.272599663110612, "grad_norm": 0.4339429438114166, "learning_rate": 9.290130690360966e-06, "loss": 0.4203, "step": 1511 }, { "epoch": 1.2734418865805728, "grad_norm": 0.5140374302864075, "learning_rate": 9.288619425068837e-06, "loss": 0.4602, "step": 1512 }, { "epoch": 1.2742841100505333, "grad_norm": 0.4703027307987213, "learning_rate": 9.287106675987918e-06, "loss": 0.4396, "step": 1513 }, { "epoch": 1.275126333520494, "grad_norm": 0.4573381245136261, "learning_rate": 9.285592443641596e-06, "loss": 0.4439, "step": 1514 }, { "epoch": 1.2759685569904549, "grad_norm": 0.43563997745513916, "learning_rate": 9.28407672855377e-06, "loss": 0.4209, "step": 1515 }, { "epoch": 1.2768107804604156, "grad_norm": 0.4419814646244049, "learning_rate": 9.282559531248849e-06, "loss": 0.4397, "step": 1516 }, { "epoch": 1.2776530039303762, "grad_norm": 0.4481181800365448, "learning_rate": 9.28104085225176e-06, "loss": 0.4301, "step": 1517 }, { "epoch": 1.278495227400337, "grad_norm": 0.426358163356781, "learning_rate": 9.279520692087937e-06, "loss": 0.4111, "step": 1518 }, { "epoch": 1.2793374508702975, "grad_norm": 0.45936739444732666, "learning_rate": 9.277999051283333e-06, "loss": 0.4315, "step": 1519 }, { "epoch": 1.2801796743402583, "grad_norm": 0.4258275628089905, "learning_rate": 9.276475930364406e-06, "loss": 0.4552, "step": 1520 }, { "epoch": 1.281021897810219, "grad_norm": 0.42794010043144226, "learning_rate": 9.274951329858133e-06, "loss": 0.37, "step": 1521 }, { "epoch": 1.2818641212801798, "grad_norm": 0.5049923658370972, "learning_rate": 9.273425250291995e-06, "loss": 0.4926, "step": 1522 }, { "epoch": 1.2827063447501403, "grad_norm": 0.4232518672943115, "learning_rate": 9.271897692193993e-06, "loss": 0.3951, "step": 1523 }, { "epoch": 1.283548568220101, "grad_norm": 0.545866072177887, "learning_rate": 9.270368656092637e-06, "loss": 0.4681, "step": 1524 }, { "epoch": 1.2843907916900617, "grad_norm": 0.518126904964447, "learning_rate": 9.268838142516943e-06, "loss": 0.4266, "step": 1525 }, { "epoch": 1.2852330151600224, "grad_norm": 0.5237314701080322, "learning_rate": 9.267306151996447e-06, "loss": 0.409, "step": 1526 }, { "epoch": 1.2860752386299832, "grad_norm": 0.4888157248497009, "learning_rate": 9.265772685061186e-06, "loss": 0.4513, "step": 1527 }, { "epoch": 1.286917462099944, "grad_norm": 0.6658836007118225, "learning_rate": 9.264237742241722e-06, "loss": 0.4285, "step": 1528 }, { "epoch": 1.2877596855699045, "grad_norm": 0.525274395942688, "learning_rate": 9.26270132406911e-06, "loss": 0.4253, "step": 1529 }, { "epoch": 1.2886019090398653, "grad_norm": 0.5907583236694336, "learning_rate": 9.261163431074927e-06, "loss": 0.4551, "step": 1530 }, { "epoch": 1.2894441325098258, "grad_norm": 0.4154834747314453, "learning_rate": 9.25962406379126e-06, "loss": 0.3851, "step": 1531 }, { "epoch": 1.2902863559797866, "grad_norm": 0.5017328858375549, "learning_rate": 9.258083222750702e-06, "loss": 0.4561, "step": 1532 }, { "epoch": 1.2911285794497473, "grad_norm": 0.5060775279998779, "learning_rate": 9.256540908486358e-06, "loss": 0.434, "step": 1533 }, { "epoch": 1.2919708029197081, "grad_norm": 0.5176235437393188, "learning_rate": 9.25499712153184e-06, "loss": 0.4066, "step": 1534 }, { "epoch": 1.2928130263896687, "grad_norm": 0.44604283571243286, "learning_rate": 9.253451862421276e-06, "loss": 0.4583, "step": 1535 }, { "epoch": 1.2936552498596294, "grad_norm": 0.5782433152198792, "learning_rate": 9.251905131689295e-06, "loss": 0.451, "step": 1536 }, { "epoch": 1.2944974733295902, "grad_norm": 0.4580385088920593, "learning_rate": 9.250356929871045e-06, "loss": 0.4085, "step": 1537 }, { "epoch": 1.2953396967995507, "grad_norm": 0.43648722767829895, "learning_rate": 9.248807257502171e-06, "loss": 0.4549, "step": 1538 }, { "epoch": 1.2961819202695115, "grad_norm": 0.45444145798683167, "learning_rate": 9.247256115118835e-06, "loss": 0.3991, "step": 1539 }, { "epoch": 1.2970241437394723, "grad_norm": 0.4980778992176056, "learning_rate": 9.245703503257709e-06, "loss": 0.3881, "step": 1540 }, { "epoch": 1.2978663672094328, "grad_norm": 0.3903530240058899, "learning_rate": 9.244149422455964e-06, "loss": 0.4484, "step": 1541 }, { "epoch": 1.2987085906793936, "grad_norm": 0.4893859028816223, "learning_rate": 9.242593873251291e-06, "loss": 0.4359, "step": 1542 }, { "epoch": 1.2995508141493544, "grad_norm": 0.4726598262786865, "learning_rate": 9.241036856181879e-06, "loss": 0.4134, "step": 1543 }, { "epoch": 1.300393037619315, "grad_norm": 0.4545988440513611, "learning_rate": 9.239478371786432e-06, "loss": 0.4392, "step": 1544 }, { "epoch": 1.3012352610892757, "grad_norm": 0.43904030323028564, "learning_rate": 9.23791842060416e-06, "loss": 0.3973, "step": 1545 }, { "epoch": 1.3020774845592364, "grad_norm": 0.5044408440589905, "learning_rate": 9.236357003174777e-06, "loss": 0.4383, "step": 1546 }, { "epoch": 1.3029197080291972, "grad_norm": 0.39686319231987, "learning_rate": 9.234794120038504e-06, "loss": 0.396, "step": 1547 }, { "epoch": 1.3037619314991578, "grad_norm": 0.4952682852745056, "learning_rate": 9.233229771736078e-06, "loss": 0.4674, "step": 1548 }, { "epoch": 1.3046041549691185, "grad_norm": 0.4831187129020691, "learning_rate": 9.231663958808733e-06, "loss": 0.4496, "step": 1549 }, { "epoch": 1.305446378439079, "grad_norm": 0.4072377681732178, "learning_rate": 9.230096681798213e-06, "loss": 0.4299, "step": 1550 }, { "epoch": 1.3062886019090398, "grad_norm": 0.4859127402305603, "learning_rate": 9.228527941246772e-06, "loss": 0.4336, "step": 1551 }, { "epoch": 1.3071308253790006, "grad_norm": 0.4419572055339813, "learning_rate": 9.226957737697164e-06, "loss": 0.4136, "step": 1552 }, { "epoch": 1.3079730488489614, "grad_norm": 0.39494240283966064, "learning_rate": 9.225386071692655e-06, "loss": 0.4183, "step": 1553 }, { "epoch": 1.308815272318922, "grad_norm": 0.4388789236545563, "learning_rate": 9.223812943777011e-06, "loss": 0.3964, "step": 1554 }, { "epoch": 1.3096574957888827, "grad_norm": 0.47073227167129517, "learning_rate": 9.222238354494512e-06, "loss": 0.4694, "step": 1555 }, { "epoch": 1.3104997192588432, "grad_norm": 0.3945550322532654, "learning_rate": 9.220662304389935e-06, "loss": 0.3901, "step": 1556 }, { "epoch": 1.311341942728804, "grad_norm": 0.41987496614456177, "learning_rate": 9.219084794008569e-06, "loss": 0.4754, "step": 1557 }, { "epoch": 1.3121841661987648, "grad_norm": 0.4330595135688782, "learning_rate": 9.217505823896203e-06, "loss": 0.3699, "step": 1558 }, { "epoch": 1.3130263896687255, "grad_norm": 0.4396032392978668, "learning_rate": 9.215925394599137e-06, "loss": 0.4222, "step": 1559 }, { "epoch": 1.313868613138686, "grad_norm": 0.4480987787246704, "learning_rate": 9.214343506664169e-06, "loss": 0.4443, "step": 1560 }, { "epoch": 1.3147108366086468, "grad_norm": 0.4286564886569977, "learning_rate": 9.212760160638606e-06, "loss": 0.4178, "step": 1561 }, { "epoch": 1.3155530600786074, "grad_norm": 0.5066958665847778, "learning_rate": 9.21117535707026e-06, "loss": 0.4275, "step": 1562 }, { "epoch": 1.3163952835485682, "grad_norm": 0.4252334237098694, "learning_rate": 9.209589096507445e-06, "loss": 0.4378, "step": 1563 }, { "epoch": 1.317237507018529, "grad_norm": 0.4872836768627167, "learning_rate": 9.20800137949898e-06, "loss": 0.3917, "step": 1564 }, { "epoch": 1.3180797304884897, "grad_norm": 0.5196892619132996, "learning_rate": 9.206412206594187e-06, "loss": 0.4456, "step": 1565 }, { "epoch": 1.3189219539584502, "grad_norm": 0.45114099979400635, "learning_rate": 9.204821578342893e-06, "loss": 0.4479, "step": 1566 }, { "epoch": 1.319764177428411, "grad_norm": 0.47835469245910645, "learning_rate": 9.20322949529543e-06, "loss": 0.4144, "step": 1567 }, { "epoch": 1.3206064008983718, "grad_norm": 0.5985450148582458, "learning_rate": 9.201635958002628e-06, "loss": 0.4339, "step": 1568 }, { "epoch": 1.3214486243683323, "grad_norm": 0.40391120314598083, "learning_rate": 9.200040967015828e-06, "loss": 0.4358, "step": 1569 }, { "epoch": 1.322290847838293, "grad_norm": 0.5715929865837097, "learning_rate": 9.198444522886864e-06, "loss": 0.4883, "step": 1570 }, { "epoch": 1.3231330713082539, "grad_norm": 0.49965864419937134, "learning_rate": 9.196846626168084e-06, "loss": 0.3778, "step": 1571 }, { "epoch": 1.3239752947782144, "grad_norm": 0.45927849411964417, "learning_rate": 9.19524727741233e-06, "loss": 0.4499, "step": 1572 }, { "epoch": 1.3248175182481752, "grad_norm": 0.44325965642929077, "learning_rate": 9.193646477172947e-06, "loss": 0.4419, "step": 1573 }, { "epoch": 1.325659741718136, "grad_norm": 0.48808398842811584, "learning_rate": 9.19204422600379e-06, "loss": 0.4443, "step": 1574 }, { "epoch": 1.3265019651880965, "grad_norm": 0.4329679310321808, "learning_rate": 9.190440524459203e-06, "loss": 0.4302, "step": 1575 }, { "epoch": 1.3273441886580573, "grad_norm": 0.4613704979419708, "learning_rate": 9.188835373094047e-06, "loss": 0.4479, "step": 1576 }, { "epoch": 1.328186412128018, "grad_norm": 0.5076000094413757, "learning_rate": 9.187228772463672e-06, "loss": 0.4178, "step": 1577 }, { "epoch": 1.3290286355979788, "grad_norm": 0.3962564766407013, "learning_rate": 9.185620723123935e-06, "loss": 0.411, "step": 1578 }, { "epoch": 1.3298708590679393, "grad_norm": 0.506770133972168, "learning_rate": 9.184011225631197e-06, "loss": 0.4441, "step": 1579 }, { "epoch": 1.3307130825379, "grad_norm": 0.47881758213043213, "learning_rate": 9.182400280542312e-06, "loss": 0.4485, "step": 1580 }, { "epoch": 1.3315553060078607, "grad_norm": 0.5449895858764648, "learning_rate": 9.18078788841464e-06, "loss": 0.4399, "step": 1581 }, { "epoch": 1.3323975294778214, "grad_norm": 0.38550689816474915, "learning_rate": 9.179174049806043e-06, "loss": 0.3729, "step": 1582 }, { "epoch": 1.3332397529477822, "grad_norm": 0.5417882800102234, "learning_rate": 9.177558765274881e-06, "loss": 0.4541, "step": 1583 }, { "epoch": 1.334081976417743, "grad_norm": 0.5130826234817505, "learning_rate": 9.175942035380015e-06, "loss": 0.4727, "step": 1584 }, { "epoch": 1.3349241998877035, "grad_norm": 0.44567933678627014, "learning_rate": 9.174323860680802e-06, "loss": 0.4127, "step": 1585 }, { "epoch": 1.3357664233576643, "grad_norm": 0.4723002314567566, "learning_rate": 9.17270424173711e-06, "loss": 0.4191, "step": 1586 }, { "epoch": 1.3366086468276248, "grad_norm": 0.49456915259361267, "learning_rate": 9.171083179109292e-06, "loss": 0.4156, "step": 1587 }, { "epoch": 1.3374508702975856, "grad_norm": 0.46696969866752625, "learning_rate": 9.169460673358213e-06, "loss": 0.4095, "step": 1588 }, { "epoch": 1.3382930937675463, "grad_norm": 0.5001147389411926, "learning_rate": 9.16783672504523e-06, "loss": 0.466, "step": 1589 }, { "epoch": 1.3391353172375071, "grad_norm": 0.5121516585350037, "learning_rate": 9.1662113347322e-06, "loss": 0.4391, "step": 1590 }, { "epoch": 1.3399775407074677, "grad_norm": 0.42390841245651245, "learning_rate": 9.164584502981483e-06, "loss": 0.3745, "step": 1591 }, { "epoch": 1.3408197641774284, "grad_norm": 0.48743993043899536, "learning_rate": 9.162956230355933e-06, "loss": 0.4257, "step": 1592 }, { "epoch": 1.341661987647389, "grad_norm": 0.4358460605144501, "learning_rate": 9.161326517418906e-06, "loss": 0.4228, "step": 1593 }, { "epoch": 1.3425042111173497, "grad_norm": 0.5028144121170044, "learning_rate": 9.159695364734253e-06, "loss": 0.428, "step": 1594 }, { "epoch": 1.3433464345873105, "grad_norm": 0.4871158301830292, "learning_rate": 9.158062772866326e-06, "loss": 0.4449, "step": 1595 }, { "epoch": 1.3441886580572713, "grad_norm": 0.44183334708213806, "learning_rate": 9.156428742379974e-06, "loss": 0.4516, "step": 1596 }, { "epoch": 1.3450308815272318, "grad_norm": 0.4339861571788788, "learning_rate": 9.154793273840541e-06, "loss": 0.4105, "step": 1597 }, { "epoch": 1.3458731049971926, "grad_norm": 0.4502689838409424, "learning_rate": 9.153156367813876e-06, "loss": 0.4511, "step": 1598 }, { "epoch": 1.3467153284671534, "grad_norm": 0.4224354028701782, "learning_rate": 9.151518024866315e-06, "loss": 0.4043, "step": 1599 }, { "epoch": 1.347557551937114, "grad_norm": 0.5588507056236267, "learning_rate": 9.149878245564699e-06, "loss": 0.44, "step": 1600 }, { "epoch": 1.3483997754070747, "grad_norm": 0.40880969166755676, "learning_rate": 9.148237030476365e-06, "loss": 0.4504, "step": 1601 }, { "epoch": 1.3492419988770354, "grad_norm": 0.46684569120407104, "learning_rate": 9.146594380169142e-06, "loss": 0.3862, "step": 1602 }, { "epoch": 1.350084222346996, "grad_norm": 0.5306395888328552, "learning_rate": 9.144950295211363e-06, "loss": 0.4385, "step": 1603 }, { "epoch": 1.3509264458169568, "grad_norm": 0.4532155990600586, "learning_rate": 9.143304776171848e-06, "loss": 0.4534, "step": 1604 }, { "epoch": 1.3517686692869175, "grad_norm": 0.5570151209831238, "learning_rate": 9.141657823619923e-06, "loss": 0.4188, "step": 1605 }, { "epoch": 1.352610892756878, "grad_norm": 0.4173699617385864, "learning_rate": 9.140009438125404e-06, "loss": 0.4805, "step": 1606 }, { "epoch": 1.3534531162268388, "grad_norm": 0.4586082994937897, "learning_rate": 9.138359620258603e-06, "loss": 0.3956, "step": 1607 }, { "epoch": 1.3542953396967996, "grad_norm": 0.4018876552581787, "learning_rate": 9.13670837059033e-06, "loss": 0.3828, "step": 1608 }, { "epoch": 1.3551375631667604, "grad_norm": 0.48617249727249146, "learning_rate": 9.135055689691887e-06, "loss": 0.4414, "step": 1609 }, { "epoch": 1.355979786636721, "grad_norm": 0.4621151387691498, "learning_rate": 9.133401578135076e-06, "loss": 0.4139, "step": 1610 }, { "epoch": 1.3568220101066817, "grad_norm": 0.6061564087867737, "learning_rate": 9.13174603649219e-06, "loss": 0.4739, "step": 1611 }, { "epoch": 1.3576642335766422, "grad_norm": 0.4624588191509247, "learning_rate": 9.130089065336018e-06, "loss": 0.4043, "step": 1612 }, { "epoch": 1.358506457046603, "grad_norm": 0.426770955324173, "learning_rate": 9.128430665239842e-06, "loss": 0.4136, "step": 1613 }, { "epoch": 1.3593486805165638, "grad_norm": 0.5920709371566772, "learning_rate": 9.126770836777443e-06, "loss": 0.4353, "step": 1614 }, { "epoch": 1.3601909039865245, "grad_norm": 0.43033164739608765, "learning_rate": 9.125109580523093e-06, "loss": 0.3919, "step": 1615 }, { "epoch": 1.361033127456485, "grad_norm": 0.45799171924591064, "learning_rate": 9.123446897051556e-06, "loss": 0.4207, "step": 1616 }, { "epoch": 1.3618753509264458, "grad_norm": 0.450250506401062, "learning_rate": 9.121782786938091e-06, "loss": 0.4286, "step": 1617 }, { "epoch": 1.3627175743964064, "grad_norm": 0.5233408212661743, "learning_rate": 9.120117250758455e-06, "loss": 0.4874, "step": 1618 }, { "epoch": 1.3635597978663672, "grad_norm": 0.4073256850242615, "learning_rate": 9.118450289088893e-06, "loss": 0.4315, "step": 1619 }, { "epoch": 1.364402021336328, "grad_norm": 0.46815356612205505, "learning_rate": 9.116781902506147e-06, "loss": 0.4536, "step": 1620 }, { "epoch": 1.3652442448062887, "grad_norm": 0.42569711804389954, "learning_rate": 9.115112091587448e-06, "loss": 0.3863, "step": 1621 }, { "epoch": 1.3660864682762492, "grad_norm": 0.455709308385849, "learning_rate": 9.113440856910523e-06, "loss": 0.4552, "step": 1622 }, { "epoch": 1.36692869174621, "grad_norm": 0.4189848005771637, "learning_rate": 9.111768199053588e-06, "loss": 0.3926, "step": 1623 }, { "epoch": 1.3677709152161706, "grad_norm": 0.3921946883201599, "learning_rate": 9.110094118595358e-06, "loss": 0.404, "step": 1624 }, { "epoch": 1.3686131386861313, "grad_norm": 0.42861810326576233, "learning_rate": 9.108418616115035e-06, "loss": 0.4283, "step": 1625 }, { "epoch": 1.369455362156092, "grad_norm": 0.4474807679653168, "learning_rate": 9.106741692192312e-06, "loss": 0.4372, "step": 1626 }, { "epoch": 1.3702975856260529, "grad_norm": 0.40630075335502625, "learning_rate": 9.10506334740738e-06, "loss": 0.41, "step": 1627 }, { "epoch": 1.3711398090960134, "grad_norm": 0.46563005447387695, "learning_rate": 9.103383582340913e-06, "loss": 0.4151, "step": 1628 }, { "epoch": 1.3719820325659742, "grad_norm": 0.5104817152023315, "learning_rate": 9.101702397574082e-06, "loss": 0.4325, "step": 1629 }, { "epoch": 1.372824256035935, "grad_norm": 0.4601053297519684, "learning_rate": 9.10001979368855e-06, "loss": 0.4218, "step": 1630 }, { "epoch": 1.3736664795058955, "grad_norm": 0.4869697391986847, "learning_rate": 9.098335771266466e-06, "loss": 0.3986, "step": 1631 }, { "epoch": 1.3745087029758563, "grad_norm": 0.5945298075675964, "learning_rate": 9.096650330890477e-06, "loss": 0.4267, "step": 1632 }, { "epoch": 1.375350926445817, "grad_norm": 0.432952880859375, "learning_rate": 9.094963473143715e-06, "loss": 0.4391, "step": 1633 }, { "epoch": 1.3761931499157778, "grad_norm": 0.4788016676902771, "learning_rate": 9.093275198609802e-06, "loss": 0.4011, "step": 1634 }, { "epoch": 1.3770353733857383, "grad_norm": 0.55423903465271, "learning_rate": 9.091585507872853e-06, "loss": 0.3937, "step": 1635 }, { "epoch": 1.377877596855699, "grad_norm": 0.5208324790000916, "learning_rate": 9.089894401517474e-06, "loss": 0.4453, "step": 1636 }, { "epoch": 1.3787198203256597, "grad_norm": 0.4711167514324188, "learning_rate": 9.088201880128755e-06, "loss": 0.4079, "step": 1637 }, { "epoch": 1.3795620437956204, "grad_norm": 0.5165712833404541, "learning_rate": 9.086507944292283e-06, "loss": 0.4191, "step": 1638 }, { "epoch": 1.3804042672655812, "grad_norm": 0.4982201159000397, "learning_rate": 9.08481259459413e-06, "loss": 0.4334, "step": 1639 }, { "epoch": 1.381246490735542, "grad_norm": 0.41858235001564026, "learning_rate": 9.083115831620855e-06, "loss": 0.4243, "step": 1640 }, { "epoch": 1.3820887142055025, "grad_norm": 0.4640701115131378, "learning_rate": 9.081417655959511e-06, "loss": 0.4502, "step": 1641 }, { "epoch": 1.3829309376754633, "grad_norm": 0.38916051387786865, "learning_rate": 9.079718068197638e-06, "loss": 0.4207, "step": 1642 }, { "epoch": 1.3837731611454238, "grad_norm": 0.41813981533050537, "learning_rate": 9.078017068923266e-06, "loss": 0.4402, "step": 1643 }, { "epoch": 1.3846153846153846, "grad_norm": 0.39099475741386414, "learning_rate": 9.076314658724907e-06, "loss": 0.394, "step": 1644 }, { "epoch": 1.3854576080853453, "grad_norm": 0.45377224683761597, "learning_rate": 9.07461083819157e-06, "loss": 0.4359, "step": 1645 }, { "epoch": 1.3862998315553061, "grad_norm": 0.4653199315071106, "learning_rate": 9.072905607912745e-06, "loss": 0.4593, "step": 1646 }, { "epoch": 1.3871420550252667, "grad_norm": 0.46016189455986023, "learning_rate": 9.071198968478414e-06, "loss": 0.3927, "step": 1647 }, { "epoch": 1.3879842784952274, "grad_norm": 0.5273410677909851, "learning_rate": 9.069490920479043e-06, "loss": 0.4491, "step": 1648 }, { "epoch": 1.388826501965188, "grad_norm": 0.44343096017837524, "learning_rate": 9.06778146450559e-06, "loss": 0.3812, "step": 1649 }, { "epoch": 1.3896687254351487, "grad_norm": 0.427127867937088, "learning_rate": 9.066070601149497e-06, "loss": 0.4295, "step": 1650 }, { "epoch": 1.3905109489051095, "grad_norm": 0.4201031029224396, "learning_rate": 9.064358331002692e-06, "loss": 0.4189, "step": 1651 }, { "epoch": 1.3913531723750703, "grad_norm": 0.43095865845680237, "learning_rate": 9.062644654657591e-06, "loss": 0.4119, "step": 1652 }, { "epoch": 1.3921953958450308, "grad_norm": 0.4107137620449066, "learning_rate": 9.060929572707096e-06, "loss": 0.4229, "step": 1653 }, { "epoch": 1.3930376193149916, "grad_norm": 0.4688437283039093, "learning_rate": 9.0592130857446e-06, "loss": 0.437, "step": 1654 }, { "epoch": 1.3938798427849521, "grad_norm": 0.4342764914035797, "learning_rate": 9.057495194363974e-06, "loss": 0.404, "step": 1655 }, { "epoch": 1.394722066254913, "grad_norm": 0.3883206248283386, "learning_rate": 9.05577589915958e-06, "loss": 0.4239, "step": 1656 }, { "epoch": 1.3955642897248737, "grad_norm": 0.44637325406074524, "learning_rate": 9.054055200726263e-06, "loss": 0.4404, "step": 1657 }, { "epoch": 1.3964065131948344, "grad_norm": 0.46745821833610535, "learning_rate": 9.05233309965936e-06, "loss": 0.4011, "step": 1658 }, { "epoch": 1.397248736664795, "grad_norm": 0.3758101165294647, "learning_rate": 9.050609596554685e-06, "loss": 0.398, "step": 1659 }, { "epoch": 1.3980909601347558, "grad_norm": 0.4378582537174225, "learning_rate": 9.048884692008542e-06, "loss": 0.4311, "step": 1660 }, { "epoch": 1.3989331836047165, "grad_norm": 0.4112163782119751, "learning_rate": 9.047158386617717e-06, "loss": 0.4458, "step": 1661 }, { "epoch": 1.399775407074677, "grad_norm": 0.3879702389240265, "learning_rate": 9.04543068097948e-06, "loss": 0.4239, "step": 1662 }, { "epoch": 1.4006176305446378, "grad_norm": 0.49702873826026917, "learning_rate": 9.043701575691593e-06, "loss": 0.4231, "step": 1663 }, { "epoch": 1.4014598540145986, "grad_norm": 0.4315569996833801, "learning_rate": 9.041971071352292e-06, "loss": 0.4152, "step": 1664 }, { "epoch": 1.4023020774845594, "grad_norm": 0.4170345962047577, "learning_rate": 9.040239168560302e-06, "loss": 0.4109, "step": 1665 }, { "epoch": 1.40314430095452, "grad_norm": 0.4659956097602844, "learning_rate": 9.038505867914837e-06, "loss": 0.4541, "step": 1666 }, { "epoch": 1.4039865244244807, "grad_norm": 0.4951450228691101, "learning_rate": 9.036771170015584e-06, "loss": 0.4323, "step": 1667 }, { "epoch": 1.4048287478944412, "grad_norm": 0.4709434509277344, "learning_rate": 9.03503507546272e-06, "loss": 0.4175, "step": 1668 }, { "epoch": 1.405670971364402, "grad_norm": 0.47694265842437744, "learning_rate": 9.033297584856906e-06, "loss": 0.4267, "step": 1669 }, { "epoch": 1.4065131948343628, "grad_norm": 0.500157356262207, "learning_rate": 9.031558698799281e-06, "loss": 0.4409, "step": 1670 }, { "epoch": 1.4073554183043235, "grad_norm": 0.4246869385242462, "learning_rate": 9.02981841789147e-06, "loss": 0.3817, "step": 1671 }, { "epoch": 1.408197641774284, "grad_norm": 0.5015645623207092, "learning_rate": 9.028076742735583e-06, "loss": 0.4763, "step": 1672 }, { "epoch": 1.4090398652442448, "grad_norm": 0.4633517563343048, "learning_rate": 9.026333673934206e-06, "loss": 0.4564, "step": 1673 }, { "epoch": 1.4098820887142054, "grad_norm": 0.4208310842514038, "learning_rate": 9.024589212090416e-06, "loss": 0.4069, "step": 1674 }, { "epoch": 1.4107243121841662, "grad_norm": 0.42503952980041504, "learning_rate": 9.022843357807763e-06, "loss": 0.4613, "step": 1675 }, { "epoch": 1.411566535654127, "grad_norm": 0.41029393672943115, "learning_rate": 9.021096111690283e-06, "loss": 0.4255, "step": 1676 }, { "epoch": 1.4124087591240877, "grad_norm": 0.44210487604141235, "learning_rate": 9.019347474342493e-06, "loss": 0.3994, "step": 1677 }, { "epoch": 1.4132509825940482, "grad_norm": 0.4540521204471588, "learning_rate": 9.017597446369392e-06, "loss": 0.4669, "step": 1678 }, { "epoch": 1.414093206064009, "grad_norm": 0.5488752722740173, "learning_rate": 9.015846028376463e-06, "loss": 0.4328, "step": 1679 }, { "epoch": 1.4149354295339696, "grad_norm": 0.4598940908908844, "learning_rate": 9.01409322096966e-06, "loss": 0.4033, "step": 1680 }, { "epoch": 1.4157776530039303, "grad_norm": 0.47420692443847656, "learning_rate": 9.012339024755429e-06, "loss": 0.4275, "step": 1681 }, { "epoch": 1.416619876473891, "grad_norm": 0.4501713514328003, "learning_rate": 9.010583440340693e-06, "loss": 0.419, "step": 1682 }, { "epoch": 1.4174620999438519, "grad_norm": 0.4389709234237671, "learning_rate": 9.00882646833285e-06, "loss": 0.3667, "step": 1683 }, { "epoch": 1.4183043234138124, "grad_norm": 0.4822598993778229, "learning_rate": 9.007068109339783e-06, "loss": 0.4574, "step": 1684 }, { "epoch": 1.4191465468837732, "grad_norm": 0.5037694573402405, "learning_rate": 9.005308363969858e-06, "loss": 0.4145, "step": 1685 }, { "epoch": 1.4199887703537337, "grad_norm": 0.5200945734977722, "learning_rate": 9.003547232831911e-06, "loss": 0.4152, "step": 1686 }, { "epoch": 1.4208309938236945, "grad_norm": 0.43716001510620117, "learning_rate": 9.001784716535267e-06, "loss": 0.384, "step": 1687 }, { "epoch": 1.4216732172936553, "grad_norm": 0.6132208108901978, "learning_rate": 9.000020815689725e-06, "loss": 0.4882, "step": 1688 }, { "epoch": 1.422515440763616, "grad_norm": 0.4909684658050537, "learning_rate": 8.998255530905566e-06, "loss": 0.3999, "step": 1689 }, { "epoch": 1.4233576642335766, "grad_norm": 0.547904908657074, "learning_rate": 8.996488862793545e-06, "loss": 0.449, "step": 1690 }, { "epoch": 1.4241998877035373, "grad_norm": 0.501721203327179, "learning_rate": 8.994720811964902e-06, "loss": 0.439, "step": 1691 }, { "epoch": 1.425042111173498, "grad_norm": 0.44935330748558044, "learning_rate": 8.992951379031351e-06, "loss": 0.4168, "step": 1692 }, { "epoch": 1.4258843346434587, "grad_norm": 0.4915817081928253, "learning_rate": 8.991180564605086e-06, "loss": 0.4956, "step": 1693 }, { "epoch": 1.4267265581134194, "grad_norm": 0.42701271176338196, "learning_rate": 8.98940836929878e-06, "loss": 0.3813, "step": 1694 }, { "epoch": 1.4275687815833802, "grad_norm": 0.44306331872940063, "learning_rate": 8.987634793725577e-06, "loss": 0.4304, "step": 1695 }, { "epoch": 1.428411005053341, "grad_norm": 0.4849162697792053, "learning_rate": 8.98585983849911e-06, "loss": 0.393, "step": 1696 }, { "epoch": 1.4292532285233015, "grad_norm": 0.4882151186466217, "learning_rate": 8.984083504233478e-06, "loss": 0.4864, "step": 1697 }, { "epoch": 1.4300954519932623, "grad_norm": 0.4829281270503998, "learning_rate": 8.982305791543264e-06, "loss": 0.4362, "step": 1698 }, { "epoch": 1.4309376754632228, "grad_norm": 0.46609070897102356, "learning_rate": 8.980526701043528e-06, "loss": 0.3776, "step": 1699 }, { "epoch": 1.4317798989331836, "grad_norm": 0.5277087092399597, "learning_rate": 8.978746233349803e-06, "loss": 0.4053, "step": 1700 }, { "epoch": 1.4326221224031443, "grad_norm": 0.45911088585853577, "learning_rate": 8.9769643890781e-06, "loss": 0.4537, "step": 1701 }, { "epoch": 1.4334643458731051, "grad_norm": 0.5376895070075989, "learning_rate": 8.975181168844908e-06, "loss": 0.4686, "step": 1702 }, { "epoch": 1.4343065693430657, "grad_norm": 0.5087576508522034, "learning_rate": 8.97339657326719e-06, "loss": 0.4258, "step": 1703 }, { "epoch": 1.4351487928130264, "grad_norm": 0.4762192368507385, "learning_rate": 8.971610602962384e-06, "loss": 0.4703, "step": 1704 }, { "epoch": 1.435991016282987, "grad_norm": 0.4522605836391449, "learning_rate": 8.969823258548408e-06, "loss": 0.4268, "step": 1705 }, { "epoch": 1.4368332397529477, "grad_norm": 0.4621274173259735, "learning_rate": 8.968034540643648e-06, "loss": 0.4365, "step": 1706 }, { "epoch": 1.4376754632229085, "grad_norm": 0.39732030034065247, "learning_rate": 8.966244449866975e-06, "loss": 0.4214, "step": 1707 }, { "epoch": 1.4385176866928693, "grad_norm": 0.43669217824935913, "learning_rate": 8.964452986837725e-06, "loss": 0.4004, "step": 1708 }, { "epoch": 1.4393599101628298, "grad_norm": 0.48937785625457764, "learning_rate": 8.962660152175717e-06, "loss": 0.4687, "step": 1709 }, { "epoch": 1.4402021336327906, "grad_norm": 0.3530083894729614, "learning_rate": 8.960865946501241e-06, "loss": 0.3866, "step": 1710 }, { "epoch": 1.4410443571027511, "grad_norm": 0.5206476449966431, "learning_rate": 8.959070370435058e-06, "loss": 0.4434, "step": 1711 }, { "epoch": 1.441886580572712, "grad_norm": 0.3872814178466797, "learning_rate": 8.95727342459841e-06, "loss": 0.3956, "step": 1712 }, { "epoch": 1.4427288040426727, "grad_norm": 0.4733298718929291, "learning_rate": 8.955475109613008e-06, "loss": 0.4412, "step": 1713 }, { "epoch": 1.4435710275126334, "grad_norm": 0.41190043091773987, "learning_rate": 8.953675426101039e-06, "loss": 0.4357, "step": 1714 }, { "epoch": 1.444413250982594, "grad_norm": 0.4557223320007324, "learning_rate": 8.951874374685162e-06, "loss": 0.4185, "step": 1715 }, { "epoch": 1.4452554744525548, "grad_norm": 0.44398990273475647, "learning_rate": 8.950071955988508e-06, "loss": 0.4514, "step": 1716 }, { "epoch": 1.4460976979225155, "grad_norm": 0.3940070569515228, "learning_rate": 8.948268170634687e-06, "loss": 0.3917, "step": 1717 }, { "epoch": 1.446939921392476, "grad_norm": 0.4797621965408325, "learning_rate": 8.946463019247774e-06, "loss": 0.4329, "step": 1718 }, { "epoch": 1.4477821448624368, "grad_norm": 0.41780275106430054, "learning_rate": 8.944656502452322e-06, "loss": 0.4256, "step": 1719 }, { "epoch": 1.4486243683323976, "grad_norm": 0.40672406554222107, "learning_rate": 8.942848620873356e-06, "loss": 0.4299, "step": 1720 }, { "epoch": 1.4494665918023582, "grad_norm": 0.5384768843650818, "learning_rate": 8.94103937513637e-06, "loss": 0.4716, "step": 1721 }, { "epoch": 1.450308815272319, "grad_norm": 0.43744954466819763, "learning_rate": 8.939228765867335e-06, "loss": 0.4175, "step": 1722 }, { "epoch": 1.4511510387422797, "grad_norm": 0.4963338077068329, "learning_rate": 8.937416793692688e-06, "loss": 0.4852, "step": 1723 }, { "epoch": 1.4519932622122402, "grad_norm": 0.5222018361091614, "learning_rate": 8.935603459239342e-06, "loss": 0.4272, "step": 1724 }, { "epoch": 1.452835485682201, "grad_norm": 0.4396589994430542, "learning_rate": 8.933788763134677e-06, "loss": 0.4197, "step": 1725 }, { "epoch": 1.4536777091521618, "grad_norm": 0.4306478500366211, "learning_rate": 8.93197270600655e-06, "loss": 0.3983, "step": 1726 }, { "epoch": 1.4545199326221225, "grad_norm": 0.6117476224899292, "learning_rate": 8.93015528848328e-06, "loss": 0.4634, "step": 1727 }, { "epoch": 1.455362156092083, "grad_norm": 0.46932604908943176, "learning_rate": 8.92833651119367e-06, "loss": 0.4352, "step": 1728 }, { "epoch": 1.4562043795620438, "grad_norm": 0.5209106206893921, "learning_rate": 8.92651637476698e-06, "loss": 0.4327, "step": 1729 }, { "epoch": 1.4570466030320044, "grad_norm": 0.49242115020751953, "learning_rate": 8.92469487983295e-06, "loss": 0.427, "step": 1730 }, { "epoch": 1.4578888265019652, "grad_norm": 0.5172726511955261, "learning_rate": 8.922872027021783e-06, "loss": 0.4282, "step": 1731 }, { "epoch": 1.458731049971926, "grad_norm": 0.5153068900108337, "learning_rate": 8.921047816964157e-06, "loss": 0.423, "step": 1732 }, { "epoch": 1.4595732734418867, "grad_norm": 0.4783433675765991, "learning_rate": 8.919222250291215e-06, "loss": 0.4015, "step": 1733 }, { "epoch": 1.4604154969118472, "grad_norm": 0.4175061583518982, "learning_rate": 8.917395327634573e-06, "loss": 0.4242, "step": 1734 }, { "epoch": 1.461257720381808, "grad_norm": 0.5708388090133667, "learning_rate": 8.915567049626317e-06, "loss": 0.478, "step": 1735 }, { "epoch": 1.4620999438517686, "grad_norm": 0.49804964661598206, "learning_rate": 8.913737416898995e-06, "loss": 0.3937, "step": 1736 }, { "epoch": 1.4629421673217293, "grad_norm": 0.41671475768089294, "learning_rate": 8.911906430085632e-06, "loss": 0.3907, "step": 1737 }, { "epoch": 1.46378439079169, "grad_norm": 0.45887747406959534, "learning_rate": 8.91007408981972e-06, "loss": 0.4828, "step": 1738 }, { "epoch": 1.4646266142616509, "grad_norm": 0.4434395134449005, "learning_rate": 8.908240396735213e-06, "loss": 0.386, "step": 1739 }, { "epoch": 1.4654688377316114, "grad_norm": 0.42664727568626404, "learning_rate": 8.906405351466539e-06, "loss": 0.4668, "step": 1740 }, { "epoch": 1.4663110612015722, "grad_norm": 0.417929470539093, "learning_rate": 8.904568954648592e-06, "loss": 0.3972, "step": 1741 }, { "epoch": 1.4671532846715327, "grad_norm": 0.43188968300819397, "learning_rate": 8.902731206916736e-06, "loss": 0.4336, "step": 1742 }, { "epoch": 1.4679955081414935, "grad_norm": 0.527172327041626, "learning_rate": 8.900892108906795e-06, "loss": 0.4492, "step": 1743 }, { "epoch": 1.4688377316114543, "grad_norm": 0.3941579759120941, "learning_rate": 8.899051661255071e-06, "loss": 0.4082, "step": 1744 }, { "epoch": 1.469679955081415, "grad_norm": 0.46658822894096375, "learning_rate": 8.897209864598327e-06, "loss": 0.4224, "step": 1745 }, { "epoch": 1.4705221785513756, "grad_norm": 0.41723257303237915, "learning_rate": 8.895366719573787e-06, "loss": 0.3863, "step": 1746 }, { "epoch": 1.4713644020213363, "grad_norm": 0.45445165038108826, "learning_rate": 8.893522226819154e-06, "loss": 0.4451, "step": 1747 }, { "epoch": 1.472206625491297, "grad_norm": 0.4443155825138092, "learning_rate": 8.891676386972588e-06, "loss": 0.396, "step": 1748 }, { "epoch": 1.4730488489612577, "grad_norm": 0.4979252219200134, "learning_rate": 8.889829200672719e-06, "loss": 0.4953, "step": 1749 }, { "epoch": 1.4738910724312184, "grad_norm": 0.4714994728565216, "learning_rate": 8.887980668558642e-06, "loss": 0.4526, "step": 1750 }, { "epoch": 1.4747332959011792, "grad_norm": 0.40021127462387085, "learning_rate": 8.886130791269915e-06, "loss": 0.4319, "step": 1751 }, { "epoch": 1.4755755193711397, "grad_norm": 0.4642370045185089, "learning_rate": 8.884279569446566e-06, "loss": 0.4463, "step": 1752 }, { "epoch": 1.4764177428411005, "grad_norm": 0.4545647203922272, "learning_rate": 8.882427003729087e-06, "loss": 0.4048, "step": 1753 }, { "epoch": 1.4772599663110613, "grad_norm": 0.42290374636650085, "learning_rate": 8.88057309475843e-06, "loss": 0.4019, "step": 1754 }, { "epoch": 1.4781021897810218, "grad_norm": 0.47162893414497375, "learning_rate": 8.878717843176018e-06, "loss": 0.484, "step": 1755 }, { "epoch": 1.4789444132509826, "grad_norm": 0.35421669483184814, "learning_rate": 8.87686124962374e-06, "loss": 0.3916, "step": 1756 }, { "epoch": 1.4797866367209433, "grad_norm": 0.473965048789978, "learning_rate": 8.87500331474394e-06, "loss": 0.4167, "step": 1757 }, { "epoch": 1.4806288601909041, "grad_norm": 0.40414685010910034, "learning_rate": 8.873144039179433e-06, "loss": 0.4469, "step": 1758 }, { "epoch": 1.4814710836608647, "grad_norm": 0.4462132155895233, "learning_rate": 8.871283423573499e-06, "loss": 0.4245, "step": 1759 }, { "epoch": 1.4823133071308254, "grad_norm": 0.4093773365020752, "learning_rate": 8.869421468569874e-06, "loss": 0.4139, "step": 1760 }, { "epoch": 1.483155530600786, "grad_norm": 0.5118158459663391, "learning_rate": 8.867558174812767e-06, "loss": 0.4364, "step": 1761 }, { "epoch": 1.4839977540707467, "grad_norm": 0.4156516492366791, "learning_rate": 8.865693542946845e-06, "loss": 0.422, "step": 1762 }, { "epoch": 1.4848399775407075, "grad_norm": 0.4128626883029938, "learning_rate": 8.86382757361724e-06, "loss": 0.4136, "step": 1763 }, { "epoch": 1.4856822010106683, "grad_norm": 0.4844525158405304, "learning_rate": 8.86196026746954e-06, "loss": 0.4418, "step": 1764 }, { "epoch": 1.4865244244806288, "grad_norm": 0.46558845043182373, "learning_rate": 8.860091625149804e-06, "loss": 0.4719, "step": 1765 }, { "epoch": 1.4873666479505896, "grad_norm": 0.4618058502674103, "learning_rate": 8.858221647304554e-06, "loss": 0.4231, "step": 1766 }, { "epoch": 1.4882088714205501, "grad_norm": 0.4906204640865326, "learning_rate": 8.856350334580764e-06, "loss": 0.4372, "step": 1767 }, { "epoch": 1.489051094890511, "grad_norm": 0.4999374747276306, "learning_rate": 8.85447768762588e-06, "loss": 0.4142, "step": 1768 }, { "epoch": 1.4898933183604717, "grad_norm": 0.4267418682575226, "learning_rate": 8.852603707087804e-06, "loss": 0.4153, "step": 1769 }, { "epoch": 1.4907355418304324, "grad_norm": 0.4718432128429413, "learning_rate": 8.850728393614903e-06, "loss": 0.418, "step": 1770 }, { "epoch": 1.491577765300393, "grad_norm": 0.44175803661346436, "learning_rate": 8.848851747856003e-06, "loss": 0.3971, "step": 1771 }, { "epoch": 1.4924199887703538, "grad_norm": 0.5120987892150879, "learning_rate": 8.846973770460387e-06, "loss": 0.4308, "step": 1772 }, { "epoch": 1.4932622122403143, "grad_norm": 0.4630904793739319, "learning_rate": 8.845094462077807e-06, "loss": 0.4558, "step": 1773 }, { "epoch": 1.494104435710275, "grad_norm": 0.4522722661495209, "learning_rate": 8.843213823358474e-06, "loss": 0.4185, "step": 1774 }, { "epoch": 1.4949466591802358, "grad_norm": 0.4797394871711731, "learning_rate": 8.841331854953052e-06, "loss": 0.4143, "step": 1775 }, { "epoch": 1.4957888826501966, "grad_norm": 0.4420163631439209, "learning_rate": 8.839448557512671e-06, "loss": 0.4457, "step": 1776 }, { "epoch": 1.4966311061201572, "grad_norm": 0.41434893012046814, "learning_rate": 8.83756393168892e-06, "loss": 0.4232, "step": 1777 }, { "epoch": 1.497473329590118, "grad_norm": 0.5004256367683411, "learning_rate": 8.835677978133847e-06, "loss": 0.4655, "step": 1778 }, { "epoch": 1.4983155530600787, "grad_norm": 0.46632516384124756, "learning_rate": 8.833790697499959e-06, "loss": 0.425, "step": 1779 }, { "epoch": 1.4991577765300392, "grad_norm": 0.4400708079338074, "learning_rate": 8.831902090440225e-06, "loss": 0.4237, "step": 1780 }, { "epoch": 1.5, "grad_norm": 0.4824596643447876, "learning_rate": 8.830012157608069e-06, "loss": 0.4122, "step": 1781 }, { "epoch": 1.5008422234699608, "grad_norm": 0.5061484575271606, "learning_rate": 8.828120899657375e-06, "loss": 0.4245, "step": 1782 }, { "epoch": 1.5016844469399215, "grad_norm": 0.4662112891674042, "learning_rate": 8.826228317242487e-06, "loss": 0.4399, "step": 1783 }, { "epoch": 1.502526670409882, "grad_norm": 0.5218390226364136, "learning_rate": 8.824334411018205e-06, "loss": 0.4413, "step": 1784 }, { "epoch": 1.5033688938798426, "grad_norm": 0.4812275171279907, "learning_rate": 8.822439181639789e-06, "loss": 0.4269, "step": 1785 }, { "epoch": 1.5042111173498034, "grad_norm": 0.46809783577919006, "learning_rate": 8.820542629762955e-06, "loss": 0.4297, "step": 1786 }, { "epoch": 1.5050533408197642, "grad_norm": 0.5321475863456726, "learning_rate": 8.818644756043878e-06, "loss": 0.462, "step": 1787 }, { "epoch": 1.505895564289725, "grad_norm": 0.46229827404022217, "learning_rate": 8.81674556113919e-06, "loss": 0.394, "step": 1788 }, { "epoch": 1.5067377877596857, "grad_norm": 0.5621922612190247, "learning_rate": 8.814845045705978e-06, "loss": 0.4729, "step": 1789 }, { "epoch": 1.5075800112296462, "grad_norm": 0.4815962314605713, "learning_rate": 8.81294321040179e-06, "loss": 0.4023, "step": 1790 }, { "epoch": 1.508422234699607, "grad_norm": 0.48737606406211853, "learning_rate": 8.811040055884629e-06, "loss": 0.4647, "step": 1791 }, { "epoch": 1.5092644581695676, "grad_norm": 0.4182209372520447, "learning_rate": 8.809135582812951e-06, "loss": 0.383, "step": 1792 }, { "epoch": 1.5101066816395283, "grad_norm": 0.4757055938243866, "learning_rate": 8.807229791845673e-06, "loss": 0.4548, "step": 1793 }, { "epoch": 1.510948905109489, "grad_norm": 0.4349101483821869, "learning_rate": 8.805322683642166e-06, "loss": 0.4358, "step": 1794 }, { "epoch": 1.5117911285794499, "grad_norm": 0.39926400780677795, "learning_rate": 8.803414258862255e-06, "loss": 0.4259, "step": 1795 }, { "epoch": 1.5126333520494104, "grad_norm": 0.3486942648887634, "learning_rate": 8.801504518166223e-06, "loss": 0.3517, "step": 1796 }, { "epoch": 1.5134755755193712, "grad_norm": 0.4794962406158447, "learning_rate": 8.79959346221481e-06, "loss": 0.4543, "step": 1797 }, { "epoch": 1.5143177989893317, "grad_norm": 0.5402107834815979, "learning_rate": 8.797681091669206e-06, "loss": 0.451, "step": 1798 }, { "epoch": 1.5151600224592925, "grad_norm": 0.3842420279979706, "learning_rate": 8.79576740719106e-06, "loss": 0.4189, "step": 1799 }, { "epoch": 1.5160022459292533, "grad_norm": 0.5624842643737793, "learning_rate": 8.79385240944247e-06, "loss": 0.4166, "step": 1800 }, { "epoch": 1.516844469399214, "grad_norm": 0.4692447781562805, "learning_rate": 8.791936099085999e-06, "loss": 0.3823, "step": 1801 }, { "epoch": 1.5176866928691746, "grad_norm": 0.38963061571121216, "learning_rate": 8.790018476784653e-06, "loss": 0.4093, "step": 1802 }, { "epoch": 1.5185289163391353, "grad_norm": 0.41782960295677185, "learning_rate": 8.788099543201896e-06, "loss": 0.4646, "step": 1803 }, { "epoch": 1.5193711398090959, "grad_norm": 0.4876917600631714, "learning_rate": 8.786179299001648e-06, "loss": 0.4339, "step": 1804 }, { "epoch": 1.5202133632790567, "grad_norm": 0.41549503803253174, "learning_rate": 8.784257744848279e-06, "loss": 0.4275, "step": 1805 }, { "epoch": 1.5210555867490174, "grad_norm": 0.41871294379234314, "learning_rate": 8.782334881406617e-06, "loss": 0.389, "step": 1806 }, { "epoch": 1.5218978102189782, "grad_norm": 0.47432196140289307, "learning_rate": 8.780410709341935e-06, "loss": 0.4511, "step": 1807 }, { "epoch": 1.522740033688939, "grad_norm": 0.4098506569862366, "learning_rate": 8.778485229319969e-06, "loss": 0.443, "step": 1808 }, { "epoch": 1.5235822571588995, "grad_norm": 0.45061343908309937, "learning_rate": 8.7765584420069e-06, "loss": 0.4125, "step": 1809 }, { "epoch": 1.52442448062886, "grad_norm": 0.4532206058502197, "learning_rate": 8.77463034806936e-06, "loss": 0.434, "step": 1810 }, { "epoch": 1.5252667040988208, "grad_norm": 0.4080379903316498, "learning_rate": 8.77270094817444e-06, "loss": 0.4279, "step": 1811 }, { "epoch": 1.5261089275687816, "grad_norm": 0.4261135160923004, "learning_rate": 8.770770242989679e-06, "loss": 0.451, "step": 1812 }, { "epoch": 1.5269511510387423, "grad_norm": 0.4634437561035156, "learning_rate": 8.768838233183065e-06, "loss": 0.3907, "step": 1813 }, { "epoch": 1.5277933745087031, "grad_norm": 0.4767743945121765, "learning_rate": 8.766904919423044e-06, "loss": 0.445, "step": 1814 }, { "epoch": 1.5286355979786637, "grad_norm": 0.42262527346611023, "learning_rate": 8.764970302378509e-06, "loss": 0.4146, "step": 1815 }, { "epoch": 1.5294778214486242, "grad_norm": 0.5009405612945557, "learning_rate": 8.763034382718802e-06, "loss": 0.4664, "step": 1816 }, { "epoch": 1.530320044918585, "grad_norm": 0.4027503728866577, "learning_rate": 8.76109716111372e-06, "loss": 0.4257, "step": 1817 }, { "epoch": 1.5311622683885457, "grad_norm": 0.42649319767951965, "learning_rate": 8.759158638233508e-06, "loss": 0.4315, "step": 1818 }, { "epoch": 1.5320044918585065, "grad_norm": 0.4686887264251709, "learning_rate": 8.75721881474886e-06, "loss": 0.4551, "step": 1819 }, { "epoch": 1.5328467153284673, "grad_norm": 0.5564696192741394, "learning_rate": 8.755277691330925e-06, "loss": 0.4589, "step": 1820 }, { "epoch": 1.5336889387984278, "grad_norm": 0.3886123597621918, "learning_rate": 8.753335268651296e-06, "loss": 0.4125, "step": 1821 }, { "epoch": 1.5345311622683886, "grad_norm": 0.5388073921203613, "learning_rate": 8.751391547382017e-06, "loss": 0.4463, "step": 1822 }, { "epoch": 1.5353733857383491, "grad_norm": 0.5244566202163696, "learning_rate": 8.749446528195584e-06, "loss": 0.4142, "step": 1823 }, { "epoch": 1.53621560920831, "grad_norm": 0.4923776388168335, "learning_rate": 8.74750021176494e-06, "loss": 0.4479, "step": 1824 }, { "epoch": 1.5370578326782707, "grad_norm": 0.4106716513633728, "learning_rate": 8.745552598763477e-06, "loss": 0.3796, "step": 1825 }, { "epoch": 1.5379000561482314, "grad_norm": 0.6016693115234375, "learning_rate": 8.743603689865039e-06, "loss": 0.438, "step": 1826 }, { "epoch": 1.538742279618192, "grad_norm": 0.43058109283447266, "learning_rate": 8.741653485743908e-06, "loss": 0.4151, "step": 1827 }, { "epoch": 1.5395845030881528, "grad_norm": 0.5541000962257385, "learning_rate": 8.739701987074827e-06, "loss": 0.4444, "step": 1828 }, { "epoch": 1.5404267265581133, "grad_norm": 0.4157347083091736, "learning_rate": 8.737749194532978e-06, "loss": 0.4238, "step": 1829 }, { "epoch": 1.541268950028074, "grad_norm": 0.49841147661209106, "learning_rate": 8.735795108793996e-06, "loss": 0.401, "step": 1830 }, { "epoch": 1.5421111734980348, "grad_norm": 0.48505377769470215, "learning_rate": 8.73383973053396e-06, "loss": 0.4505, "step": 1831 }, { "epoch": 1.5429533969679956, "grad_norm": 0.41846394538879395, "learning_rate": 8.731883060429396e-06, "loss": 0.3939, "step": 1832 }, { "epoch": 1.5437956204379562, "grad_norm": 0.57904452085495, "learning_rate": 8.729925099157281e-06, "loss": 0.4718, "step": 1833 }, { "epoch": 1.544637843907917, "grad_norm": 0.40922123193740845, "learning_rate": 8.727965847395035e-06, "loss": 0.4153, "step": 1834 }, { "epoch": 1.5454800673778775, "grad_norm": 0.5303305387496948, "learning_rate": 8.726005305820523e-06, "loss": 0.4602, "step": 1835 }, { "epoch": 1.5463222908478382, "grad_norm": 0.446688175201416, "learning_rate": 8.724043475112063e-06, "loss": 0.4197, "step": 1836 }, { "epoch": 1.547164514317799, "grad_norm": 0.44238898158073425, "learning_rate": 8.722080355948413e-06, "loss": 0.4381, "step": 1837 }, { "epoch": 1.5480067377877598, "grad_norm": 0.4388452470302582, "learning_rate": 8.720115949008776e-06, "loss": 0.4544, "step": 1838 }, { "epoch": 1.5488489612577205, "grad_norm": 0.4363900423049927, "learning_rate": 8.718150254972806e-06, "loss": 0.4322, "step": 1839 }, { "epoch": 1.549691184727681, "grad_norm": 0.49149712920188904, "learning_rate": 8.716183274520601e-06, "loss": 0.4342, "step": 1840 }, { "epoch": 1.5505334081976416, "grad_norm": 0.4077799916267395, "learning_rate": 8.7142150083327e-06, "loss": 0.4089, "step": 1841 }, { "epoch": 1.5513756316676024, "grad_norm": 0.46508705615997314, "learning_rate": 8.712245457090089e-06, "loss": 0.4139, "step": 1842 }, { "epoch": 1.5522178551375632, "grad_norm": 0.4017239511013031, "learning_rate": 8.7102746214742e-06, "loss": 0.4161, "step": 1843 }, { "epoch": 1.553060078607524, "grad_norm": 0.42796334624290466, "learning_rate": 8.708302502166908e-06, "loss": 0.4548, "step": 1844 }, { "epoch": 1.5539023020774847, "grad_norm": 0.44847917556762695, "learning_rate": 8.706329099850532e-06, "loss": 0.429, "step": 1845 }, { "epoch": 1.5547445255474452, "grad_norm": 0.38017165660858154, "learning_rate": 8.704354415207836e-06, "loss": 0.367, "step": 1846 }, { "epoch": 1.5555867490174058, "grad_norm": 0.39611223340034485, "learning_rate": 8.702378448922027e-06, "loss": 0.4427, "step": 1847 }, { "epoch": 1.5564289724873666, "grad_norm": 0.4916168451309204, "learning_rate": 8.700401201676756e-06, "loss": 0.456, "step": 1848 }, { "epoch": 1.5572711959573273, "grad_norm": 0.44334906339645386, "learning_rate": 8.698422674156114e-06, "loss": 0.4311, "step": 1849 }, { "epoch": 1.558113419427288, "grad_norm": 0.4443252682685852, "learning_rate": 8.696442867044643e-06, "loss": 0.4535, "step": 1850 }, { "epoch": 1.5589556428972489, "grad_norm": 0.4587916433811188, "learning_rate": 8.694461781027316e-06, "loss": 0.4255, "step": 1851 }, { "epoch": 1.5597978663672094, "grad_norm": 0.4282359182834625, "learning_rate": 8.69247941678956e-06, "loss": 0.3881, "step": 1852 }, { "epoch": 1.5606400898371702, "grad_norm": 0.5189934372901917, "learning_rate": 8.690495775017234e-06, "loss": 0.4665, "step": 1853 }, { "epoch": 1.5614823133071307, "grad_norm": 0.43406468629837036, "learning_rate": 8.68851085639665e-06, "loss": 0.4011, "step": 1854 }, { "epoch": 1.5623245367770915, "grad_norm": 0.4264048933982849, "learning_rate": 8.68652466161455e-06, "loss": 0.4171, "step": 1855 }, { "epoch": 1.5631667602470523, "grad_norm": 0.4250393211841583, "learning_rate": 8.684537191358127e-06, "loss": 0.4036, "step": 1856 }, { "epoch": 1.564008983717013, "grad_norm": 0.40937691926956177, "learning_rate": 8.68254844631501e-06, "loss": 0.4105, "step": 1857 }, { "epoch": 1.5648512071869736, "grad_norm": 0.4202471375465393, "learning_rate": 8.680558427173274e-06, "loss": 0.4337, "step": 1858 }, { "epoch": 1.5656934306569343, "grad_norm": 0.4566057026386261, "learning_rate": 8.678567134621425e-06, "loss": 0.4007, "step": 1859 }, { "epoch": 1.5665356541268949, "grad_norm": 0.43366217613220215, "learning_rate": 8.676574569348422e-06, "loss": 0.4473, "step": 1860 }, { "epoch": 1.5673778775968557, "grad_norm": 0.4611562192440033, "learning_rate": 8.674580732043656e-06, "loss": 0.4312, "step": 1861 }, { "epoch": 1.5682201010668164, "grad_norm": 0.5183505415916443, "learning_rate": 8.67258562339696e-06, "loss": 0.4233, "step": 1862 }, { "epoch": 1.5690623245367772, "grad_norm": 0.46538248658180237, "learning_rate": 8.670589244098608e-06, "loss": 0.4221, "step": 1863 }, { "epoch": 1.5699045480067377, "grad_norm": 0.475240558385849, "learning_rate": 8.668591594839315e-06, "loss": 0.4359, "step": 1864 }, { "epoch": 1.5707467714766985, "grad_norm": 0.4094696342945099, "learning_rate": 8.66659267631023e-06, "loss": 0.4415, "step": 1865 }, { "epoch": 1.571588994946659, "grad_norm": 0.46238774061203003, "learning_rate": 8.664592489202948e-06, "loss": 0.3836, "step": 1866 }, { "epoch": 1.5724312184166198, "grad_norm": 0.4615926444530487, "learning_rate": 8.662591034209495e-06, "loss": 0.4287, "step": 1867 }, { "epoch": 1.5732734418865806, "grad_norm": 0.5102584362030029, "learning_rate": 8.660588312022345e-06, "loss": 0.4648, "step": 1868 }, { "epoch": 1.5741156653565413, "grad_norm": 0.45844516158103943, "learning_rate": 8.6585843233344e-06, "loss": 0.4481, "step": 1869 }, { "epoch": 1.5749578888265021, "grad_norm": 0.40496957302093506, "learning_rate": 8.656579068839013e-06, "loss": 0.3924, "step": 1870 }, { "epoch": 1.5758001122964627, "grad_norm": 0.43425244092941284, "learning_rate": 8.65457254922996e-06, "loss": 0.4261, "step": 1871 }, { "epoch": 1.5766423357664232, "grad_norm": 0.4036092460155487, "learning_rate": 8.65256476520147e-06, "loss": 0.4207, "step": 1872 }, { "epoch": 1.577484559236384, "grad_norm": 0.4055071175098419, "learning_rate": 8.650555717448194e-06, "loss": 0.4266, "step": 1873 }, { "epoch": 1.5783267827063447, "grad_norm": 0.45530277490615845, "learning_rate": 8.648545406665233e-06, "loss": 0.445, "step": 1874 }, { "epoch": 1.5791690061763055, "grad_norm": 0.43252959847450256, "learning_rate": 8.64653383354812e-06, "loss": 0.4594, "step": 1875 }, { "epoch": 1.5800112296462663, "grad_norm": 0.37379810214042664, "learning_rate": 8.644520998792823e-06, "loss": 0.3841, "step": 1876 }, { "epoch": 1.5808534531162268, "grad_norm": 0.3765110671520233, "learning_rate": 8.64250690309575e-06, "loss": 0.402, "step": 1877 }, { "epoch": 1.5816956765861874, "grad_norm": 0.4393147826194763, "learning_rate": 8.640491547153741e-06, "loss": 0.4439, "step": 1878 }, { "epoch": 1.5825379000561481, "grad_norm": 0.4384205937385559, "learning_rate": 8.638474931664077e-06, "loss": 0.4544, "step": 1879 }, { "epoch": 1.583380123526109, "grad_norm": 0.4095999300479889, "learning_rate": 8.636457057324473e-06, "loss": 0.4101, "step": 1880 }, { "epoch": 1.5842223469960697, "grad_norm": 0.4512237310409546, "learning_rate": 8.634437924833075e-06, "loss": 0.4458, "step": 1881 }, { "epoch": 1.5850645704660304, "grad_norm": 0.4624376893043518, "learning_rate": 8.632417534888472e-06, "loss": 0.3889, "step": 1882 }, { "epoch": 1.585906793935991, "grad_norm": 0.46367138624191284, "learning_rate": 8.630395888189684e-06, "loss": 0.4068, "step": 1883 }, { "epoch": 1.5867490174059518, "grad_norm": 0.45756664872169495, "learning_rate": 8.628372985436164e-06, "loss": 0.4065, "step": 1884 }, { "epoch": 1.5875912408759123, "grad_norm": 0.4527446925640106, "learning_rate": 8.626348827327804e-06, "loss": 0.4249, "step": 1885 }, { "epoch": 1.588433464345873, "grad_norm": 0.4120786488056183, "learning_rate": 8.624323414564925e-06, "loss": 0.447, "step": 1886 }, { "epoch": 1.5892756878158338, "grad_norm": 0.5018812417984009, "learning_rate": 8.62229674784829e-06, "loss": 0.445, "step": 1887 }, { "epoch": 1.5901179112857946, "grad_norm": 0.41980451345443726, "learning_rate": 8.620268827879086e-06, "loss": 0.4201, "step": 1888 }, { "epoch": 1.5909601347557552, "grad_norm": 0.44902777671813965, "learning_rate": 8.618239655358939e-06, "loss": 0.407, "step": 1889 }, { "epoch": 1.591802358225716, "grad_norm": 0.5357686877250671, "learning_rate": 8.616209230989912e-06, "loss": 0.4391, "step": 1890 }, { "epoch": 1.5926445816956765, "grad_norm": 0.39781653881073, "learning_rate": 8.614177555474491e-06, "loss": 0.3904, "step": 1891 }, { "epoch": 1.5934868051656372, "grad_norm": 0.5472583770751953, "learning_rate": 8.612144629515609e-06, "loss": 0.4888, "step": 1892 }, { "epoch": 1.594329028635598, "grad_norm": 0.48200172185897827, "learning_rate": 8.610110453816616e-06, "loss": 0.4211, "step": 1893 }, { "epoch": 1.5951712521055588, "grad_norm": 0.44481077790260315, "learning_rate": 8.608075029081304e-06, "loss": 0.3938, "step": 1894 }, { "epoch": 1.5960134755755195, "grad_norm": 0.4450471103191376, "learning_rate": 8.606038356013896e-06, "loss": 0.4285, "step": 1895 }, { "epoch": 1.59685569904548, "grad_norm": 0.5440077781677246, "learning_rate": 8.604000435319047e-06, "loss": 0.408, "step": 1896 }, { "epoch": 1.5976979225154406, "grad_norm": 0.4059353470802307, "learning_rate": 8.601961267701839e-06, "loss": 0.4229, "step": 1897 }, { "epoch": 1.5985401459854014, "grad_norm": 0.4133564233779907, "learning_rate": 8.599920853867793e-06, "loss": 0.3881, "step": 1898 }, { "epoch": 1.5993823694553622, "grad_norm": 0.4858034551143646, "learning_rate": 8.597879194522856e-06, "loss": 0.4461, "step": 1899 }, { "epoch": 1.600224592925323, "grad_norm": 0.3972078263759613, "learning_rate": 8.595836290373406e-06, "loss": 0.4265, "step": 1900 }, { "epoch": 1.6010668163952837, "grad_norm": 0.5276329517364502, "learning_rate": 8.593792142126254e-06, "loss": 0.4235, "step": 1901 }, { "epoch": 1.6019090398652442, "grad_norm": 0.48380178213119507, "learning_rate": 8.591746750488639e-06, "loss": 0.4036, "step": 1902 }, { "epoch": 1.6027512633352048, "grad_norm": 0.4269721806049347, "learning_rate": 8.589700116168231e-06, "loss": 0.4261, "step": 1903 }, { "epoch": 1.6035934868051656, "grad_norm": 0.3954969346523285, "learning_rate": 8.587652239873135e-06, "loss": 0.398, "step": 1904 }, { "epoch": 1.6044357102751263, "grad_norm": 0.4630894362926483, "learning_rate": 8.585603122311874e-06, "loss": 0.4771, "step": 1905 }, { "epoch": 1.605277933745087, "grad_norm": 0.4481334984302521, "learning_rate": 8.583552764193412e-06, "loss": 0.4491, "step": 1906 }, { "epoch": 1.6061201572150479, "grad_norm": 0.36959418654441833, "learning_rate": 8.581501166227138e-06, "loss": 0.4167, "step": 1907 }, { "epoch": 1.6069623806850084, "grad_norm": 0.4710712730884552, "learning_rate": 8.579448329122869e-06, "loss": 0.419, "step": 1908 }, { "epoch": 1.607804604154969, "grad_norm": 0.5517801642417908, "learning_rate": 8.57739425359085e-06, "loss": 0.4709, "step": 1909 }, { "epoch": 1.6086468276249297, "grad_norm": 0.38991913199424744, "learning_rate": 8.575338940341758e-06, "loss": 0.4171, "step": 1910 }, { "epoch": 1.6094890510948905, "grad_norm": 0.5322321057319641, "learning_rate": 8.573282390086694e-06, "loss": 0.413, "step": 1911 }, { "epoch": 1.6103312745648513, "grad_norm": 0.5535772442817688, "learning_rate": 8.57122460353719e-06, "loss": 0.4363, "step": 1912 }, { "epoch": 1.611173498034812, "grad_norm": 0.4126940071582794, "learning_rate": 8.569165581405206e-06, "loss": 0.4453, "step": 1913 }, { "epoch": 1.6120157215047726, "grad_norm": 0.5122808218002319, "learning_rate": 8.567105324403126e-06, "loss": 0.4515, "step": 1914 }, { "epoch": 1.6128579449747333, "grad_norm": 0.4976341724395752, "learning_rate": 8.565043833243767e-06, "loss": 0.4375, "step": 1915 }, { "epoch": 1.6137001684446939, "grad_norm": 0.43737298250198364, "learning_rate": 8.562981108640367e-06, "loss": 0.4112, "step": 1916 }, { "epoch": 1.6145423919146547, "grad_norm": 0.5141275525093079, "learning_rate": 8.560917151306594e-06, "loss": 0.4125, "step": 1917 }, { "epoch": 1.6153846153846154, "grad_norm": 0.5293352007865906, "learning_rate": 8.558851961956542e-06, "loss": 0.4093, "step": 1918 }, { "epoch": 1.6162268388545762, "grad_norm": 0.410214364528656, "learning_rate": 8.556785541304731e-06, "loss": 0.3998, "step": 1919 }, { "epoch": 1.6170690623245367, "grad_norm": 0.5817462801933289, "learning_rate": 8.554717890066107e-06, "loss": 0.4355, "step": 1920 }, { "epoch": 1.6179112857944975, "grad_norm": 0.4895290732383728, "learning_rate": 8.552649008956043e-06, "loss": 0.4161, "step": 1921 }, { "epoch": 1.618753509264458, "grad_norm": 0.5137314796447754, "learning_rate": 8.550578898690333e-06, "loss": 0.4606, "step": 1922 }, { "epoch": 1.6195957327344188, "grad_norm": 0.46706530451774597, "learning_rate": 8.548507559985203e-06, "loss": 0.4121, "step": 1923 }, { "epoch": 1.6204379562043796, "grad_norm": 0.5067141056060791, "learning_rate": 8.5464349935573e-06, "loss": 0.4366, "step": 1924 }, { "epoch": 1.6212801796743403, "grad_norm": 0.47393152117729187, "learning_rate": 8.544361200123696e-06, "loss": 0.4407, "step": 1925 }, { "epoch": 1.6221224031443011, "grad_norm": 0.3918601870536804, "learning_rate": 8.542286180401888e-06, "loss": 0.4203, "step": 1926 }, { "epoch": 1.6229646266142617, "grad_norm": 0.43134209513664246, "learning_rate": 8.540209935109798e-06, "loss": 0.3924, "step": 1927 }, { "epoch": 1.6238068500842222, "grad_norm": 0.5417711138725281, "learning_rate": 8.53813246496577e-06, "loss": 0.4632, "step": 1928 }, { "epoch": 1.624649073554183, "grad_norm": 0.38196757435798645, "learning_rate": 8.536053770688574e-06, "loss": 0.367, "step": 1929 }, { "epoch": 1.6254912970241437, "grad_norm": 0.5047089457511902, "learning_rate": 8.533973852997402e-06, "loss": 0.4606, "step": 1930 }, { "epoch": 1.6263335204941045, "grad_norm": 0.3984459638595581, "learning_rate": 8.53189271261187e-06, "loss": 0.3892, "step": 1931 }, { "epoch": 1.6271757439640653, "grad_norm": 0.48510128259658813, "learning_rate": 8.529810350252018e-06, "loss": 0.4227, "step": 1932 }, { "epoch": 1.6280179674340258, "grad_norm": 0.41683727502822876, "learning_rate": 8.527726766638305e-06, "loss": 0.4515, "step": 1933 }, { "epoch": 1.6288601909039864, "grad_norm": 0.5247741937637329, "learning_rate": 8.52564196249162e-06, "loss": 0.4354, "step": 1934 }, { "epoch": 1.6297024143739471, "grad_norm": 0.4704350531101227, "learning_rate": 8.523555938533262e-06, "loss": 0.3917, "step": 1935 }, { "epoch": 1.630544637843908, "grad_norm": 0.40774402022361755, "learning_rate": 8.521468695484967e-06, "loss": 0.4106, "step": 1936 }, { "epoch": 1.6313868613138687, "grad_norm": 0.4450548589229584, "learning_rate": 8.51938023406888e-06, "loss": 0.3697, "step": 1937 }, { "epoch": 1.6322290847838294, "grad_norm": 0.47571325302124023, "learning_rate": 8.517290555007578e-06, "loss": 0.4654, "step": 1938 }, { "epoch": 1.63307130825379, "grad_norm": 0.37967485189437866, "learning_rate": 8.515199659024049e-06, "loss": 0.4025, "step": 1939 }, { "epoch": 1.6339135317237508, "grad_norm": 0.49356693029403687, "learning_rate": 8.513107546841708e-06, "loss": 0.4376, "step": 1940 }, { "epoch": 1.6347557551937113, "grad_norm": 0.40577614307403564, "learning_rate": 8.511014219184395e-06, "loss": 0.4186, "step": 1941 }, { "epoch": 1.635597978663672, "grad_norm": 0.4078255593776703, "learning_rate": 8.508919676776358e-06, "loss": 0.4148, "step": 1942 }, { "epoch": 1.6364402021336328, "grad_norm": 0.44012075662612915, "learning_rate": 8.50682392034228e-06, "loss": 0.4788, "step": 1943 }, { "epoch": 1.6372824256035936, "grad_norm": 0.3738163411617279, "learning_rate": 8.504726950607251e-06, "loss": 0.371, "step": 1944 }, { "epoch": 1.6381246490735542, "grad_norm": 0.4548744857311249, "learning_rate": 8.502628768296788e-06, "loss": 0.4705, "step": 1945 }, { "epoch": 1.638966872543515, "grad_norm": 0.39190298318862915, "learning_rate": 8.50052937413683e-06, "loss": 0.4285, "step": 1946 }, { "epoch": 1.6398090960134755, "grad_norm": 0.4169197380542755, "learning_rate": 8.498428768853725e-06, "loss": 0.3949, "step": 1947 }, { "epoch": 1.6406513194834362, "grad_norm": 0.503379762172699, "learning_rate": 8.496326953174253e-06, "loss": 0.3858, "step": 1948 }, { "epoch": 1.641493542953397, "grad_norm": 0.4443649351596832, "learning_rate": 8.494223927825601e-06, "loss": 0.4312, "step": 1949 }, { "epoch": 1.6423357664233578, "grad_norm": 0.4543350040912628, "learning_rate": 8.492119693535383e-06, "loss": 0.4429, "step": 1950 }, { "epoch": 1.6431779898933183, "grad_norm": 0.4795394539833069, "learning_rate": 8.490014251031626e-06, "loss": 0.4054, "step": 1951 }, { "epoch": 1.644020213363279, "grad_norm": 0.46229133009910583, "learning_rate": 8.487907601042778e-06, "loss": 0.4321, "step": 1952 }, { "epoch": 1.6448624368332396, "grad_norm": 0.4463028013706207, "learning_rate": 8.485799744297702e-06, "loss": 0.4352, "step": 1953 }, { "epoch": 1.6457046603032004, "grad_norm": 0.46504998207092285, "learning_rate": 8.483690681525683e-06, "loss": 0.4006, "step": 1954 }, { "epoch": 1.6465468837731612, "grad_norm": 0.3970571458339691, "learning_rate": 8.481580413456418e-06, "loss": 0.363, "step": 1955 }, { "epoch": 1.647389107243122, "grad_norm": 0.4370676279067993, "learning_rate": 8.479468940820026e-06, "loss": 0.4592, "step": 1956 }, { "epoch": 1.6482313307130827, "grad_norm": 0.5025774240493774, "learning_rate": 8.47735626434704e-06, "loss": 0.3884, "step": 1957 }, { "epoch": 1.6490735541830432, "grad_norm": 0.4099838137626648, "learning_rate": 8.475242384768407e-06, "loss": 0.4344, "step": 1958 }, { "epoch": 1.6499157776530038, "grad_norm": 0.40673449635505676, "learning_rate": 8.473127302815497e-06, "loss": 0.4443, "step": 1959 }, { "epoch": 1.6507580011229646, "grad_norm": 0.4340780973434448, "learning_rate": 8.47101101922009e-06, "loss": 0.4453, "step": 1960 }, { "epoch": 1.6516002245929253, "grad_norm": 0.4677739143371582, "learning_rate": 8.468893534714382e-06, "loss": 0.4295, "step": 1961 }, { "epoch": 1.652442448062886, "grad_norm": 0.4416394531726837, "learning_rate": 8.46677485003099e-06, "loss": 0.4008, "step": 1962 }, { "epoch": 1.6532846715328469, "grad_norm": 0.4299498200416565, "learning_rate": 8.46465496590294e-06, "loss": 0.4606, "step": 1963 }, { "epoch": 1.6541268950028074, "grad_norm": 0.43649721145629883, "learning_rate": 8.462533883063678e-06, "loss": 0.387, "step": 1964 }, { "epoch": 1.654969118472768, "grad_norm": 0.5068464875221252, "learning_rate": 8.46041160224706e-06, "loss": 0.4303, "step": 1965 }, { "epoch": 1.6558113419427287, "grad_norm": 0.38220757246017456, "learning_rate": 8.45828812418736e-06, "loss": 0.4393, "step": 1966 }, { "epoch": 1.6566535654126895, "grad_norm": 0.5016233325004578, "learning_rate": 8.456163449619265e-06, "loss": 0.4638, "step": 1967 }, { "epoch": 1.6574957888826503, "grad_norm": 0.4304533898830414, "learning_rate": 8.454037579277874e-06, "loss": 0.3781, "step": 1968 }, { "epoch": 1.658338012352611, "grad_norm": 0.46218544244766235, "learning_rate": 8.451910513898704e-06, "loss": 0.4759, "step": 1969 }, { "epoch": 1.6591802358225716, "grad_norm": 0.4024166762828827, "learning_rate": 8.449782254217682e-06, "loss": 0.4092, "step": 1970 }, { "epoch": 1.6600224592925323, "grad_norm": 0.44259995222091675, "learning_rate": 8.44765280097115e-06, "loss": 0.4077, "step": 1971 }, { "epoch": 1.6608646827624929, "grad_norm": 0.39786458015441895, "learning_rate": 8.445522154895864e-06, "loss": 0.4098, "step": 1972 }, { "epoch": 1.6617069062324537, "grad_norm": 0.4788183271884918, "learning_rate": 8.443390316728987e-06, "loss": 0.4439, "step": 1973 }, { "epoch": 1.6625491297024144, "grad_norm": 0.3903910219669342, "learning_rate": 8.4412572872081e-06, "loss": 0.4012, "step": 1974 }, { "epoch": 1.6633913531723752, "grad_norm": 0.4585959017276764, "learning_rate": 8.439123067071196e-06, "loss": 0.4218, "step": 1975 }, { "epoch": 1.6642335766423357, "grad_norm": 0.44170159101486206, "learning_rate": 8.436987657056678e-06, "loss": 0.4128, "step": 1976 }, { "epoch": 1.6650758001122965, "grad_norm": 0.4915020167827606, "learning_rate": 8.43485105790336e-06, "loss": 0.4441, "step": 1977 }, { "epoch": 1.665918023582257, "grad_norm": 0.43701839447021484, "learning_rate": 8.43271327035047e-06, "loss": 0.4245, "step": 1978 }, { "epoch": 1.6667602470522178, "grad_norm": 0.45055702328681946, "learning_rate": 8.430574295137647e-06, "loss": 0.3499, "step": 1979 }, { "epoch": 1.6676024705221786, "grad_norm": 0.48781394958496094, "learning_rate": 8.428434133004937e-06, "loss": 0.4575, "step": 1980 }, { "epoch": 1.6684446939921393, "grad_norm": 0.4962453842163086, "learning_rate": 8.4262927846928e-06, "loss": 0.4438, "step": 1981 }, { "epoch": 1.6692869174621, "grad_norm": 0.4676836431026459, "learning_rate": 8.424150250942108e-06, "loss": 0.4473, "step": 1982 }, { "epoch": 1.6701291409320607, "grad_norm": 0.4506339430809021, "learning_rate": 8.42200653249414e-06, "loss": 0.4055, "step": 1983 }, { "epoch": 1.6709713644020212, "grad_norm": 0.4534107446670532, "learning_rate": 8.419861630090583e-06, "loss": 0.4432, "step": 1984 }, { "epoch": 1.671813587871982, "grad_norm": 0.4198240637779236, "learning_rate": 8.41771554447354e-06, "loss": 0.4013, "step": 1985 }, { "epoch": 1.6726558113419427, "grad_norm": 0.4855913519859314, "learning_rate": 8.415568276385518e-06, "loss": 0.4395, "step": 1986 }, { "epoch": 1.6734980348119035, "grad_norm": 0.4930519163608551, "learning_rate": 8.413419826569436e-06, "loss": 0.4484, "step": 1987 }, { "epoch": 1.6743402582818643, "grad_norm": 0.45919734239578247, "learning_rate": 8.411270195768618e-06, "loss": 0.4629, "step": 1988 }, { "epoch": 1.6751824817518248, "grad_norm": 0.4565297067165375, "learning_rate": 8.409119384726806e-06, "loss": 0.4104, "step": 1989 }, { "epoch": 1.6760247052217854, "grad_norm": 0.5075744390487671, "learning_rate": 8.406967394188135e-06, "loss": 0.4487, "step": 1990 }, { "epoch": 1.6768669286917461, "grad_norm": 0.44805198907852173, "learning_rate": 8.404814224897162e-06, "loss": 0.4353, "step": 1991 }, { "epoch": 1.677709152161707, "grad_norm": 0.4158684313297272, "learning_rate": 8.402659877598847e-06, "loss": 0.3806, "step": 1992 }, { "epoch": 1.6785513756316677, "grad_norm": 0.4528931677341461, "learning_rate": 8.400504353038556e-06, "loss": 0.4255, "step": 1993 }, { "epoch": 1.6793935991016284, "grad_norm": 0.4640510678291321, "learning_rate": 8.398347651962065e-06, "loss": 0.3935, "step": 1994 }, { "epoch": 1.680235822571589, "grad_norm": 0.40863776206970215, "learning_rate": 8.396189775115551e-06, "loss": 0.3867, "step": 1995 }, { "epoch": 1.6810780460415495, "grad_norm": 0.46197953820228577, "learning_rate": 8.394030723245608e-06, "loss": 0.4666, "step": 1996 }, { "epoch": 1.6819202695115103, "grad_norm": 0.4470638632774353, "learning_rate": 8.391870497099228e-06, "loss": 0.4056, "step": 1997 }, { "epoch": 1.682762492981471, "grad_norm": 0.46883779764175415, "learning_rate": 8.389709097423812e-06, "loss": 0.4497, "step": 1998 }, { "epoch": 1.6836047164514318, "grad_norm": 0.38233381509780884, "learning_rate": 8.38754652496717e-06, "loss": 0.3814, "step": 1999 }, { "epoch": 1.6844469399213926, "grad_norm": 0.4384881854057312, "learning_rate": 8.385382780477511e-06, "loss": 0.4435, "step": 2000 }, { "epoch": 1.6852891633913532, "grad_norm": 0.440840482711792, "learning_rate": 8.383217864703457e-06, "loss": 0.4371, "step": 2001 }, { "epoch": 1.686131386861314, "grad_norm": 0.4086010456085205, "learning_rate": 8.38105177839403e-06, "loss": 0.4056, "step": 2002 }, { "epoch": 1.6869736103312745, "grad_norm": 0.4146847426891327, "learning_rate": 8.378884522298659e-06, "loss": 0.4488, "step": 2003 }, { "epoch": 1.6878158338012352, "grad_norm": 0.41414108872413635, "learning_rate": 8.376716097167176e-06, "loss": 0.4296, "step": 2004 }, { "epoch": 1.688658057271196, "grad_norm": 0.43544501066207886, "learning_rate": 8.374546503749821e-06, "loss": 0.4349, "step": 2005 }, { "epoch": 1.6895002807411568, "grad_norm": 0.3628418743610382, "learning_rate": 8.372375742797235e-06, "loss": 0.3822, "step": 2006 }, { "epoch": 1.6903425042111173, "grad_norm": 0.43000245094299316, "learning_rate": 8.370203815060465e-06, "loss": 0.4172, "step": 2007 }, { "epoch": 1.691184727681078, "grad_norm": 0.40139099955558777, "learning_rate": 8.368030721290961e-06, "loss": 0.4071, "step": 2008 }, { "epoch": 1.6920269511510386, "grad_norm": 0.4507725238800049, "learning_rate": 8.365856462240574e-06, "loss": 0.4249, "step": 2009 }, { "epoch": 1.6928691746209994, "grad_norm": 0.46088388562202454, "learning_rate": 8.363681038661562e-06, "loss": 0.4383, "step": 2010 }, { "epoch": 1.6937113980909602, "grad_norm": 0.4555124342441559, "learning_rate": 8.361504451306585e-06, "loss": 0.4214, "step": 2011 }, { "epoch": 1.694553621560921, "grad_norm": 0.4617425203323364, "learning_rate": 8.359326700928703e-06, "loss": 0.4351, "step": 2012 }, { "epoch": 1.6953958450308815, "grad_norm": 0.5048226714134216, "learning_rate": 8.357147788281382e-06, "loss": 0.4366, "step": 2013 }, { "epoch": 1.6962380685008422, "grad_norm": 0.4140453338623047, "learning_rate": 8.354967714118487e-06, "loss": 0.4073, "step": 2014 }, { "epoch": 1.6970802919708028, "grad_norm": 0.49135303497314453, "learning_rate": 8.352786479194288e-06, "loss": 0.4485, "step": 2015 }, { "epoch": 1.6979225154407636, "grad_norm": 0.4075947403907776, "learning_rate": 8.350604084263453e-06, "loss": 0.381, "step": 2016 }, { "epoch": 1.6987647389107243, "grad_norm": 0.44698360562324524, "learning_rate": 8.348420530081054e-06, "loss": 0.4623, "step": 2017 }, { "epoch": 1.699606962380685, "grad_norm": 0.406100332736969, "learning_rate": 8.346235817402563e-06, "loss": 0.4009, "step": 2018 }, { "epoch": 1.7004491858506459, "grad_norm": 0.395178884267807, "learning_rate": 8.344049946983854e-06, "loss": 0.4472, "step": 2019 }, { "epoch": 1.7012914093206064, "grad_norm": 0.4397771656513214, "learning_rate": 8.3418629195812e-06, "loss": 0.4194, "step": 2020 }, { "epoch": 1.702133632790567, "grad_norm": 0.39724308252334595, "learning_rate": 8.339674735951276e-06, "loss": 0.4161, "step": 2021 }, { "epoch": 1.7029758562605277, "grad_norm": 0.39679238200187683, "learning_rate": 8.337485396851155e-06, "loss": 0.3687, "step": 2022 }, { "epoch": 1.7038180797304885, "grad_norm": 0.48304101824760437, "learning_rate": 8.335294903038308e-06, "loss": 0.4558, "step": 2023 }, { "epoch": 1.7046603032004493, "grad_norm": 0.40437957644462585, "learning_rate": 8.333103255270616e-06, "loss": 0.4468, "step": 2024 }, { "epoch": 1.70550252667041, "grad_norm": 0.4358759820461273, "learning_rate": 8.330910454306344e-06, "loss": 0.4038, "step": 2025 }, { "epoch": 1.7063447501403706, "grad_norm": 0.39024776220321655, "learning_rate": 8.328716500904168e-06, "loss": 0.4266, "step": 2026 }, { "epoch": 1.7071869736103311, "grad_norm": 0.4968058168888092, "learning_rate": 8.326521395823155e-06, "loss": 0.4485, "step": 2027 }, { "epoch": 1.7080291970802919, "grad_norm": 0.39932191371917725, "learning_rate": 8.324325139822776e-06, "loss": 0.4186, "step": 2028 }, { "epoch": 1.7088714205502527, "grad_norm": 0.3864066004753113, "learning_rate": 8.322127733662896e-06, "loss": 0.382, "step": 2029 }, { "epoch": 1.7097136440202134, "grad_norm": 0.518562376499176, "learning_rate": 8.319929178103782e-06, "loss": 0.4293, "step": 2030 }, { "epoch": 1.7105558674901742, "grad_norm": 0.4214074909687042, "learning_rate": 8.317729473906097e-06, "loss": 0.4298, "step": 2031 }, { "epoch": 1.7113980909601347, "grad_norm": 0.4226950407028198, "learning_rate": 8.315528621830897e-06, "loss": 0.4475, "step": 2032 }, { "epoch": 1.7122403144300955, "grad_norm": 0.4413611590862274, "learning_rate": 8.313326622639644e-06, "loss": 0.4052, "step": 2033 }, { "epoch": 1.713082537900056, "grad_norm": 0.4176136255264282, "learning_rate": 8.311123477094188e-06, "loss": 0.4198, "step": 2034 }, { "epoch": 1.7139247613700168, "grad_norm": 0.5934779047966003, "learning_rate": 8.308919185956781e-06, "loss": 0.4752, "step": 2035 }, { "epoch": 1.7147669848399776, "grad_norm": 0.43099042773246765, "learning_rate": 8.306713749990072e-06, "loss": 0.4077, "step": 2036 }, { "epoch": 1.7156092083099383, "grad_norm": 0.4502543807029724, "learning_rate": 8.3045071699571e-06, "loss": 0.4082, "step": 2037 }, { "epoch": 1.716451431779899, "grad_norm": 0.527538537979126, "learning_rate": 8.302299446621309e-06, "loss": 0.4624, "step": 2038 }, { "epoch": 1.7172936552498597, "grad_norm": 0.3779890537261963, "learning_rate": 8.300090580746529e-06, "loss": 0.4097, "step": 2039 }, { "epoch": 1.7181358787198202, "grad_norm": 0.45578092336654663, "learning_rate": 8.29788057309699e-06, "loss": 0.4157, "step": 2040 }, { "epoch": 1.718978102189781, "grad_norm": 0.47460421919822693, "learning_rate": 8.295669424437318e-06, "loss": 0.4417, "step": 2041 }, { "epoch": 1.7198203256597417, "grad_norm": 0.4312497675418854, "learning_rate": 8.293457135532534e-06, "loss": 0.4794, "step": 2042 }, { "epoch": 1.7206625491297025, "grad_norm": 0.45490846037864685, "learning_rate": 8.291243707148048e-06, "loss": 0.4632, "step": 2043 }, { "epoch": 1.721504772599663, "grad_norm": 0.3855568468570709, "learning_rate": 8.289029140049673e-06, "loss": 0.4003, "step": 2044 }, { "epoch": 1.7223469960696238, "grad_norm": 0.46640142798423767, "learning_rate": 8.286813435003608e-06, "loss": 0.3971, "step": 2045 }, { "epoch": 1.7231892195395844, "grad_norm": 0.42425939440727234, "learning_rate": 8.284596592776451e-06, "loss": 0.4337, "step": 2046 }, { "epoch": 1.7240314430095451, "grad_norm": 0.43880191445350647, "learning_rate": 8.282378614135191e-06, "loss": 0.4404, "step": 2047 }, { "epoch": 1.724873666479506, "grad_norm": 0.4021606743335724, "learning_rate": 8.280159499847207e-06, "loss": 0.3893, "step": 2048 }, { "epoch": 1.7257158899494667, "grad_norm": 0.461506724357605, "learning_rate": 8.27793925068028e-06, "loss": 0.4692, "step": 2049 }, { "epoch": 1.7265581134194274, "grad_norm": 0.39428791403770447, "learning_rate": 8.275717867402574e-06, "loss": 0.388, "step": 2050 }, { "epoch": 1.727400336889388, "grad_norm": 0.42778825759887695, "learning_rate": 8.273495350782652e-06, "loss": 0.4112, "step": 2051 }, { "epoch": 1.7282425603593485, "grad_norm": 0.44944852590560913, "learning_rate": 8.271271701589464e-06, "loss": 0.4081, "step": 2052 }, { "epoch": 1.7290847838293093, "grad_norm": 0.4252696931362152, "learning_rate": 8.269046920592359e-06, "loss": 0.4076, "step": 2053 }, { "epoch": 1.72992700729927, "grad_norm": 0.41070038080215454, "learning_rate": 8.26682100856107e-06, "loss": 0.4592, "step": 2054 }, { "epoch": 1.7307692307692308, "grad_norm": 0.42233511805534363, "learning_rate": 8.264593966265724e-06, "loss": 0.4084, "step": 2055 }, { "epoch": 1.7316114542391916, "grad_norm": 0.47488483786582947, "learning_rate": 8.262365794476838e-06, "loss": 0.4536, "step": 2056 }, { "epoch": 1.7324536777091522, "grad_norm": 0.456987589597702, "learning_rate": 8.260136493965326e-06, "loss": 0.4651, "step": 2057 }, { "epoch": 1.7332959011791127, "grad_norm": 0.41747692227363586, "learning_rate": 8.257906065502483e-06, "loss": 0.3958, "step": 2058 }, { "epoch": 1.7341381246490735, "grad_norm": 0.43954572081565857, "learning_rate": 8.255674509860004e-06, "loss": 0.4412, "step": 2059 }, { "epoch": 1.7349803481190342, "grad_norm": 0.38647621870040894, "learning_rate": 8.253441827809965e-06, "loss": 0.4282, "step": 2060 }, { "epoch": 1.735822571588995, "grad_norm": 0.5023011565208435, "learning_rate": 8.251208020124837e-06, "loss": 0.462, "step": 2061 }, { "epoch": 1.7366647950589558, "grad_norm": 0.447698712348938, "learning_rate": 8.248973087577477e-06, "loss": 0.4146, "step": 2062 }, { "epoch": 1.7375070185289163, "grad_norm": 0.4024762511253357, "learning_rate": 8.246737030941137e-06, "loss": 0.4066, "step": 2063 }, { "epoch": 1.738349241998877, "grad_norm": 0.4650697708129883, "learning_rate": 8.244499850989453e-06, "loss": 0.4039, "step": 2064 }, { "epoch": 1.7391914654688376, "grad_norm": 0.5101742148399353, "learning_rate": 8.24226154849645e-06, "loss": 0.4384, "step": 2065 }, { "epoch": 1.7400336889387984, "grad_norm": 0.42523935437202454, "learning_rate": 8.240022124236543e-06, "loss": 0.4269, "step": 2066 }, { "epoch": 1.7408759124087592, "grad_norm": 0.5009523034095764, "learning_rate": 8.237781578984534e-06, "loss": 0.4104, "step": 2067 }, { "epoch": 1.74171813587872, "grad_norm": 0.42771783471107483, "learning_rate": 8.235539913515612e-06, "loss": 0.423, "step": 2068 }, { "epoch": 1.7425603593486805, "grad_norm": 0.41776803135871887, "learning_rate": 8.233297128605358e-06, "loss": 0.435, "step": 2069 }, { "epoch": 1.7434025828186412, "grad_norm": 0.43918198347091675, "learning_rate": 8.231053225029735e-06, "loss": 0.4351, "step": 2070 }, { "epoch": 1.7442448062886018, "grad_norm": 0.41149473190307617, "learning_rate": 8.228808203565096e-06, "loss": 0.3849, "step": 2071 }, { "epoch": 1.7450870297585626, "grad_norm": 0.404376745223999, "learning_rate": 8.226562064988179e-06, "loss": 0.4401, "step": 2072 }, { "epoch": 1.7459292532285233, "grad_norm": 0.4018196761608124, "learning_rate": 8.224314810076109e-06, "loss": 0.4187, "step": 2073 }, { "epoch": 1.746771476698484, "grad_norm": 0.5398818850517273, "learning_rate": 8.2220664396064e-06, "loss": 0.4766, "step": 2074 }, { "epoch": 1.7476137001684446, "grad_norm": 0.4021288752555847, "learning_rate": 8.219816954356947e-06, "loss": 0.4423, "step": 2075 }, { "epoch": 1.7484559236384054, "grad_norm": 0.43011555075645447, "learning_rate": 8.217566355106035e-06, "loss": 0.4246, "step": 2076 }, { "epoch": 1.749298147108366, "grad_norm": 0.398597776889801, "learning_rate": 8.215314642632332e-06, "loss": 0.3799, "step": 2077 }, { "epoch": 1.7501403705783267, "grad_norm": 0.40762796998023987, "learning_rate": 8.213061817714893e-06, "loss": 0.4236, "step": 2078 }, { "epoch": 1.7509825940482875, "grad_norm": 0.4497831165790558, "learning_rate": 8.210807881133157e-06, "loss": 0.4372, "step": 2079 }, { "epoch": 1.7518248175182483, "grad_norm": 0.41671937704086304, "learning_rate": 8.208552833666945e-06, "loss": 0.3907, "step": 2080 }, { "epoch": 1.752667040988209, "grad_norm": 0.48451733589172363, "learning_rate": 8.206296676096466e-06, "loss": 0.4144, "step": 2081 }, { "epoch": 1.7535092644581696, "grad_norm": 0.4212430417537689, "learning_rate": 8.204039409202311e-06, "loss": 0.4491, "step": 2082 }, { "epoch": 1.7543514879281301, "grad_norm": 0.4624173641204834, "learning_rate": 8.201781033765459e-06, "loss": 0.4277, "step": 2083 }, { "epoch": 1.7551937113980909, "grad_norm": 0.5076501965522766, "learning_rate": 8.199521550567266e-06, "loss": 0.4243, "step": 2084 }, { "epoch": 1.7560359348680517, "grad_norm": 0.4105527400970459, "learning_rate": 8.197260960389475e-06, "loss": 0.4411, "step": 2085 }, { "epoch": 1.7568781583380124, "grad_norm": 0.5478642582893372, "learning_rate": 8.194999264014211e-06, "loss": 0.4689, "step": 2086 }, { "epoch": 1.7577203818079732, "grad_norm": 0.4042428135871887, "learning_rate": 8.192736462223984e-06, "loss": 0.4219, "step": 2087 }, { "epoch": 1.7585626052779337, "grad_norm": 0.4223308265209198, "learning_rate": 8.190472555801682e-06, "loss": 0.3917, "step": 2088 }, { "epoch": 1.7594048287478943, "grad_norm": 0.4825360178947449, "learning_rate": 8.18820754553058e-06, "loss": 0.429, "step": 2089 }, { "epoch": 1.760247052217855, "grad_norm": 0.4166293740272522, "learning_rate": 8.185941432194332e-06, "loss": 0.3547, "step": 2090 }, { "epoch": 1.7610892756878158, "grad_norm": 0.47624221444129944, "learning_rate": 8.183674216576975e-06, "loss": 0.4726, "step": 2091 }, { "epoch": 1.7619314991577766, "grad_norm": 0.481217622756958, "learning_rate": 8.181405899462926e-06, "loss": 0.4338, "step": 2092 }, { "epoch": 1.7627737226277373, "grad_norm": 0.49877381324768066, "learning_rate": 8.179136481636984e-06, "loss": 0.4254, "step": 2093 }, { "epoch": 1.763615946097698, "grad_norm": 0.41483259201049805, "learning_rate": 8.176865963884328e-06, "loss": 0.4076, "step": 2094 }, { "epoch": 1.7644581695676587, "grad_norm": 0.4152453541755676, "learning_rate": 8.174594346990518e-06, "loss": 0.4305, "step": 2095 }, { "epoch": 1.7653003930376192, "grad_norm": 0.46550074219703674, "learning_rate": 8.172321631741498e-06, "loss": 0.4232, "step": 2096 }, { "epoch": 1.76614261650758, "grad_norm": 0.41888606548309326, "learning_rate": 8.170047818923583e-06, "loss": 0.4134, "step": 2097 }, { "epoch": 1.7669848399775407, "grad_norm": 0.4257478415966034, "learning_rate": 8.167772909323477e-06, "loss": 0.471, "step": 2098 }, { "epoch": 1.7678270634475015, "grad_norm": 0.4296936094760895, "learning_rate": 8.16549690372826e-06, "loss": 0.4031, "step": 2099 }, { "epoch": 1.768669286917462, "grad_norm": 0.3816126585006714, "learning_rate": 8.163219802925389e-06, "loss": 0.3788, "step": 2100 }, { "epoch": 1.7695115103874228, "grad_norm": 0.43066343665122986, "learning_rate": 8.160941607702701e-06, "loss": 0.409, "step": 2101 }, { "epoch": 1.7703537338573834, "grad_norm": 0.44402164220809937, "learning_rate": 8.158662318848416e-06, "loss": 0.4194, "step": 2102 }, { "epoch": 1.7711959573273441, "grad_norm": 0.6020657420158386, "learning_rate": 8.156381937151125e-06, "loss": 0.4198, "step": 2103 }, { "epoch": 1.772038180797305, "grad_norm": 0.39206069707870483, "learning_rate": 8.154100463399805e-06, "loss": 0.3804, "step": 2104 }, { "epoch": 1.7728804042672657, "grad_norm": 0.4444130063056946, "learning_rate": 8.151817898383803e-06, "loss": 0.4146, "step": 2105 }, { "epoch": 1.7737226277372264, "grad_norm": 0.5045167803764343, "learning_rate": 8.14953424289285e-06, "loss": 0.4263, "step": 2106 }, { "epoch": 1.774564851207187, "grad_norm": 0.3962557315826416, "learning_rate": 8.14724949771705e-06, "loss": 0.3766, "step": 2107 }, { "epoch": 1.7754070746771475, "grad_norm": 0.5187271237373352, "learning_rate": 8.144963663646888e-06, "loss": 0.4651, "step": 2108 }, { "epoch": 1.7762492981471083, "grad_norm": 0.39791685342788696, "learning_rate": 8.142676741473218e-06, "loss": 0.3832, "step": 2109 }, { "epoch": 1.777091521617069, "grad_norm": 0.46503520011901855, "learning_rate": 8.140388731987284e-06, "loss": 0.4326, "step": 2110 }, { "epoch": 1.7779337450870298, "grad_norm": 0.43394729495048523, "learning_rate": 8.13809963598069e-06, "loss": 0.4044, "step": 2111 }, { "epoch": 1.7787759685569906, "grad_norm": 0.4611043334007263, "learning_rate": 8.135809454245429e-06, "loss": 0.4206, "step": 2112 }, { "epoch": 1.7796181920269512, "grad_norm": 0.4883257746696472, "learning_rate": 8.133518187573864e-06, "loss": 0.4081, "step": 2113 }, { "epoch": 1.7804604154969117, "grad_norm": 0.527190089225769, "learning_rate": 8.131225836758732e-06, "loss": 0.4502, "step": 2114 }, { "epoch": 1.7813026389668725, "grad_norm": 0.5460709929466248, "learning_rate": 8.128932402593149e-06, "loss": 0.4013, "step": 2115 }, { "epoch": 1.7821448624368332, "grad_norm": 0.38826051354408264, "learning_rate": 8.126637885870601e-06, "loss": 0.4015, "step": 2116 }, { "epoch": 1.782987085906794, "grad_norm": 0.6217319369316101, "learning_rate": 8.124342287384955e-06, "loss": 0.4381, "step": 2117 }, { "epoch": 1.7838293093767548, "grad_norm": 0.531176745891571, "learning_rate": 8.122045607930448e-06, "loss": 0.422, "step": 2118 }, { "epoch": 1.7846715328467153, "grad_norm": 0.36818113923072815, "learning_rate": 8.11974784830169e-06, "loss": 0.3845, "step": 2119 }, { "epoch": 1.7855137563166759, "grad_norm": 0.5104483962059021, "learning_rate": 8.117449009293668e-06, "loss": 0.4265, "step": 2120 }, { "epoch": 1.7863559797866366, "grad_norm": 0.5219356417655945, "learning_rate": 8.11514909170174e-06, "loss": 0.4176, "step": 2121 }, { "epoch": 1.7871982032565974, "grad_norm": 0.4948539733886719, "learning_rate": 8.11284809632164e-06, "loss": 0.4149, "step": 2122 }, { "epoch": 1.7880404267265582, "grad_norm": 0.45502138137817383, "learning_rate": 8.11054602394947e-06, "loss": 0.4503, "step": 2123 }, { "epoch": 1.788882650196519, "grad_norm": 0.4191913604736328, "learning_rate": 8.108242875381708e-06, "loss": 0.3993, "step": 2124 }, { "epoch": 1.7897248736664795, "grad_norm": 0.48848938941955566, "learning_rate": 8.105938651415207e-06, "loss": 0.4648, "step": 2125 }, { "epoch": 1.7905670971364402, "grad_norm": 0.39500728249549866, "learning_rate": 8.103633352847185e-06, "loss": 0.447, "step": 2126 }, { "epoch": 1.7914093206064008, "grad_norm": 0.45947498083114624, "learning_rate": 8.101326980475237e-06, "loss": 0.3895, "step": 2127 }, { "epoch": 1.7922515440763616, "grad_norm": 0.48967042565345764, "learning_rate": 8.099019535097332e-06, "loss": 0.4197, "step": 2128 }, { "epoch": 1.7930937675463223, "grad_norm": 0.39979997277259827, "learning_rate": 8.0967110175118e-06, "loss": 0.4221, "step": 2129 }, { "epoch": 1.793935991016283, "grad_norm": 0.6502275466918945, "learning_rate": 8.094401428517354e-06, "loss": 0.4468, "step": 2130 }, { "epoch": 1.7947782144862436, "grad_norm": 0.438873291015625, "learning_rate": 8.09209076891307e-06, "loss": 0.4067, "step": 2131 }, { "epoch": 1.7956204379562044, "grad_norm": 0.516609787940979, "learning_rate": 8.089779039498396e-06, "loss": 0.4122, "step": 2132 }, { "epoch": 1.796462661426165, "grad_norm": 0.4314734637737274, "learning_rate": 8.08746624107315e-06, "loss": 0.4219, "step": 2133 }, { "epoch": 1.7973048848961257, "grad_norm": 0.5303942561149597, "learning_rate": 8.085152374437525e-06, "loss": 0.417, "step": 2134 }, { "epoch": 1.7981471083660865, "grad_norm": 0.5948216915130615, "learning_rate": 8.082837440392073e-06, "loss": 0.4447, "step": 2135 }, { "epoch": 1.7989893318360473, "grad_norm": 0.4347953200340271, "learning_rate": 8.080521439737728e-06, "loss": 0.4355, "step": 2136 }, { "epoch": 1.799831555306008, "grad_norm": 0.5168211460113525, "learning_rate": 8.078204373275783e-06, "loss": 0.4111, "step": 2137 }, { "epoch": 1.8006737787759686, "grad_norm": 0.39689913392066956, "learning_rate": 8.075886241807901e-06, "loss": 0.4344, "step": 2138 }, { "epoch": 1.8015160022459291, "grad_norm": 0.44825732707977295, "learning_rate": 8.07356704613612e-06, "loss": 0.4293, "step": 2139 }, { "epoch": 1.8023582257158899, "grad_norm": 0.48535045981407166, "learning_rate": 8.07124678706284e-06, "loss": 0.421, "step": 2140 }, { "epoch": 1.8032004491858507, "grad_norm": 0.36820539832115173, "learning_rate": 8.06892546539083e-06, "loss": 0.3849, "step": 2141 }, { "epoch": 1.8040426726558114, "grad_norm": 0.5521648526191711, "learning_rate": 8.066603081923229e-06, "loss": 0.4833, "step": 2142 }, { "epoch": 1.8048848961257722, "grad_norm": 0.40797778964042664, "learning_rate": 8.064279637463541e-06, "loss": 0.3974, "step": 2143 }, { "epoch": 1.8057271195957327, "grad_norm": 0.4963454604148865, "learning_rate": 8.06195513281564e-06, "loss": 0.4297, "step": 2144 }, { "epoch": 1.8065693430656933, "grad_norm": 0.5404887199401855, "learning_rate": 8.059629568783762e-06, "loss": 0.4609, "step": 2145 }, { "epoch": 1.807411566535654, "grad_norm": 0.41455137729644775, "learning_rate": 8.057302946172515e-06, "loss": 0.3606, "step": 2146 }, { "epoch": 1.8082537900056148, "grad_norm": 0.5106844305992126, "learning_rate": 8.054975265786867e-06, "loss": 0.4322, "step": 2147 }, { "epoch": 1.8090960134755756, "grad_norm": 0.427109956741333, "learning_rate": 8.052646528432157e-06, "loss": 0.4104, "step": 2148 }, { "epoch": 1.8099382369455363, "grad_norm": 0.45564568042755127, "learning_rate": 8.05031673491409e-06, "loss": 0.4359, "step": 2149 }, { "epoch": 1.810780460415497, "grad_norm": 0.4153388738632202, "learning_rate": 8.047985886038732e-06, "loss": 0.4035, "step": 2150 }, { "epoch": 1.8116226838854577, "grad_norm": 0.5112539529800415, "learning_rate": 8.04565398261252e-06, "loss": 0.4622, "step": 2151 }, { "epoch": 1.8124649073554182, "grad_norm": 0.4414185881614685, "learning_rate": 8.043321025442248e-06, "loss": 0.4509, "step": 2152 }, { "epoch": 1.813307130825379, "grad_norm": 0.5314942598342896, "learning_rate": 8.040987015335085e-06, "loss": 0.4292, "step": 2153 }, { "epoch": 1.8141493542953397, "grad_norm": 0.49928420782089233, "learning_rate": 8.038651953098553e-06, "loss": 0.4653, "step": 2154 }, { "epoch": 1.8149915777653005, "grad_norm": 0.4704899787902832, "learning_rate": 8.036315839540545e-06, "loss": 0.4078, "step": 2155 }, { "epoch": 1.815833801235261, "grad_norm": 0.45496252179145813, "learning_rate": 8.033978675469318e-06, "loss": 0.3955, "step": 2156 }, { "epoch": 1.8166760247052218, "grad_norm": 0.45217305421829224, "learning_rate": 8.03164046169349e-06, "loss": 0.4091, "step": 2157 }, { "epoch": 1.8175182481751824, "grad_norm": 0.4861302673816681, "learning_rate": 8.029301199022042e-06, "loss": 0.4296, "step": 2158 }, { "epoch": 1.8183604716451431, "grad_norm": 0.4727301299571991, "learning_rate": 8.026960888264319e-06, "loss": 0.4354, "step": 2159 }, { "epoch": 1.819202695115104, "grad_norm": 0.5343275666236877, "learning_rate": 8.02461953023003e-06, "loss": 0.4688, "step": 2160 }, { "epoch": 1.8200449185850647, "grad_norm": 0.4657113254070282, "learning_rate": 8.022277125729242e-06, "loss": 0.4049, "step": 2161 }, { "epoch": 1.8208871420550252, "grad_norm": 0.5548436045646667, "learning_rate": 8.019933675572389e-06, "loss": 0.3856, "step": 2162 }, { "epoch": 1.821729365524986, "grad_norm": 0.46004951000213623, "learning_rate": 8.017589180570263e-06, "loss": 0.3958, "step": 2163 }, { "epoch": 1.8225715889949465, "grad_norm": 0.44965583086013794, "learning_rate": 8.015243641534021e-06, "loss": 0.4304, "step": 2164 }, { "epoch": 1.8234138124649073, "grad_norm": 0.5562565326690674, "learning_rate": 8.012897059275177e-06, "loss": 0.4616, "step": 2165 }, { "epoch": 1.824256035934868, "grad_norm": 0.4184322655200958, "learning_rate": 8.01054943460561e-06, "loss": 0.4065, "step": 2166 }, { "epoch": 1.8250982594048288, "grad_norm": 0.4971456229686737, "learning_rate": 8.008200768337558e-06, "loss": 0.4093, "step": 2167 }, { "epoch": 1.8259404828747896, "grad_norm": 0.4426514804363251, "learning_rate": 8.005851061283618e-06, "loss": 0.3986, "step": 2168 }, { "epoch": 1.8267827063447502, "grad_norm": 0.4178456664085388, "learning_rate": 8.003500314256749e-06, "loss": 0.4312, "step": 2169 }, { "epoch": 1.8276249298147107, "grad_norm": 0.5166444182395935, "learning_rate": 8.001148528070268e-06, "loss": 0.4534, "step": 2170 }, { "epoch": 1.8284671532846715, "grad_norm": 0.4338639974594116, "learning_rate": 7.998795703537856e-06, "loss": 0.4091, "step": 2171 }, { "epoch": 1.8293093767546322, "grad_norm": 0.3529128432273865, "learning_rate": 7.996441841473545e-06, "loss": 0.4124, "step": 2172 }, { "epoch": 1.830151600224593, "grad_norm": 0.44845345616340637, "learning_rate": 7.994086942691734e-06, "loss": 0.4329, "step": 2173 }, { "epoch": 1.8309938236945538, "grad_norm": 0.47825106978416443, "learning_rate": 7.99173100800718e-06, "loss": 0.4035, "step": 2174 }, { "epoch": 1.8318360471645143, "grad_norm": 0.4027039706707001, "learning_rate": 7.989374038234992e-06, "loss": 0.3702, "step": 2175 }, { "epoch": 1.8326782706344749, "grad_norm": 0.4478592276573181, "learning_rate": 7.98701603419064e-06, "loss": 0.4152, "step": 2176 }, { "epoch": 1.8335204941044356, "grad_norm": 0.42185312509536743, "learning_rate": 7.984656996689959e-06, "loss": 0.4267, "step": 2177 }, { "epoch": 1.8343627175743964, "grad_norm": 0.42277058959007263, "learning_rate": 7.982296926549128e-06, "loss": 0.4185, "step": 2178 }, { "epoch": 1.8352049410443572, "grad_norm": 0.40687575936317444, "learning_rate": 7.979935824584697e-06, "loss": 0.3786, "step": 2179 }, { "epoch": 1.836047164514318, "grad_norm": 0.421566903591156, "learning_rate": 7.977573691613564e-06, "loss": 0.4539, "step": 2180 }, { "epoch": 1.8368893879842785, "grad_norm": 0.47993147373199463, "learning_rate": 7.975210528452989e-06, "loss": 0.4302, "step": 2181 }, { "epoch": 1.8377316114542392, "grad_norm": 0.41261056065559387, "learning_rate": 7.97284633592058e-06, "loss": 0.4123, "step": 2182 }, { "epoch": 1.8385738349241998, "grad_norm": 0.4098217189311981, "learning_rate": 7.970481114834312e-06, "loss": 0.4052, "step": 2183 }, { "epoch": 1.8394160583941606, "grad_norm": 0.4734603464603424, "learning_rate": 7.96811486601251e-06, "loss": 0.4337, "step": 2184 }, { "epoch": 1.8402582818641213, "grad_norm": 0.4259733557701111, "learning_rate": 7.965747590273858e-06, "loss": 0.4183, "step": 2185 }, { "epoch": 1.841100505334082, "grad_norm": 0.49348220229148865, "learning_rate": 7.963379288437387e-06, "loss": 0.4356, "step": 2186 }, { "epoch": 1.8419427288040426, "grad_norm": 0.5072171092033386, "learning_rate": 7.961009961322495e-06, "loss": 0.43, "step": 2187 }, { "epoch": 1.8427849522740034, "grad_norm": 0.3887498378753662, "learning_rate": 7.958639609748924e-06, "loss": 0.4184, "step": 2188 }, { "epoch": 1.843627175743964, "grad_norm": 0.4558362066745758, "learning_rate": 7.956268234536777e-06, "loss": 0.3983, "step": 2189 }, { "epoch": 1.8444693992139247, "grad_norm": 0.4678648114204407, "learning_rate": 7.953895836506508e-06, "loss": 0.4762, "step": 2190 }, { "epoch": 1.8453116226838855, "grad_norm": 0.442787230014801, "learning_rate": 7.951522416478928e-06, "loss": 0.3934, "step": 2191 }, { "epoch": 1.8461538461538463, "grad_norm": 0.5269767045974731, "learning_rate": 7.949147975275198e-06, "loss": 0.4624, "step": 2192 }, { "epoch": 1.8469960696238068, "grad_norm": 0.3989448845386505, "learning_rate": 7.946772513716834e-06, "loss": 0.4024, "step": 2193 }, { "epoch": 1.8478382930937676, "grad_norm": 0.3966032564640045, "learning_rate": 7.944396032625705e-06, "loss": 0.386, "step": 2194 }, { "epoch": 1.8486805165637281, "grad_norm": 0.49084967374801636, "learning_rate": 7.942018532824035e-06, "loss": 0.4215, "step": 2195 }, { "epoch": 1.8495227400336889, "grad_norm": 0.4776197671890259, "learning_rate": 7.939640015134394e-06, "loss": 0.4388, "step": 2196 }, { "epoch": 1.8503649635036497, "grad_norm": 0.44070446491241455, "learning_rate": 7.937260480379711e-06, "loss": 0.4149, "step": 2197 }, { "epoch": 1.8512071869736104, "grad_norm": 0.5201008915901184, "learning_rate": 7.934879929383266e-06, "loss": 0.4131, "step": 2198 }, { "epoch": 1.8520494104435712, "grad_norm": 0.46868985891342163, "learning_rate": 7.932498362968683e-06, "loss": 0.3897, "step": 2199 }, { "epoch": 1.8528916339135317, "grad_norm": 0.4521868824958801, "learning_rate": 7.930115781959951e-06, "loss": 0.415, "step": 2200 }, { "epoch": 1.8537338573834923, "grad_norm": 0.4453527629375458, "learning_rate": 7.927732187181396e-06, "loss": 0.4441, "step": 2201 }, { "epoch": 1.854576080853453, "grad_norm": 0.4064571261405945, "learning_rate": 7.925347579457703e-06, "loss": 0.4003, "step": 2202 }, { "epoch": 1.8554183043234138, "grad_norm": 0.45339393615722656, "learning_rate": 7.922961959613904e-06, "loss": 0.4299, "step": 2203 }, { "epoch": 1.8562605277933746, "grad_norm": 0.4163025915622711, "learning_rate": 7.920575328475386e-06, "loss": 0.4207, "step": 2204 }, { "epoch": 1.8571027512633353, "grad_norm": 0.4010704755783081, "learning_rate": 7.918187686867878e-06, "loss": 0.4185, "step": 2205 }, { "epoch": 1.857944974733296, "grad_norm": 0.4116653800010681, "learning_rate": 7.915799035617468e-06, "loss": 0.4261, "step": 2206 }, { "epoch": 1.8587871982032564, "grad_norm": 0.4313340187072754, "learning_rate": 7.913409375550584e-06, "loss": 0.3864, "step": 2207 }, { "epoch": 1.8596294216732172, "grad_norm": 0.45681747794151306, "learning_rate": 7.91101870749401e-06, "loss": 0.424, "step": 2208 }, { "epoch": 1.860471645143178, "grad_norm": 0.4325839579105377, "learning_rate": 7.908627032274876e-06, "loss": 0.4017, "step": 2209 }, { "epoch": 1.8613138686131387, "grad_norm": 0.44237881898880005, "learning_rate": 7.906234350720658e-06, "loss": 0.4468, "step": 2210 }, { "epoch": 1.8621560920830995, "grad_norm": 0.4689525365829468, "learning_rate": 7.903840663659186e-06, "loss": 0.4418, "step": 2211 }, { "epoch": 1.86299831555306, "grad_norm": 0.40524742007255554, "learning_rate": 7.901445971918633e-06, "loss": 0.4087, "step": 2212 }, { "epoch": 1.8638405390230208, "grad_norm": 0.46004968881607056, "learning_rate": 7.89905027632752e-06, "loss": 0.4399, "step": 2213 }, { "epoch": 1.8646827624929814, "grad_norm": 0.372577041387558, "learning_rate": 7.896653577714722e-06, "loss": 0.3835, "step": 2214 }, { "epoch": 1.8655249859629421, "grad_norm": 0.511300265789032, "learning_rate": 7.89425587690945e-06, "loss": 0.4391, "step": 2215 }, { "epoch": 1.866367209432903, "grad_norm": 0.4844171702861786, "learning_rate": 7.891857174741268e-06, "loss": 0.4763, "step": 2216 }, { "epoch": 1.8672094329028637, "grad_norm": 0.36846452951431274, "learning_rate": 7.889457472040088e-06, "loss": 0.4044, "step": 2217 }, { "epoch": 1.8680516563728242, "grad_norm": 0.5026372671127319, "learning_rate": 7.887056769636166e-06, "loss": 0.4628, "step": 2218 }, { "epoch": 1.868893879842785, "grad_norm": 0.4031389653682709, "learning_rate": 7.884655068360102e-06, "loss": 0.3852, "step": 2219 }, { "epoch": 1.8697361033127455, "grad_norm": 0.3928333520889282, "learning_rate": 7.882252369042846e-06, "loss": 0.4312, "step": 2220 }, { "epoch": 1.8705783267827063, "grad_norm": 0.45821118354797363, "learning_rate": 7.879848672515686e-06, "loss": 0.4536, "step": 2221 }, { "epoch": 1.871420550252667, "grad_norm": 0.40815725922584534, "learning_rate": 7.877443979610266e-06, "loss": 0.4235, "step": 2222 }, { "epoch": 1.8722627737226278, "grad_norm": 0.4239349961280823, "learning_rate": 7.875038291158564e-06, "loss": 0.4178, "step": 2223 }, { "epoch": 1.8731049971925884, "grad_norm": 0.39337092638015747, "learning_rate": 7.87263160799291e-06, "loss": 0.4006, "step": 2224 }, { "epoch": 1.8739472206625492, "grad_norm": 0.47961464524269104, "learning_rate": 7.870223930945973e-06, "loss": 0.4562, "step": 2225 }, { "epoch": 1.8747894441325097, "grad_norm": 0.4185568392276764, "learning_rate": 7.867815260850766e-06, "loss": 0.3808, "step": 2226 }, { "epoch": 1.8756316676024705, "grad_norm": 0.4588843882083893, "learning_rate": 7.865405598540653e-06, "loss": 0.4622, "step": 2227 }, { "epoch": 1.8764738910724312, "grad_norm": 0.36958450078964233, "learning_rate": 7.862994944849333e-06, "loss": 0.4093, "step": 2228 }, { "epoch": 1.877316114542392, "grad_norm": 0.4501212239265442, "learning_rate": 7.860583300610849e-06, "loss": 0.4071, "step": 2229 }, { "epoch": 1.8781583380123528, "grad_norm": 0.437223345041275, "learning_rate": 7.858170666659592e-06, "loss": 0.4256, "step": 2230 }, { "epoch": 1.8790005614823133, "grad_norm": 0.4552583396434784, "learning_rate": 7.855757043830288e-06, "loss": 0.4368, "step": 2231 }, { "epoch": 1.8798427849522739, "grad_norm": 0.4665454626083374, "learning_rate": 7.853342432958012e-06, "loss": 0.4212, "step": 2232 }, { "epoch": 1.8806850084222346, "grad_norm": 0.39152124524116516, "learning_rate": 7.85092683487818e-06, "loss": 0.3807, "step": 2233 }, { "epoch": 1.8815272318921954, "grad_norm": 0.5878326296806335, "learning_rate": 7.848510250426543e-06, "loss": 0.4419, "step": 2234 }, { "epoch": 1.8823694553621562, "grad_norm": 0.41209447383880615, "learning_rate": 7.846092680439199e-06, "loss": 0.4477, "step": 2235 }, { "epoch": 1.883211678832117, "grad_norm": 0.518215537071228, "learning_rate": 7.84367412575259e-06, "loss": 0.419, "step": 2236 }, { "epoch": 1.8840539023020775, "grad_norm": 0.43436184525489807, "learning_rate": 7.841254587203486e-06, "loss": 0.3861, "step": 2237 }, { "epoch": 1.884896125772038, "grad_norm": 0.43222084641456604, "learning_rate": 7.838834065629014e-06, "loss": 0.4053, "step": 2238 }, { "epoch": 1.8857383492419988, "grad_norm": 0.48062723875045776, "learning_rate": 7.836412561866628e-06, "loss": 0.4107, "step": 2239 }, { "epoch": 1.8865805727119596, "grad_norm": 0.43677258491516113, "learning_rate": 7.83399007675413e-06, "loss": 0.4711, "step": 2240 }, { "epoch": 1.8874227961819203, "grad_norm": 0.4089498817920685, "learning_rate": 7.831566611129655e-06, "loss": 0.411, "step": 2241 }, { "epoch": 1.888265019651881, "grad_norm": 0.4284714162349701, "learning_rate": 7.829142165831684e-06, "loss": 0.394, "step": 2242 }, { "epoch": 1.8891072431218416, "grad_norm": 0.4424345791339874, "learning_rate": 7.826716741699031e-06, "loss": 0.4148, "step": 2243 }, { "epoch": 1.8899494665918024, "grad_norm": 0.4175011217594147, "learning_rate": 7.824290339570853e-06, "loss": 0.4256, "step": 2244 }, { "epoch": 1.890791690061763, "grad_norm": 0.47767317295074463, "learning_rate": 7.821862960286641e-06, "loss": 0.4424, "step": 2245 }, { "epoch": 1.8916339135317237, "grad_norm": 0.433854877948761, "learning_rate": 7.81943460468623e-06, "loss": 0.3867, "step": 2246 }, { "epoch": 1.8924761370016845, "grad_norm": 0.4683780074119568, "learning_rate": 7.817005273609787e-06, "loss": 0.4234, "step": 2247 }, { "epoch": 1.8933183604716453, "grad_norm": 0.4002044200897217, "learning_rate": 7.814574967897819e-06, "loss": 0.4264, "step": 2248 }, { "epoch": 1.8941605839416058, "grad_norm": 0.5174026489257812, "learning_rate": 7.812143688391171e-06, "loss": 0.4149, "step": 2249 }, { "epoch": 1.8950028074115666, "grad_norm": 0.4089507758617401, "learning_rate": 7.809711435931021e-06, "loss": 0.441, "step": 2250 }, { "epoch": 1.8958450308815271, "grad_norm": 0.4783347547054291, "learning_rate": 7.807278211358893e-06, "loss": 0.4395, "step": 2251 }, { "epoch": 1.8966872543514879, "grad_norm": 0.4121401309967041, "learning_rate": 7.804844015516638e-06, "loss": 0.4173, "step": 2252 }, { "epoch": 1.8975294778214487, "grad_norm": 0.37389329075813293, "learning_rate": 7.802408849246443e-06, "loss": 0.3837, "step": 2253 }, { "epoch": 1.8983717012914094, "grad_norm": 0.48550093173980713, "learning_rate": 7.799972713390835e-06, "loss": 0.4624, "step": 2254 }, { "epoch": 1.89921392476137, "grad_norm": 0.40800291299819946, "learning_rate": 7.79753560879268e-06, "loss": 0.4272, "step": 2255 }, { "epoch": 1.9000561482313307, "grad_norm": 0.40060028433799744, "learning_rate": 7.795097536295166e-06, "loss": 0.4141, "step": 2256 }, { "epoch": 1.9008983717012913, "grad_norm": 0.42208436131477356, "learning_rate": 7.792658496741832e-06, "loss": 0.3854, "step": 2257 }, { "epoch": 1.901740595171252, "grad_norm": 0.4338388741016388, "learning_rate": 7.79021849097654e-06, "loss": 0.4491, "step": 2258 }, { "epoch": 1.9025828186412128, "grad_norm": 0.3771706521511078, "learning_rate": 7.787777519843492e-06, "loss": 0.4054, "step": 2259 }, { "epoch": 1.9034250421111736, "grad_norm": 0.4534549117088318, "learning_rate": 7.78533558418722e-06, "loss": 0.4498, "step": 2260 }, { "epoch": 1.9042672655811343, "grad_norm": 0.35544222593307495, "learning_rate": 7.782892684852593e-06, "loss": 0.3894, "step": 2261 }, { "epoch": 1.905109489051095, "grad_norm": 0.3807908296585083, "learning_rate": 7.780448822684811e-06, "loss": 0.4261, "step": 2262 }, { "epoch": 1.9059517125210554, "grad_norm": 0.42635655403137207, "learning_rate": 7.778003998529414e-06, "loss": 0.4441, "step": 2263 }, { "epoch": 1.9067939359910162, "grad_norm": 0.38137882947921753, "learning_rate": 7.775558213232261e-06, "loss": 0.375, "step": 2264 }, { "epoch": 1.907636159460977, "grad_norm": 0.4740426540374756, "learning_rate": 7.773111467639557e-06, "loss": 0.4172, "step": 2265 }, { "epoch": 1.9084783829309377, "grad_norm": 0.4835158884525299, "learning_rate": 7.770663762597832e-06, "loss": 0.4475, "step": 2266 }, { "epoch": 1.9093206064008985, "grad_norm": 0.42120134830474854, "learning_rate": 7.768215098953952e-06, "loss": 0.4052, "step": 2267 }, { "epoch": 1.910162829870859, "grad_norm": 0.4571888744831085, "learning_rate": 7.765765477555111e-06, "loss": 0.4447, "step": 2268 }, { "epoch": 1.9110050533408196, "grad_norm": 0.3711426258087158, "learning_rate": 7.763314899248838e-06, "loss": 0.3952, "step": 2269 }, { "epoch": 1.9118472768107804, "grad_norm": 0.4525761604309082, "learning_rate": 7.760863364882985e-06, "loss": 0.4089, "step": 2270 }, { "epoch": 1.9126895002807411, "grad_norm": 0.40198585391044617, "learning_rate": 7.758410875305749e-06, "loss": 0.4038, "step": 2271 }, { "epoch": 1.913531723750702, "grad_norm": 0.40404975414276123, "learning_rate": 7.755957431365644e-06, "loss": 0.4397, "step": 2272 }, { "epoch": 1.9143739472206627, "grad_norm": 0.44432583451271057, "learning_rate": 7.753503033911522e-06, "loss": 0.4021, "step": 2273 }, { "epoch": 1.9152161706906232, "grad_norm": 0.447574257850647, "learning_rate": 7.751047683792562e-06, "loss": 0.4269, "step": 2274 }, { "epoch": 1.916058394160584, "grad_norm": 0.5312681198120117, "learning_rate": 7.748591381858273e-06, "loss": 0.4594, "step": 2275 }, { "epoch": 1.9169006176305445, "grad_norm": 0.42570194602012634, "learning_rate": 7.746134128958491e-06, "loss": 0.4175, "step": 2276 }, { "epoch": 1.9177428411005053, "grad_norm": 0.4594205915927887, "learning_rate": 7.743675925943389e-06, "loss": 0.4335, "step": 2277 }, { "epoch": 1.918585064570466, "grad_norm": 0.45350247621536255, "learning_rate": 7.741216773663455e-06, "loss": 0.3924, "step": 2278 }, { "epoch": 1.9194272880404268, "grad_norm": 0.48136866092681885, "learning_rate": 7.73875667296952e-06, "loss": 0.4592, "step": 2279 }, { "epoch": 1.9202695115103874, "grad_norm": 0.4313504695892334, "learning_rate": 7.736295624712735e-06, "loss": 0.416, "step": 2280 }, { "epoch": 1.9211117349803482, "grad_norm": 0.44472622871398926, "learning_rate": 7.733833629744579e-06, "loss": 0.4246, "step": 2281 }, { "epoch": 1.9219539584503087, "grad_norm": 0.397116482257843, "learning_rate": 7.731370688916863e-06, "loss": 0.3717, "step": 2282 }, { "epoch": 1.9227961819202695, "grad_norm": 0.4700952470302582, "learning_rate": 7.72890680308172e-06, "loss": 0.4509, "step": 2283 }, { "epoch": 1.9236384053902302, "grad_norm": 0.414626806974411, "learning_rate": 7.726441973091612e-06, "loss": 0.4409, "step": 2284 }, { "epoch": 1.924480628860191, "grad_norm": 0.4221349060535431, "learning_rate": 7.723976199799328e-06, "loss": 0.4288, "step": 2285 }, { "epoch": 1.9253228523301515, "grad_norm": 0.4493381977081299, "learning_rate": 7.721509484057986e-06, "loss": 0.406, "step": 2286 }, { "epoch": 1.9261650758001123, "grad_norm": 0.4102017283439636, "learning_rate": 7.719041826721025e-06, "loss": 0.406, "step": 2287 }, { "epoch": 1.9270072992700729, "grad_norm": 0.46869006752967834, "learning_rate": 7.71657322864221e-06, "loss": 0.4489, "step": 2288 }, { "epoch": 1.9278495227400336, "grad_norm": 0.4602300822734833, "learning_rate": 7.714103690675638e-06, "loss": 0.4498, "step": 2289 }, { "epoch": 1.9286917462099944, "grad_norm": 0.4005264937877655, "learning_rate": 7.711633213675724e-06, "loss": 0.4022, "step": 2290 }, { "epoch": 1.9295339696799552, "grad_norm": 0.47333115339279175, "learning_rate": 7.709161798497213e-06, "loss": 0.4223, "step": 2291 }, { "epoch": 1.930376193149916, "grad_norm": 0.4595816135406494, "learning_rate": 7.70668944599517e-06, "loss": 0.4329, "step": 2292 }, { "epoch": 1.9312184166198765, "grad_norm": 0.4188602864742279, "learning_rate": 7.704216157024986e-06, "loss": 0.4224, "step": 2293 }, { "epoch": 1.932060640089837, "grad_norm": 0.4075416326522827, "learning_rate": 7.701741932442381e-06, "loss": 0.3658, "step": 2294 }, { "epoch": 1.9329028635597978, "grad_norm": 0.5240015387535095, "learning_rate": 7.69926677310339e-06, "loss": 0.4657, "step": 2295 }, { "epoch": 1.9337450870297586, "grad_norm": 0.3449026048183441, "learning_rate": 7.696790679864376e-06, "loss": 0.3914, "step": 2296 }, { "epoch": 1.9345873104997193, "grad_norm": 0.5536909103393555, "learning_rate": 7.694313653582026e-06, "loss": 0.4512, "step": 2297 }, { "epoch": 1.93542953396968, "grad_norm": 0.44892656803131104, "learning_rate": 7.691835695113348e-06, "loss": 0.4341, "step": 2298 }, { "epoch": 1.9362717574396406, "grad_norm": 0.3883119821548462, "learning_rate": 7.689356805315676e-06, "loss": 0.382, "step": 2299 }, { "epoch": 1.9371139809096012, "grad_norm": 0.4982506334781647, "learning_rate": 7.686876985046658e-06, "loss": 0.4313, "step": 2300 }, { "epoch": 1.937956204379562, "grad_norm": 0.4563843905925751, "learning_rate": 7.684396235164272e-06, "loss": 0.4354, "step": 2301 }, { "epoch": 1.9387984278495227, "grad_norm": 0.4394354224205017, "learning_rate": 7.681914556526816e-06, "loss": 0.4127, "step": 2302 }, { "epoch": 1.9396406513194835, "grad_norm": 0.47024598717689514, "learning_rate": 7.679431949992908e-06, "loss": 0.4345, "step": 2303 }, { "epoch": 1.9404828747894443, "grad_norm": 0.4237031936645508, "learning_rate": 7.676948416421487e-06, "loss": 0.4268, "step": 2304 }, { "epoch": 1.9413250982594048, "grad_norm": 0.5053021311759949, "learning_rate": 7.67446395667181e-06, "loss": 0.4307, "step": 2305 }, { "epoch": 1.9421673217293656, "grad_norm": 0.5034184455871582, "learning_rate": 7.671978571603464e-06, "loss": 0.4305, "step": 2306 }, { "epoch": 1.9430095451993261, "grad_norm": 0.4683228135108948, "learning_rate": 7.669492262076344e-06, "loss": 0.4022, "step": 2307 }, { "epoch": 1.9438517686692869, "grad_norm": 0.49109575152397156, "learning_rate": 7.667005028950671e-06, "loss": 0.4125, "step": 2308 }, { "epoch": 1.9446939921392477, "grad_norm": 0.41540390253067017, "learning_rate": 7.664516873086987e-06, "loss": 0.4141, "step": 2309 }, { "epoch": 1.9455362156092084, "grad_norm": 0.45268017053604126, "learning_rate": 7.662027795346153e-06, "loss": 0.3711, "step": 2310 }, { "epoch": 1.946378439079169, "grad_norm": 0.42564329504966736, "learning_rate": 7.659537796589341e-06, "loss": 0.4341, "step": 2311 }, { "epoch": 1.9472206625491297, "grad_norm": 0.4052431881427765, "learning_rate": 7.657046877678054e-06, "loss": 0.4452, "step": 2312 }, { "epoch": 1.9480628860190903, "grad_norm": 0.38159602880477905, "learning_rate": 7.654555039474103e-06, "loss": 0.3916, "step": 2313 }, { "epoch": 1.948905109489051, "grad_norm": 0.3786541819572449, "learning_rate": 7.652062282839626e-06, "loss": 0.4174, "step": 2314 }, { "epoch": 1.9497473329590118, "grad_norm": 0.40515053272247314, "learning_rate": 7.649568608637069e-06, "loss": 0.4498, "step": 2315 }, { "epoch": 1.9505895564289726, "grad_norm": 0.4292750954627991, "learning_rate": 7.647074017729203e-06, "loss": 0.4052, "step": 2316 }, { "epoch": 1.9514317798989333, "grad_norm": 0.39685511589050293, "learning_rate": 7.644578510979111e-06, "loss": 0.4337, "step": 2317 }, { "epoch": 1.952274003368894, "grad_norm": 0.4079373776912689, "learning_rate": 7.642082089250203e-06, "loss": 0.4195, "step": 2318 }, { "epoch": 1.9531162268388544, "grad_norm": 0.4269951283931732, "learning_rate": 7.639584753406188e-06, "loss": 0.3779, "step": 2319 }, { "epoch": 1.9539584503088152, "grad_norm": 0.42815789580345154, "learning_rate": 7.637086504311111e-06, "loss": 0.4461, "step": 2320 }, { "epoch": 1.954800673778776, "grad_norm": 0.45192933082580566, "learning_rate": 7.634587342829315e-06, "loss": 0.4313, "step": 2321 }, { "epoch": 1.9556428972487367, "grad_norm": 0.4443899691104889, "learning_rate": 7.632087269825475e-06, "loss": 0.4209, "step": 2322 }, { "epoch": 1.9564851207186975, "grad_norm": 0.4313228726387024, "learning_rate": 7.629586286164566e-06, "loss": 0.4344, "step": 2323 }, { "epoch": 1.957327344188658, "grad_norm": 0.4112071692943573, "learning_rate": 7.62708439271189e-06, "loss": 0.4117, "step": 2324 }, { "epoch": 1.9581695676586186, "grad_norm": 0.41358062624931335, "learning_rate": 7.624581590333055e-06, "loss": 0.4032, "step": 2325 }, { "epoch": 1.9590117911285794, "grad_norm": 0.4124736785888672, "learning_rate": 7.6220778798939935e-06, "loss": 0.4134, "step": 2326 }, { "epoch": 1.9598540145985401, "grad_norm": 0.41618645191192627, "learning_rate": 7.619573262260943e-06, "loss": 0.4763, "step": 2327 }, { "epoch": 1.960696238068501, "grad_norm": 0.3802611827850342, "learning_rate": 7.617067738300459e-06, "loss": 0.3907, "step": 2328 }, { "epoch": 1.9615384615384617, "grad_norm": 0.44549760222435, "learning_rate": 7.614561308879409e-06, "loss": 0.434, "step": 2329 }, { "epoch": 1.9623806850084222, "grad_norm": 0.42627549171447754, "learning_rate": 7.612053974864976e-06, "loss": 0.3984, "step": 2330 }, { "epoch": 1.9632229084783828, "grad_norm": 0.403366357088089, "learning_rate": 7.609545737124653e-06, "loss": 0.4258, "step": 2331 }, { "epoch": 1.9640651319483435, "grad_norm": 0.3876890540122986, "learning_rate": 7.607036596526248e-06, "loss": 0.4178, "step": 2332 }, { "epoch": 1.9649073554183043, "grad_norm": 0.4221363067626953, "learning_rate": 7.604526553937881e-06, "loss": 0.4232, "step": 2333 }, { "epoch": 1.965749578888265, "grad_norm": 0.4242715835571289, "learning_rate": 7.602015610227984e-06, "loss": 0.4282, "step": 2334 }, { "epoch": 1.9665918023582258, "grad_norm": 0.4477480351924896, "learning_rate": 7.599503766265299e-06, "loss": 0.4307, "step": 2335 }, { "epoch": 1.9674340258281864, "grad_norm": 0.45627638697624207, "learning_rate": 7.596991022918882e-06, "loss": 0.4336, "step": 2336 }, { "epoch": 1.9682762492981472, "grad_norm": 0.3704930245876312, "learning_rate": 7.594477381058099e-06, "loss": 0.3854, "step": 2337 }, { "epoch": 1.9691184727681077, "grad_norm": 0.4112611711025238, "learning_rate": 7.591962841552627e-06, "loss": 0.4226, "step": 2338 }, { "epoch": 1.9699606962380685, "grad_norm": 0.43739116191864014, "learning_rate": 7.589447405272452e-06, "loss": 0.3944, "step": 2339 }, { "epoch": 1.9708029197080292, "grad_norm": 0.45789217948913574, "learning_rate": 7.586931073087875e-06, "loss": 0.4281, "step": 2340 }, { "epoch": 1.97164514317799, "grad_norm": 0.39664050936698914, "learning_rate": 7.5844138458695e-06, "loss": 0.4099, "step": 2341 }, { "epoch": 1.9724873666479505, "grad_norm": 0.4394627511501312, "learning_rate": 7.581895724488249e-06, "loss": 0.4117, "step": 2342 }, { "epoch": 1.9733295901179113, "grad_norm": 0.4203369915485382, "learning_rate": 7.5793767098153445e-06, "loss": 0.4338, "step": 2343 }, { "epoch": 1.9741718135878719, "grad_norm": 0.4495774805545807, "learning_rate": 7.576856802722325e-06, "loss": 0.3988, "step": 2344 }, { "epoch": 1.9750140370578326, "grad_norm": 0.42017650604248047, "learning_rate": 7.574336004081033e-06, "loss": 0.401, "step": 2345 }, { "epoch": 1.9758562605277934, "grad_norm": 0.4140045642852783, "learning_rate": 7.571814314763626e-06, "loss": 0.4377, "step": 2346 }, { "epoch": 1.9766984839977542, "grad_norm": 0.4749555289745331, "learning_rate": 7.56929173564256e-06, "loss": 0.3982, "step": 2347 }, { "epoch": 1.977540707467715, "grad_norm": 0.4258587062358856, "learning_rate": 7.566768267590608e-06, "loss": 0.4423, "step": 2348 }, { "epoch": 1.9783829309376755, "grad_norm": 0.5297998189926147, "learning_rate": 7.564243911480842e-06, "loss": 0.4352, "step": 2349 }, { "epoch": 1.979225154407636, "grad_norm": 0.42165282368659973, "learning_rate": 7.561718668186651e-06, "loss": 0.4269, "step": 2350 }, { "epoch": 1.9800673778775968, "grad_norm": 0.5034988522529602, "learning_rate": 7.559192538581723e-06, "loss": 0.4434, "step": 2351 }, { "epoch": 1.9809096013475576, "grad_norm": 0.5448981523513794, "learning_rate": 7.556665523540058e-06, "loss": 0.3744, "step": 2352 }, { "epoch": 1.9817518248175183, "grad_norm": 0.4499759376049042, "learning_rate": 7.554137623935958e-06, "loss": 0.4619, "step": 2353 }, { "epoch": 1.982594048287479, "grad_norm": 0.40544286370277405, "learning_rate": 7.551608840644031e-06, "loss": 0.392, "step": 2354 }, { "epoch": 1.9834362717574396, "grad_norm": 0.4233337938785553, "learning_rate": 7.549079174539197e-06, "loss": 0.4236, "step": 2355 }, { "epoch": 1.9842784952274002, "grad_norm": 0.4794633686542511, "learning_rate": 7.546548626496674e-06, "loss": 0.4427, "step": 2356 }, { "epoch": 1.985120718697361, "grad_norm": 0.4141610860824585, "learning_rate": 7.54401719739199e-06, "loss": 0.39, "step": 2357 }, { "epoch": 1.9859629421673217, "grad_norm": 0.40143895149230957, "learning_rate": 7.541484888100974e-06, "loss": 0.406, "step": 2358 }, { "epoch": 1.9868051656372825, "grad_norm": 0.4139725863933563, "learning_rate": 7.538951699499763e-06, "loss": 0.4164, "step": 2359 }, { "epoch": 1.9876473891072433, "grad_norm": 0.4463483691215515, "learning_rate": 7.5364176324647974e-06, "loss": 0.4619, "step": 2360 }, { "epoch": 1.9884896125772038, "grad_norm": 0.3669396936893463, "learning_rate": 7.5338826878728196e-06, "loss": 0.3903, "step": 2361 }, { "epoch": 1.9893318360471643, "grad_norm": 0.37873542308807373, "learning_rate": 7.531346866600878e-06, "loss": 0.3823, "step": 2362 }, { "epoch": 1.9901740595171251, "grad_norm": 0.4959426522254944, "learning_rate": 7.528810169526322e-06, "loss": 0.492, "step": 2363 }, { "epoch": 1.9910162829870859, "grad_norm": 0.41902732849121094, "learning_rate": 7.526272597526807e-06, "loss": 0.4263, "step": 2364 }, { "epoch": 1.9918585064570467, "grad_norm": 0.4791882038116455, "learning_rate": 7.523734151480288e-06, "loss": 0.4356, "step": 2365 }, { "epoch": 1.9927007299270074, "grad_norm": 0.48954465985298157, "learning_rate": 7.521194832265024e-06, "loss": 0.4115, "step": 2366 }, { "epoch": 1.993542953396968, "grad_norm": 0.42382389307022095, "learning_rate": 7.518654640759579e-06, "loss": 0.3904, "step": 2367 }, { "epoch": 1.9943851768669287, "grad_norm": 0.43658944964408875, "learning_rate": 7.51611357784281e-06, "loss": 0.4377, "step": 2368 }, { "epoch": 1.9952274003368893, "grad_norm": 0.5070874691009521, "learning_rate": 7.513571644393885e-06, "loss": 0.4251, "step": 2369 }, { "epoch": 1.99606962380685, "grad_norm": 0.3915392756462097, "learning_rate": 7.511028841292268e-06, "loss": 0.4303, "step": 2370 }, { "epoch": 1.9969118472768108, "grad_norm": 0.449994295835495, "learning_rate": 7.508485169417727e-06, "loss": 0.4213, "step": 2371 }, { "epoch": 1.9977540707467716, "grad_norm": 0.5115357637405396, "learning_rate": 7.505940629650327e-06, "loss": 0.4326, "step": 2372 }, { "epoch": 1.9985962942167321, "grad_norm": 0.4168933033943176, "learning_rate": 7.503395222870436e-06, "loss": 0.3881, "step": 2373 }, { "epoch": 1.999438517686693, "grad_norm": 0.5255348682403564, "learning_rate": 7.50084894995872e-06, "loss": 0.4343, "step": 2374 }, { "epoch": 2.0002807411566534, "grad_norm": 0.7636502981185913, "learning_rate": 7.498301811796149e-06, "loss": 0.6644, "step": 2375 }, { "epoch": 2.001122964626614, "grad_norm": 0.42270055413246155, "learning_rate": 7.495753809263984e-06, "loss": 0.3488, "step": 2376 }, { "epoch": 2.001965188096575, "grad_norm": 0.4954950511455536, "learning_rate": 7.493204943243795e-06, "loss": 0.4146, "step": 2377 }, { "epoch": 2.0028074115665357, "grad_norm": 0.39596712589263916, "learning_rate": 7.490655214617443e-06, "loss": 0.339, "step": 2378 }, { "epoch": 2.0036496350364965, "grad_norm": 0.5418539047241211, "learning_rate": 7.488104624267092e-06, "loss": 0.436, "step": 2379 }, { "epoch": 2.004491858506457, "grad_norm": 0.3959310054779053, "learning_rate": 7.485553173075201e-06, "loss": 0.3828, "step": 2380 }, { "epoch": 2.0053340819764176, "grad_norm": 0.4356510639190674, "learning_rate": 7.4830008619245295e-06, "loss": 0.3688, "step": 2381 }, { "epoch": 2.0061763054463784, "grad_norm": 0.4752039909362793, "learning_rate": 7.480447691698134e-06, "loss": 0.4007, "step": 2382 }, { "epoch": 2.007018528916339, "grad_norm": 0.3961277902126312, "learning_rate": 7.4778936632793654e-06, "loss": 0.3762, "step": 2383 }, { "epoch": 2.0078607523863, "grad_norm": 0.5493079423904419, "learning_rate": 7.475338777551876e-06, "loss": 0.4103, "step": 2384 }, { "epoch": 2.0087029758562607, "grad_norm": 0.4265096187591553, "learning_rate": 7.472783035399611e-06, "loss": 0.4198, "step": 2385 }, { "epoch": 2.0095451993262214, "grad_norm": 0.44458839297294617, "learning_rate": 7.4702264377068136e-06, "loss": 0.3775, "step": 2386 }, { "epoch": 2.0103874227961818, "grad_norm": 0.45739227533340454, "learning_rate": 7.467668985358024e-06, "loss": 0.3573, "step": 2387 }, { "epoch": 2.0112296462661425, "grad_norm": 0.4804001450538635, "learning_rate": 7.465110679238076e-06, "loss": 0.4607, "step": 2388 }, { "epoch": 2.0120718697361033, "grad_norm": 0.3907682001590729, "learning_rate": 7.462551520232099e-06, "loss": 0.3426, "step": 2389 }, { "epoch": 2.012914093206064, "grad_norm": 0.4383847117424011, "learning_rate": 7.459991509225519e-06, "loss": 0.3811, "step": 2390 }, { "epoch": 2.013756316676025, "grad_norm": 0.4902573823928833, "learning_rate": 7.457430647104054e-06, "loss": 0.3956, "step": 2391 }, { "epoch": 2.0145985401459856, "grad_norm": 0.4220163822174072, "learning_rate": 7.454868934753723e-06, "loss": 0.3969, "step": 2392 }, { "epoch": 2.015440763615946, "grad_norm": 0.408650666475296, "learning_rate": 7.452306373060829e-06, "loss": 0.3512, "step": 2393 }, { "epoch": 2.0162829870859067, "grad_norm": 0.48511630296707153, "learning_rate": 7.449742962911979e-06, "loss": 0.3952, "step": 2394 }, { "epoch": 2.0171252105558675, "grad_norm": 0.4405776858329773, "learning_rate": 7.447178705194064e-06, "loss": 0.4077, "step": 2395 }, { "epoch": 2.0179674340258282, "grad_norm": 0.5347442626953125, "learning_rate": 7.444613600794281e-06, "loss": 0.4149, "step": 2396 }, { "epoch": 2.018809657495789, "grad_norm": 0.42815661430358887, "learning_rate": 7.4420476506001024e-06, "loss": 0.3588, "step": 2397 }, { "epoch": 2.0196518809657498, "grad_norm": 0.44005918502807617, "learning_rate": 7.43948085549931e-06, "loss": 0.3565, "step": 2398 }, { "epoch": 2.02049410443571, "grad_norm": 0.5014862418174744, "learning_rate": 7.436913216379968e-06, "loss": 0.4337, "step": 2399 }, { "epoch": 2.021336327905671, "grad_norm": 0.3564489185810089, "learning_rate": 7.434344734130438e-06, "loss": 0.3892, "step": 2400 }, { "epoch": 2.0221785513756316, "grad_norm": 0.4026906192302704, "learning_rate": 7.431775409639368e-06, "loss": 0.4034, "step": 2401 }, { "epoch": 2.0230207748455924, "grad_norm": 0.35910463333129883, "learning_rate": 7.429205243795701e-06, "loss": 0.344, "step": 2402 }, { "epoch": 2.023862998315553, "grad_norm": 0.3957500457763672, "learning_rate": 7.426634237488672e-06, "loss": 0.3996, "step": 2403 }, { "epoch": 2.024705221785514, "grad_norm": 0.3538331389427185, "learning_rate": 7.4240623916078015e-06, "loss": 0.3352, "step": 2404 }, { "epoch": 2.0255474452554743, "grad_norm": 0.36142396926879883, "learning_rate": 7.421489707042908e-06, "loss": 0.3772, "step": 2405 }, { "epoch": 2.026389668725435, "grad_norm": 0.38029101490974426, "learning_rate": 7.418916184684094e-06, "loss": 0.3361, "step": 2406 }, { "epoch": 2.027231892195396, "grad_norm": 0.42076575756073, "learning_rate": 7.416341825421755e-06, "loss": 0.441, "step": 2407 }, { "epoch": 2.0280741156653566, "grad_norm": 0.38075751066207886, "learning_rate": 7.4137666301465735e-06, "loss": 0.4158, "step": 2408 }, { "epoch": 2.0289163391353173, "grad_norm": 0.36711040139198303, "learning_rate": 7.411190599749526e-06, "loss": 0.3759, "step": 2409 }, { "epoch": 2.029758562605278, "grad_norm": 0.47869446873664856, "learning_rate": 7.408613735121872e-06, "loss": 0.4022, "step": 2410 }, { "epoch": 2.0306007860752384, "grad_norm": 0.38548538088798523, "learning_rate": 7.406036037155165e-06, "loss": 0.3862, "step": 2411 }, { "epoch": 2.031443009545199, "grad_norm": 0.46422791481018066, "learning_rate": 7.40345750674124e-06, "loss": 0.3632, "step": 2412 }, { "epoch": 2.03228523301516, "grad_norm": 0.47313570976257324, "learning_rate": 7.400878144772229e-06, "loss": 0.4244, "step": 2413 }, { "epoch": 2.0331274564851207, "grad_norm": 0.44441789388656616, "learning_rate": 7.398297952140545e-06, "loss": 0.4195, "step": 2414 }, { "epoch": 2.0339696799550815, "grad_norm": 0.3792380690574646, "learning_rate": 7.395716929738891e-06, "loss": 0.3869, "step": 2415 }, { "epoch": 2.0348119034250423, "grad_norm": 0.44934922456741333, "learning_rate": 7.3931350784602565e-06, "loss": 0.395, "step": 2416 }, { "epoch": 2.035654126895003, "grad_norm": 0.43328016996383667, "learning_rate": 7.390552399197916e-06, "loss": 0.3914, "step": 2417 }, { "epoch": 2.0364963503649633, "grad_norm": 0.3920332193374634, "learning_rate": 7.387968892845434e-06, "loss": 0.397, "step": 2418 }, { "epoch": 2.037338573834924, "grad_norm": 0.4505634605884552, "learning_rate": 7.3853845602966576e-06, "loss": 0.3885, "step": 2419 }, { "epoch": 2.038180797304885, "grad_norm": 0.43093305826187134, "learning_rate": 7.3827994024457265e-06, "loss": 0.3718, "step": 2420 }, { "epoch": 2.0390230207748457, "grad_norm": 0.37646305561065674, "learning_rate": 7.380213420187055e-06, "loss": 0.3956, "step": 2421 }, { "epoch": 2.0398652442448064, "grad_norm": 0.40610471367836, "learning_rate": 7.377626614415352e-06, "loss": 0.3868, "step": 2422 }, { "epoch": 2.040707467714767, "grad_norm": 0.425259530544281, "learning_rate": 7.375038986025605e-06, "loss": 0.3755, "step": 2423 }, { "epoch": 2.0415496911847275, "grad_norm": 0.39944788813591003, "learning_rate": 7.372450535913094e-06, "loss": 0.4006, "step": 2424 }, { "epoch": 2.0423919146546883, "grad_norm": 0.3574522137641907, "learning_rate": 7.3698612649733735e-06, "loss": 0.3756, "step": 2425 }, { "epoch": 2.043234138124649, "grad_norm": 0.36991435289382935, "learning_rate": 7.367271174102291e-06, "loss": 0.3465, "step": 2426 }, { "epoch": 2.04407636159461, "grad_norm": 0.39590391516685486, "learning_rate": 7.364680264195968e-06, "loss": 0.4198, "step": 2427 }, { "epoch": 2.0449185850645706, "grad_norm": 0.35472142696380615, "learning_rate": 7.362088536150821e-06, "loss": 0.3644, "step": 2428 }, { "epoch": 2.0457608085345313, "grad_norm": 0.34791022539138794, "learning_rate": 7.359495990863539e-06, "loss": 0.4089, "step": 2429 }, { "epoch": 2.0466030320044917, "grad_norm": 0.3731924295425415, "learning_rate": 7.356902629231102e-06, "loss": 0.4103, "step": 2430 }, { "epoch": 2.0474452554744524, "grad_norm": 0.3700176179409027, "learning_rate": 7.354308452150763e-06, "loss": 0.3825, "step": 2431 }, { "epoch": 2.048287478944413, "grad_norm": 0.4180849492549896, "learning_rate": 7.351713460520069e-06, "loss": 0.416, "step": 2432 }, { "epoch": 2.049129702414374, "grad_norm": 0.39755895733833313, "learning_rate": 7.34911765523684e-06, "loss": 0.3663, "step": 2433 }, { "epoch": 2.0499719258843347, "grad_norm": 0.4391947388648987, "learning_rate": 7.346521037199179e-06, "loss": 0.3902, "step": 2434 }, { "epoch": 2.0508141493542955, "grad_norm": 0.3880865275859833, "learning_rate": 7.343923607305472e-06, "loss": 0.3949, "step": 2435 }, { "epoch": 2.051656372824256, "grad_norm": 0.36252468824386597, "learning_rate": 7.341325366454387e-06, "loss": 0.3949, "step": 2436 }, { "epoch": 2.0524985962942166, "grad_norm": 0.40459316968917847, "learning_rate": 7.338726315544869e-06, "loss": 0.3981, "step": 2437 }, { "epoch": 2.0533408197641774, "grad_norm": 0.35923492908477783, "learning_rate": 7.336126455476146e-06, "loss": 0.3757, "step": 2438 }, { "epoch": 2.054183043234138, "grad_norm": 0.3258900046348572, "learning_rate": 7.333525787147724e-06, "loss": 0.3567, "step": 2439 }, { "epoch": 2.055025266704099, "grad_norm": 0.37977737188339233, "learning_rate": 7.3309243114593885e-06, "loss": 0.3775, "step": 2440 }, { "epoch": 2.0558674901740597, "grad_norm": 0.4009614884853363, "learning_rate": 7.328322029311209e-06, "loss": 0.4105, "step": 2441 }, { "epoch": 2.05670971364402, "grad_norm": 0.38889390230178833, "learning_rate": 7.325718941603528e-06, "loss": 0.4162, "step": 2442 }, { "epoch": 2.0575519371139808, "grad_norm": 0.40997999906539917, "learning_rate": 7.323115049236971e-06, "loss": 0.3809, "step": 2443 }, { "epoch": 2.0583941605839415, "grad_norm": 0.4038620889186859, "learning_rate": 7.320510353112435e-06, "loss": 0.3941, "step": 2444 }, { "epoch": 2.0592363840539023, "grad_norm": 0.36055174469947815, "learning_rate": 7.3179048541311096e-06, "loss": 0.3435, "step": 2445 }, { "epoch": 2.060078607523863, "grad_norm": 0.406767874956131, "learning_rate": 7.315298553194443e-06, "loss": 0.394, "step": 2446 }, { "epoch": 2.060920830993824, "grad_norm": 0.40554094314575195, "learning_rate": 7.312691451204178e-06, "loss": 0.389, "step": 2447 }, { "epoch": 2.0617630544637846, "grad_norm": 0.4170484244823456, "learning_rate": 7.310083549062321e-06, "loss": 0.4103, "step": 2448 }, { "epoch": 2.062605277933745, "grad_norm": 0.3989139497280121, "learning_rate": 7.3074748476711676e-06, "loss": 0.4155, "step": 2449 }, { "epoch": 2.0634475014037057, "grad_norm": 0.4155064821243286, "learning_rate": 7.304865347933279e-06, "loss": 0.3907, "step": 2450 }, { "epoch": 2.0642897248736665, "grad_norm": 0.4234904646873474, "learning_rate": 7.302255050751499e-06, "loss": 0.3779, "step": 2451 }, { "epoch": 2.0651319483436272, "grad_norm": 0.3691975474357605, "learning_rate": 7.299643957028945e-06, "loss": 0.3892, "step": 2452 }, { "epoch": 2.065974171813588, "grad_norm": 0.42901161313056946, "learning_rate": 7.297032067669013e-06, "loss": 0.3932, "step": 2453 }, { "epoch": 2.0668163952835488, "grad_norm": 0.35975411534309387, "learning_rate": 7.2944193835753705e-06, "loss": 0.3449, "step": 2454 }, { "epoch": 2.067658618753509, "grad_norm": 0.4861431121826172, "learning_rate": 7.291805905651961e-06, "loss": 0.4238, "step": 2455 }, { "epoch": 2.06850084222347, "grad_norm": 0.36864593625068665, "learning_rate": 7.289191634803002e-06, "loss": 0.3695, "step": 2456 }, { "epoch": 2.0693430656934306, "grad_norm": 0.4183270037174225, "learning_rate": 7.286576571932989e-06, "loss": 0.3574, "step": 2457 }, { "epoch": 2.0701852891633914, "grad_norm": 0.3897833824157715, "learning_rate": 7.283960717946686e-06, "loss": 0.4006, "step": 2458 }, { "epoch": 2.071027512633352, "grad_norm": 0.39834341406822205, "learning_rate": 7.281344073749137e-06, "loss": 0.3645, "step": 2459 }, { "epoch": 2.071869736103313, "grad_norm": 0.3589646816253662, "learning_rate": 7.2787266402456535e-06, "loss": 0.4015, "step": 2460 }, { "epoch": 2.0727119595732733, "grad_norm": 0.44179725646972656, "learning_rate": 7.276108418341823e-06, "loss": 0.3919, "step": 2461 }, { "epoch": 2.073554183043234, "grad_norm": 0.36506709456443787, "learning_rate": 7.273489408943506e-06, "loss": 0.3491, "step": 2462 }, { "epoch": 2.074396406513195, "grad_norm": 0.3806409239768982, "learning_rate": 7.270869612956834e-06, "loss": 0.4096, "step": 2463 }, { "epoch": 2.0752386299831556, "grad_norm": 0.43002256751060486, "learning_rate": 7.268249031288214e-06, "loss": 0.4, "step": 2464 }, { "epoch": 2.0760808534531163, "grad_norm": 0.4376688301563263, "learning_rate": 7.265627664844319e-06, "loss": 0.3881, "step": 2465 }, { "epoch": 2.076923076923077, "grad_norm": 0.3733011484146118, "learning_rate": 7.2630055145320985e-06, "loss": 0.3641, "step": 2466 }, { "epoch": 2.0777653003930374, "grad_norm": 0.4685676097869873, "learning_rate": 7.260382581258771e-06, "loss": 0.3706, "step": 2467 }, { "epoch": 2.078607523862998, "grad_norm": 0.42101266980171204, "learning_rate": 7.257758865931827e-06, "loss": 0.3744, "step": 2468 }, { "epoch": 2.079449747332959, "grad_norm": 0.39881131052970886, "learning_rate": 7.255134369459027e-06, "loss": 0.4282, "step": 2469 }, { "epoch": 2.0802919708029197, "grad_norm": 0.4599214792251587, "learning_rate": 7.252509092748401e-06, "loss": 0.3672, "step": 2470 }, { "epoch": 2.0811341942728805, "grad_norm": 0.41449546813964844, "learning_rate": 7.2498830367082505e-06, "loss": 0.3921, "step": 2471 }, { "epoch": 2.0819764177428413, "grad_norm": 0.39603298902511597, "learning_rate": 7.247256202247145e-06, "loss": 0.3996, "step": 2472 }, { "epoch": 2.0828186412128016, "grad_norm": 0.3422272205352783, "learning_rate": 7.244628590273927e-06, "loss": 0.3431, "step": 2473 }, { "epoch": 2.0836608646827623, "grad_norm": 0.4477284848690033, "learning_rate": 7.2420002016977e-06, "loss": 0.4133, "step": 2474 }, { "epoch": 2.084503088152723, "grad_norm": 0.42314767837524414, "learning_rate": 7.239371037427847e-06, "loss": 0.3735, "step": 2475 }, { "epoch": 2.085345311622684, "grad_norm": 0.35468557476997375, "learning_rate": 7.2367410983740086e-06, "loss": 0.3847, "step": 2476 }, { "epoch": 2.0861875350926447, "grad_norm": 0.433987557888031, "learning_rate": 7.234110385446104e-06, "loss": 0.3665, "step": 2477 }, { "epoch": 2.0870297585626054, "grad_norm": 0.40800195932388306, "learning_rate": 7.231478899554309e-06, "loss": 0.3813, "step": 2478 }, { "epoch": 2.087871982032566, "grad_norm": 0.38020676374435425, "learning_rate": 7.228846641609077e-06, "loss": 0.3857, "step": 2479 }, { "epoch": 2.0887142055025265, "grad_norm": 0.4243893027305603, "learning_rate": 7.226213612521123e-06, "loss": 0.4293, "step": 2480 }, { "epoch": 2.0895564289724873, "grad_norm": 0.4084448218345642, "learning_rate": 7.223579813201429e-06, "loss": 0.3525, "step": 2481 }, { "epoch": 2.090398652442448, "grad_norm": 0.4399142861366272, "learning_rate": 7.220945244561245e-06, "loss": 0.3959, "step": 2482 }, { "epoch": 2.091240875912409, "grad_norm": 0.38214874267578125, "learning_rate": 7.218309907512088e-06, "loss": 0.3591, "step": 2483 }, { "epoch": 2.0920830993823696, "grad_norm": 0.4242407977581024, "learning_rate": 7.215673802965735e-06, "loss": 0.3711, "step": 2484 }, { "epoch": 2.0929253228523303, "grad_norm": 0.3788277506828308, "learning_rate": 7.213036931834237e-06, "loss": 0.3497, "step": 2485 }, { "epoch": 2.0937675463222907, "grad_norm": 0.4649195969104767, "learning_rate": 7.210399295029906e-06, "loss": 0.3824, "step": 2486 }, { "epoch": 2.0946097697922514, "grad_norm": 0.42837047576904297, "learning_rate": 7.2077608934653186e-06, "loss": 0.4171, "step": 2487 }, { "epoch": 2.095451993262212, "grad_norm": 0.43380725383758545, "learning_rate": 7.205121728053315e-06, "loss": 0.363, "step": 2488 }, { "epoch": 2.096294216732173, "grad_norm": 0.4749314486980438, "learning_rate": 7.202481799707003e-06, "loss": 0.3723, "step": 2489 }, { "epoch": 2.0971364402021337, "grad_norm": 0.4153778553009033, "learning_rate": 7.199841109339752e-06, "loss": 0.373, "step": 2490 }, { "epoch": 2.0979786636720945, "grad_norm": 0.481983482837677, "learning_rate": 7.197199657865194e-06, "loss": 0.4701, "step": 2491 }, { "epoch": 2.098820887142055, "grad_norm": 0.3588376045227051, "learning_rate": 7.1945574461972305e-06, "loss": 0.3304, "step": 2492 }, { "epoch": 2.0996631106120156, "grad_norm": 0.37522268295288086, "learning_rate": 7.191914475250015e-06, "loss": 0.3542, "step": 2493 }, { "epoch": 2.1005053340819764, "grad_norm": 0.44664663076400757, "learning_rate": 7.189270745937976e-06, "loss": 0.4055, "step": 2494 }, { "epoch": 2.101347557551937, "grad_norm": 0.3918112516403198, "learning_rate": 7.186626259175794e-06, "loss": 0.3929, "step": 2495 }, { "epoch": 2.102189781021898, "grad_norm": 0.4125455617904663, "learning_rate": 7.183981015878419e-06, "loss": 0.3624, "step": 2496 }, { "epoch": 2.1030320044918587, "grad_norm": 0.41717851161956787, "learning_rate": 7.181335016961058e-06, "loss": 0.4009, "step": 2497 }, { "epoch": 2.103874227961819, "grad_norm": 0.3854632079601288, "learning_rate": 7.178688263339184e-06, "loss": 0.3476, "step": 2498 }, { "epoch": 2.1047164514317798, "grad_norm": 0.41728320717811584, "learning_rate": 7.176040755928525e-06, "loss": 0.4465, "step": 2499 }, { "epoch": 2.1055586749017405, "grad_norm": 0.4977567493915558, "learning_rate": 7.173392495645076e-06, "loss": 0.3984, "step": 2500 }, { "epoch": 2.1064008983717013, "grad_norm": 0.4031788408756256, "learning_rate": 7.170743483405087e-06, "loss": 0.3825, "step": 2501 }, { "epoch": 2.107243121841662, "grad_norm": 0.44569268822669983, "learning_rate": 7.168093720125073e-06, "loss": 0.3725, "step": 2502 }, { "epoch": 2.108085345311623, "grad_norm": 0.44907504320144653, "learning_rate": 7.165443206721805e-06, "loss": 0.3764, "step": 2503 }, { "epoch": 2.108927568781583, "grad_norm": 0.40030637383461, "learning_rate": 7.162791944112317e-06, "loss": 0.3714, "step": 2504 }, { "epoch": 2.109769792251544, "grad_norm": 0.44143038988113403, "learning_rate": 7.160139933213899e-06, "loss": 0.4103, "step": 2505 }, { "epoch": 2.1106120157215047, "grad_norm": 0.48231345415115356, "learning_rate": 7.157487174944101e-06, "loss": 0.3656, "step": 2506 }, { "epoch": 2.1114542391914655, "grad_norm": 0.45457252860069275, "learning_rate": 7.1548336702207334e-06, "loss": 0.3996, "step": 2507 }, { "epoch": 2.1122964626614262, "grad_norm": 0.3509310185909271, "learning_rate": 7.1521794199618625e-06, "loss": 0.3901, "step": 2508 }, { "epoch": 2.113138686131387, "grad_norm": 0.5324302911758423, "learning_rate": 7.149524425085813e-06, "loss": 0.4154, "step": 2509 }, { "epoch": 2.1139809096013478, "grad_norm": 0.4085950255393982, "learning_rate": 7.146868686511168e-06, "loss": 0.3707, "step": 2510 }, { "epoch": 2.114823133071308, "grad_norm": 0.36873355507850647, "learning_rate": 7.144212205156768e-06, "loss": 0.3886, "step": 2511 }, { "epoch": 2.115665356541269, "grad_norm": 0.3781096041202545, "learning_rate": 7.14155498194171e-06, "loss": 0.3671, "step": 2512 }, { "epoch": 2.1165075800112296, "grad_norm": 0.4123195707798004, "learning_rate": 7.138897017785345e-06, "loss": 0.4009, "step": 2513 }, { "epoch": 2.1173498034811904, "grad_norm": 0.421730637550354, "learning_rate": 7.136238313607288e-06, "loss": 0.4238, "step": 2514 }, { "epoch": 2.118192026951151, "grad_norm": 0.3866935968399048, "learning_rate": 7.1335788703274e-06, "loss": 0.4041, "step": 2515 }, { "epoch": 2.119034250421112, "grad_norm": 0.4026096761226654, "learning_rate": 7.130918688865806e-06, "loss": 0.3535, "step": 2516 }, { "epoch": 2.1198764738910723, "grad_norm": 0.4044872224330902, "learning_rate": 7.1282577701428825e-06, "loss": 0.4158, "step": 2517 }, { "epoch": 2.120718697361033, "grad_norm": 0.4152929484844208, "learning_rate": 7.125596115079261e-06, "loss": 0.3916, "step": 2518 }, { "epoch": 2.121560920830994, "grad_norm": 0.3990342617034912, "learning_rate": 7.12293372459583e-06, "loss": 0.4089, "step": 2519 }, { "epoch": 2.1224031443009546, "grad_norm": 0.40316280722618103, "learning_rate": 7.120270599613728e-06, "loss": 0.3937, "step": 2520 }, { "epoch": 2.1232453677709153, "grad_norm": 0.45258694887161255, "learning_rate": 7.117606741054353e-06, "loss": 0.3826, "step": 2521 }, { "epoch": 2.124087591240876, "grad_norm": 0.4364733397960663, "learning_rate": 7.114942149839355e-06, "loss": 0.3848, "step": 2522 }, { "epoch": 2.1249298147108364, "grad_norm": 0.3887994587421417, "learning_rate": 7.112276826890636e-06, "loss": 0.3902, "step": 2523 }, { "epoch": 2.125772038180797, "grad_norm": 0.4687117040157318, "learning_rate": 7.109610773130351e-06, "loss": 0.4123, "step": 2524 }, { "epoch": 2.126614261650758, "grad_norm": 0.40977513790130615, "learning_rate": 7.106943989480909e-06, "loss": 0.4134, "step": 2525 }, { "epoch": 2.1274564851207187, "grad_norm": 0.4248538315296173, "learning_rate": 7.1042764768649745e-06, "loss": 0.3626, "step": 2526 }, { "epoch": 2.1282987085906795, "grad_norm": 0.3860372304916382, "learning_rate": 7.101608236205456e-06, "loss": 0.4007, "step": 2527 }, { "epoch": 2.1291409320606403, "grad_norm": 0.35392144322395325, "learning_rate": 7.098939268425524e-06, "loss": 0.3442, "step": 2528 }, { "epoch": 2.1299831555306006, "grad_norm": 0.5112526416778564, "learning_rate": 7.096269574448591e-06, "loss": 0.4115, "step": 2529 }, { "epoch": 2.1308253790005613, "grad_norm": 0.3541620671749115, "learning_rate": 7.093599155198331e-06, "loss": 0.3662, "step": 2530 }, { "epoch": 2.131667602470522, "grad_norm": 0.5046049356460571, "learning_rate": 7.090928011598657e-06, "loss": 0.3592, "step": 2531 }, { "epoch": 2.132509825940483, "grad_norm": 0.4362925589084625, "learning_rate": 7.088256144573744e-06, "loss": 0.4183, "step": 2532 }, { "epoch": 2.1333520494104437, "grad_norm": 0.4963783323764801, "learning_rate": 7.085583555048008e-06, "loss": 0.4065, "step": 2533 }, { "epoch": 2.1341942728804044, "grad_norm": 0.46172335743904114, "learning_rate": 7.082910243946124e-06, "loss": 0.3581, "step": 2534 }, { "epoch": 2.1350364963503647, "grad_norm": 0.36987221240997314, "learning_rate": 7.080236212193006e-06, "loss": 0.3793, "step": 2535 }, { "epoch": 2.1358787198203255, "grad_norm": 0.43611493706703186, "learning_rate": 7.077561460713829e-06, "loss": 0.4152, "step": 2536 }, { "epoch": 2.1367209432902863, "grad_norm": 0.40720897912979126, "learning_rate": 7.074885990434006e-06, "loss": 0.3838, "step": 2537 }, { "epoch": 2.137563166760247, "grad_norm": 0.3740392327308655, "learning_rate": 7.072209802279206e-06, "loss": 0.3683, "step": 2538 }, { "epoch": 2.138405390230208, "grad_norm": 0.39631181955337524, "learning_rate": 7.069532897175346e-06, "loss": 0.37, "step": 2539 }, { "epoch": 2.1392476137001686, "grad_norm": 0.36645010113716125, "learning_rate": 7.0668552760485875e-06, "loss": 0.3689, "step": 2540 }, { "epoch": 2.1400898371701293, "grad_norm": 0.40702709555625916, "learning_rate": 7.06417693982534e-06, "loss": 0.408, "step": 2541 }, { "epoch": 2.1409320606400897, "grad_norm": 0.37797820568084717, "learning_rate": 7.061497889432265e-06, "loss": 0.4048, "step": 2542 }, { "epoch": 2.1417742841100504, "grad_norm": 0.3425788879394531, "learning_rate": 7.0588181257962665e-06, "loss": 0.354, "step": 2543 }, { "epoch": 2.142616507580011, "grad_norm": 0.4113151431083679, "learning_rate": 7.056137649844497e-06, "loss": 0.4226, "step": 2544 }, { "epoch": 2.143458731049972, "grad_norm": 0.3955594003200531, "learning_rate": 7.053456462504354e-06, "loss": 0.3791, "step": 2545 }, { "epoch": 2.1443009545199327, "grad_norm": 0.4093857705593109, "learning_rate": 7.050774564703483e-06, "loss": 0.406, "step": 2546 }, { "epoch": 2.1451431779898935, "grad_norm": 0.4166509509086609, "learning_rate": 7.048091957369777e-06, "loss": 0.367, "step": 2547 }, { "epoch": 2.145985401459854, "grad_norm": 0.4973333179950714, "learning_rate": 7.045408641431368e-06, "loss": 0.4247, "step": 2548 }, { "epoch": 2.1468276249298146, "grad_norm": 0.3445702791213989, "learning_rate": 7.042724617816642e-06, "loss": 0.3702, "step": 2549 }, { "epoch": 2.1476698483997754, "grad_norm": 0.4653063118457794, "learning_rate": 7.04003988745422e-06, "loss": 0.4152, "step": 2550 }, { "epoch": 2.148512071869736, "grad_norm": 0.4295486509799957, "learning_rate": 7.037354451272977e-06, "loss": 0.3675, "step": 2551 }, { "epoch": 2.149354295339697, "grad_norm": 0.36744561791419983, "learning_rate": 7.034668310202026e-06, "loss": 0.4038, "step": 2552 }, { "epoch": 2.1501965188096577, "grad_norm": 0.4995728135108948, "learning_rate": 7.031981465170726e-06, "loss": 0.3927, "step": 2553 }, { "epoch": 2.151038742279618, "grad_norm": 0.3628796339035034, "learning_rate": 7.029293917108678e-06, "loss": 0.3327, "step": 2554 }, { "epoch": 2.1518809657495788, "grad_norm": 0.48484113812446594, "learning_rate": 7.0266056669457316e-06, "loss": 0.4338, "step": 2555 }, { "epoch": 2.1527231892195395, "grad_norm": 0.43785569071769714, "learning_rate": 7.023916715611969e-06, "loss": 0.3943, "step": 2556 }, { "epoch": 2.1535654126895003, "grad_norm": 0.3583446741104126, "learning_rate": 7.021227064037727e-06, "loss": 0.3666, "step": 2557 }, { "epoch": 2.154407636159461, "grad_norm": 0.3986658453941345, "learning_rate": 7.018536713153577e-06, "loss": 0.3915, "step": 2558 }, { "epoch": 2.155249859629422, "grad_norm": 0.3994770050048828, "learning_rate": 7.0158456638903315e-06, "loss": 0.3873, "step": 2559 }, { "epoch": 2.156092083099382, "grad_norm": 0.4067269563674927, "learning_rate": 7.013153917179052e-06, "loss": 0.4027, "step": 2560 }, { "epoch": 2.156934306569343, "grad_norm": 0.3288009464740753, "learning_rate": 7.010461473951034e-06, "loss": 0.3254, "step": 2561 }, { "epoch": 2.1577765300393037, "grad_norm": 0.39667031168937683, "learning_rate": 7.0077683351378166e-06, "loss": 0.4086, "step": 2562 }, { "epoch": 2.1586187535092645, "grad_norm": 0.42918136715888977, "learning_rate": 7.005074501671181e-06, "loss": 0.4231, "step": 2563 }, { "epoch": 2.1594609769792252, "grad_norm": 0.357415109872818, "learning_rate": 7.002379974483146e-06, "loss": 0.3786, "step": 2564 }, { "epoch": 2.160303200449186, "grad_norm": 0.3756013810634613, "learning_rate": 6.999684754505973e-06, "loss": 0.4127, "step": 2565 }, { "epoch": 2.1611454239191463, "grad_norm": 0.3662298917770386, "learning_rate": 6.996988842672161e-06, "loss": 0.3601, "step": 2566 }, { "epoch": 2.161987647389107, "grad_norm": 0.42911261320114136, "learning_rate": 6.9942922399144504e-06, "loss": 0.4025, "step": 2567 }, { "epoch": 2.162829870859068, "grad_norm": 0.42383164167404175, "learning_rate": 6.991594947165818e-06, "loss": 0.3679, "step": 2568 }, { "epoch": 2.1636720943290286, "grad_norm": 0.37146803736686707, "learning_rate": 6.988896965359482e-06, "loss": 0.401, "step": 2569 }, { "epoch": 2.1645143177989894, "grad_norm": 0.3919735848903656, "learning_rate": 6.986198295428897e-06, "loss": 0.389, "step": 2570 }, { "epoch": 2.16535654126895, "grad_norm": 0.41430115699768066, "learning_rate": 6.983498938307758e-06, "loss": 0.3443, "step": 2571 }, { "epoch": 2.166198764738911, "grad_norm": 0.4601540267467499, "learning_rate": 6.9807988949299945e-06, "loss": 0.4161, "step": 2572 }, { "epoch": 2.1670409882088713, "grad_norm": 0.40126731991767883, "learning_rate": 6.978098166229777e-06, "loss": 0.3482, "step": 2573 }, { "epoch": 2.167883211678832, "grad_norm": 0.3807869553565979, "learning_rate": 6.975396753141509e-06, "loss": 0.3784, "step": 2574 }, { "epoch": 2.168725435148793, "grad_norm": 0.4212631583213806, "learning_rate": 6.972694656599834e-06, "loss": 0.4034, "step": 2575 }, { "epoch": 2.1695676586187536, "grad_norm": 0.44910484552383423, "learning_rate": 6.969991877539634e-06, "loss": 0.4415, "step": 2576 }, { "epoch": 2.1704098820887143, "grad_norm": 0.36208871006965637, "learning_rate": 6.967288416896018e-06, "loss": 0.3716, "step": 2577 }, { "epoch": 2.171252105558675, "grad_norm": 0.4024408161640167, "learning_rate": 6.964584275604343e-06, "loss": 0.4134, "step": 2578 }, { "epoch": 2.1720943290286354, "grad_norm": 0.4170951843261719, "learning_rate": 6.961879454600194e-06, "loss": 0.3743, "step": 2579 }, { "epoch": 2.172936552498596, "grad_norm": 0.4101586937904358, "learning_rate": 6.95917395481939e-06, "loss": 0.3849, "step": 2580 }, { "epoch": 2.173778775968557, "grad_norm": 0.35136982798576355, "learning_rate": 6.956467777197992e-06, "loss": 0.3861, "step": 2581 }, { "epoch": 2.1746209994385177, "grad_norm": 0.4743027687072754, "learning_rate": 6.9537609226722855e-06, "loss": 0.381, "step": 2582 }, { "epoch": 2.1754632229084785, "grad_norm": 0.5095626711845398, "learning_rate": 6.951053392178802e-06, "loss": 0.4225, "step": 2583 }, { "epoch": 2.1763054463784393, "grad_norm": 0.3821885883808136, "learning_rate": 6.948345186654294e-06, "loss": 0.4019, "step": 2584 }, { "epoch": 2.1771476698483996, "grad_norm": 0.5870478749275208, "learning_rate": 6.9456363070357614e-06, "loss": 0.4382, "step": 2585 }, { "epoch": 2.1779898933183603, "grad_norm": 0.4033108055591583, "learning_rate": 6.942926754260423e-06, "loss": 0.3866, "step": 2586 }, { "epoch": 2.178832116788321, "grad_norm": 0.3484478294849396, "learning_rate": 6.940216529265743e-06, "loss": 0.3654, "step": 2587 }, { "epoch": 2.179674340258282, "grad_norm": 0.4612495005130768, "learning_rate": 6.937505632989408e-06, "loss": 0.4102, "step": 2588 }, { "epoch": 2.1805165637282427, "grad_norm": 0.446803480386734, "learning_rate": 6.934794066369348e-06, "loss": 0.3985, "step": 2589 }, { "epoch": 2.1813587871982034, "grad_norm": 0.3514085114002228, "learning_rate": 6.932081830343712e-06, "loss": 0.3374, "step": 2590 }, { "epoch": 2.182201010668164, "grad_norm": 0.37717437744140625, "learning_rate": 6.92936892585089e-06, "loss": 0.3751, "step": 2591 }, { "epoch": 2.1830432341381245, "grad_norm": 0.43044930696487427, "learning_rate": 6.926655353829502e-06, "loss": 0.4082, "step": 2592 }, { "epoch": 2.1838854576080853, "grad_norm": 0.4174370765686035, "learning_rate": 6.923941115218396e-06, "loss": 0.3816, "step": 2593 }, { "epoch": 2.184727681078046, "grad_norm": 0.414310097694397, "learning_rate": 6.921226210956651e-06, "loss": 0.4296, "step": 2594 }, { "epoch": 2.185569904548007, "grad_norm": 0.3821665346622467, "learning_rate": 6.918510641983578e-06, "loss": 0.3653, "step": 2595 }, { "epoch": 2.1864121280179676, "grad_norm": 0.4104475975036621, "learning_rate": 6.915794409238719e-06, "loss": 0.3864, "step": 2596 }, { "epoch": 2.187254351487928, "grad_norm": 0.4231337606906891, "learning_rate": 6.913077513661841e-06, "loss": 0.3625, "step": 2597 }, { "epoch": 2.1880965749578887, "grad_norm": 0.43927380442619324, "learning_rate": 6.910359956192945e-06, "loss": 0.3838, "step": 2598 }, { "epoch": 2.1889387984278494, "grad_norm": 0.5182895064353943, "learning_rate": 6.907641737772258e-06, "loss": 0.4123, "step": 2599 }, { "epoch": 2.18978102189781, "grad_norm": 0.44504475593566895, "learning_rate": 6.904922859340241e-06, "loss": 0.4107, "step": 2600 }, { "epoch": 2.190623245367771, "grad_norm": 0.3719852864742279, "learning_rate": 6.902203321837573e-06, "loss": 0.3606, "step": 2601 }, { "epoch": 2.1914654688377317, "grad_norm": 0.44359180331230164, "learning_rate": 6.899483126205173e-06, "loss": 0.3744, "step": 2602 }, { "epoch": 2.1923076923076925, "grad_norm": 0.4753892421722412, "learning_rate": 6.896762273384179e-06, "loss": 0.3782, "step": 2603 }, { "epoch": 2.193149915777653, "grad_norm": 0.4548233449459076, "learning_rate": 6.8940407643159615e-06, "loss": 0.4367, "step": 2604 }, { "epoch": 2.1939921392476136, "grad_norm": 0.42085233330726624, "learning_rate": 6.891318599942111e-06, "loss": 0.4029, "step": 2605 }, { "epoch": 2.1948343627175744, "grad_norm": 0.47119858860969543, "learning_rate": 6.888595781204457e-06, "loss": 0.4236, "step": 2606 }, { "epoch": 2.195676586187535, "grad_norm": 0.3679356873035431, "learning_rate": 6.885872309045042e-06, "loss": 0.3725, "step": 2607 }, { "epoch": 2.196518809657496, "grad_norm": 0.41516584157943726, "learning_rate": 6.883148184406145e-06, "loss": 0.4031, "step": 2608 }, { "epoch": 2.1973610331274567, "grad_norm": 0.4914696216583252, "learning_rate": 6.880423408230264e-06, "loss": 0.388, "step": 2609 }, { "epoch": 2.198203256597417, "grad_norm": 0.38150885701179504, "learning_rate": 6.877697981460125e-06, "loss": 0.371, "step": 2610 }, { "epoch": 2.1990454800673778, "grad_norm": 0.46134328842163086, "learning_rate": 6.874971905038678e-06, "loss": 0.3777, "step": 2611 }, { "epoch": 2.1998877035373385, "grad_norm": 0.4368671774864197, "learning_rate": 6.8722451799091015e-06, "loss": 0.3567, "step": 2612 }, { "epoch": 2.2007299270072993, "grad_norm": 0.45665931701660156, "learning_rate": 6.869517807014794e-06, "loss": 0.4154, "step": 2613 }, { "epoch": 2.20157215047726, "grad_norm": 0.4420725405216217, "learning_rate": 6.866789787299379e-06, "loss": 0.3673, "step": 2614 }, { "epoch": 2.202414373947221, "grad_norm": 0.4349493086338043, "learning_rate": 6.864061121706707e-06, "loss": 0.3745, "step": 2615 }, { "epoch": 2.203256597417181, "grad_norm": 0.4215381145477295, "learning_rate": 6.861331811180847e-06, "loss": 0.3992, "step": 2616 }, { "epoch": 2.204098820887142, "grad_norm": 0.47482624650001526, "learning_rate": 6.858601856666095e-06, "loss": 0.3709, "step": 2617 }, { "epoch": 2.2049410443571027, "grad_norm": 0.382830947637558, "learning_rate": 6.8558712591069675e-06, "loss": 0.3704, "step": 2618 }, { "epoch": 2.2057832678270635, "grad_norm": 0.4734341502189636, "learning_rate": 6.853140019448206e-06, "loss": 0.3889, "step": 2619 }, { "epoch": 2.2066254912970242, "grad_norm": 0.4439683258533478, "learning_rate": 6.85040813863477e-06, "loss": 0.4024, "step": 2620 }, { "epoch": 2.207467714766985, "grad_norm": 0.42917972803115845, "learning_rate": 6.847675617611846e-06, "loss": 0.3982, "step": 2621 }, { "epoch": 2.2083099382369458, "grad_norm": 0.3831932246685028, "learning_rate": 6.844942457324838e-06, "loss": 0.3649, "step": 2622 }, { "epoch": 2.209152161706906, "grad_norm": 0.4541678726673126, "learning_rate": 6.842208658719373e-06, "loss": 0.3873, "step": 2623 }, { "epoch": 2.209994385176867, "grad_norm": 0.4559709429740906, "learning_rate": 6.839474222741299e-06, "loss": 0.3884, "step": 2624 }, { "epoch": 2.2108366086468276, "grad_norm": 0.4740651845932007, "learning_rate": 6.836739150336683e-06, "loss": 0.415, "step": 2625 }, { "epoch": 2.2116788321167884, "grad_norm": 0.43982240557670593, "learning_rate": 6.834003442451814e-06, "loss": 0.4018, "step": 2626 }, { "epoch": 2.212521055586749, "grad_norm": 0.3973946273326874, "learning_rate": 6.831267100033198e-06, "loss": 0.3633, "step": 2627 }, { "epoch": 2.2133632790567095, "grad_norm": 0.4857565760612488, "learning_rate": 6.828530124027566e-06, "loss": 0.3892, "step": 2628 }, { "epoch": 2.2142055025266703, "grad_norm": 0.43514126539230347, "learning_rate": 6.825792515381863e-06, "loss": 0.377, "step": 2629 }, { "epoch": 2.215047725996631, "grad_norm": 0.4195200800895691, "learning_rate": 6.823054275043254e-06, "loss": 0.3517, "step": 2630 }, { "epoch": 2.215889949466592, "grad_norm": 0.5022574663162231, "learning_rate": 6.820315403959123e-06, "loss": 0.4358, "step": 2631 }, { "epoch": 2.2167321729365526, "grad_norm": 0.47917354106903076, "learning_rate": 6.817575903077075e-06, "loss": 0.3857, "step": 2632 }, { "epoch": 2.2175743964065133, "grad_norm": 0.40639790892601013, "learning_rate": 6.814835773344926e-06, "loss": 0.3823, "step": 2633 }, { "epoch": 2.218416619876474, "grad_norm": 0.46938493847846985, "learning_rate": 6.812095015710719e-06, "loss": 0.3826, "step": 2634 }, { "epoch": 2.2192588433464344, "grad_norm": 0.44563955068588257, "learning_rate": 6.809353631122705e-06, "loss": 0.3549, "step": 2635 }, { "epoch": 2.220101066816395, "grad_norm": 0.4610142111778259, "learning_rate": 6.806611620529359e-06, "loss": 0.4143, "step": 2636 }, { "epoch": 2.220943290286356, "grad_norm": 0.38234102725982666, "learning_rate": 6.803868984879367e-06, "loss": 0.3826, "step": 2637 }, { "epoch": 2.2217855137563167, "grad_norm": 0.3732486665248871, "learning_rate": 6.8011257251216365e-06, "loss": 0.3796, "step": 2638 }, { "epoch": 2.2226277372262775, "grad_norm": 0.421195387840271, "learning_rate": 6.798381842205285e-06, "loss": 0.3696, "step": 2639 }, { "epoch": 2.2234699606962383, "grad_norm": 0.4038892388343811, "learning_rate": 6.795637337079653e-06, "loss": 0.3471, "step": 2640 }, { "epoch": 2.2243121841661986, "grad_norm": 0.3774791359901428, "learning_rate": 6.792892210694289e-06, "loss": 0.387, "step": 2641 }, { "epoch": 2.2251544076361593, "grad_norm": 0.38521677255630493, "learning_rate": 6.790146463998961e-06, "loss": 0.4032, "step": 2642 }, { "epoch": 2.22599663110612, "grad_norm": 0.38268768787384033, "learning_rate": 6.787400097943652e-06, "loss": 0.3851, "step": 2643 }, { "epoch": 2.226838854576081, "grad_norm": 0.3591472804546356, "learning_rate": 6.784653113478554e-06, "loss": 0.3597, "step": 2644 }, { "epoch": 2.2276810780460417, "grad_norm": 0.35263994336128235, "learning_rate": 6.7819055115540786e-06, "loss": 0.3792, "step": 2645 }, { "epoch": 2.2285233015160024, "grad_norm": 0.36026713252067566, "learning_rate": 6.779157293120848e-06, "loss": 0.3993, "step": 2646 }, { "epoch": 2.2293655249859627, "grad_norm": 0.37047407031059265, "learning_rate": 6.7764084591297006e-06, "loss": 0.3706, "step": 2647 }, { "epoch": 2.2302077484559235, "grad_norm": 0.3749423027038574, "learning_rate": 6.773659010531684e-06, "loss": 0.3725, "step": 2648 }, { "epoch": 2.2310499719258843, "grad_norm": 0.35925862193107605, "learning_rate": 6.770908948278061e-06, "loss": 0.3485, "step": 2649 }, { "epoch": 2.231892195395845, "grad_norm": 0.4112299084663391, "learning_rate": 6.768158273320306e-06, "loss": 0.3814, "step": 2650 }, { "epoch": 2.232734418865806, "grad_norm": 0.4206498861312866, "learning_rate": 6.765406986610104e-06, "loss": 0.3917, "step": 2651 }, { "epoch": 2.2335766423357666, "grad_norm": 0.429951936006546, "learning_rate": 6.762655089099353e-06, "loss": 0.3712, "step": 2652 }, { "epoch": 2.2344188658057273, "grad_norm": 0.4262179732322693, "learning_rate": 6.759902581740167e-06, "loss": 0.4166, "step": 2653 }, { "epoch": 2.2352610892756877, "grad_norm": 0.3434206545352936, "learning_rate": 6.7571494654848605e-06, "loss": 0.3601, "step": 2654 }, { "epoch": 2.2361033127456484, "grad_norm": 0.39816612005233765, "learning_rate": 6.754395741285968e-06, "loss": 0.4011, "step": 2655 }, { "epoch": 2.236945536215609, "grad_norm": 0.5071439146995544, "learning_rate": 6.751641410096228e-06, "loss": 0.408, "step": 2656 }, { "epoch": 2.23778775968557, "grad_norm": 0.33378866314888, "learning_rate": 6.748886472868597e-06, "loss": 0.3494, "step": 2657 }, { "epoch": 2.2386299831555307, "grad_norm": 0.45501789450645447, "learning_rate": 6.7461309305562294e-06, "loss": 0.4054, "step": 2658 }, { "epoch": 2.239472206625491, "grad_norm": 0.40748968720436096, "learning_rate": 6.743374784112501e-06, "loss": 0.4006, "step": 2659 }, { "epoch": 2.240314430095452, "grad_norm": 0.413161039352417, "learning_rate": 6.740618034490986e-06, "loss": 0.4038, "step": 2660 }, { "epoch": 2.2411566535654126, "grad_norm": 0.41916894912719727, "learning_rate": 6.7378606826454795e-06, "loss": 0.3851, "step": 2661 }, { "epoch": 2.2419988770353734, "grad_norm": 0.3534872531890869, "learning_rate": 6.735102729529971e-06, "loss": 0.386, "step": 2662 }, { "epoch": 2.242841100505334, "grad_norm": 0.36810004711151123, "learning_rate": 6.732344176098671e-06, "loss": 0.3878, "step": 2663 }, { "epoch": 2.243683323975295, "grad_norm": 0.3748600482940674, "learning_rate": 6.729585023305986e-06, "loss": 0.3392, "step": 2664 }, { "epoch": 2.2445255474452557, "grad_norm": 0.44931143522262573, "learning_rate": 6.726825272106539e-06, "loss": 0.3793, "step": 2665 }, { "epoch": 2.245367770915216, "grad_norm": 0.368571400642395, "learning_rate": 6.724064923455156e-06, "loss": 0.3665, "step": 2666 }, { "epoch": 2.2462099943851768, "grad_norm": 0.3748553991317749, "learning_rate": 6.72130397830687e-06, "loss": 0.4219, "step": 2667 }, { "epoch": 2.2470522178551375, "grad_norm": 0.40337809920310974, "learning_rate": 6.718542437616922e-06, "loss": 0.3755, "step": 2668 }, { "epoch": 2.2478944413250983, "grad_norm": 0.4169621765613556, "learning_rate": 6.7157803023407555e-06, "loss": 0.3944, "step": 2669 }, { "epoch": 2.248736664795059, "grad_norm": 0.35581812262535095, "learning_rate": 6.713017573434022e-06, "loss": 0.376, "step": 2670 }, { "epoch": 2.24957888826502, "grad_norm": 0.4262293875217438, "learning_rate": 6.710254251852581e-06, "loss": 0.3978, "step": 2671 }, { "epoch": 2.25042111173498, "grad_norm": 0.46019333600997925, "learning_rate": 6.7074903385524925e-06, "loss": 0.4082, "step": 2672 }, { "epoch": 2.251263335204941, "grad_norm": 0.31891703605651855, "learning_rate": 6.704725834490024e-06, "loss": 0.3272, "step": 2673 }, { "epoch": 2.2521055586749017, "grad_norm": 0.49676400423049927, "learning_rate": 6.701960740621645e-06, "loss": 0.419, "step": 2674 }, { "epoch": 2.2529477821448625, "grad_norm": 0.4122948944568634, "learning_rate": 6.699195057904031e-06, "loss": 0.3971, "step": 2675 }, { "epoch": 2.2537900056148232, "grad_norm": 0.4175935387611389, "learning_rate": 6.696428787294061e-06, "loss": 0.3987, "step": 2676 }, { "epoch": 2.254632229084784, "grad_norm": 0.4007091522216797, "learning_rate": 6.693661929748819e-06, "loss": 0.3766, "step": 2677 }, { "epoch": 2.2554744525547443, "grad_norm": 0.42834874987602234, "learning_rate": 6.690894486225587e-06, "loss": 0.387, "step": 2678 }, { "epoch": 2.256316676024705, "grad_norm": 0.4690724015235901, "learning_rate": 6.688126457681856e-06, "loss": 0.4325, "step": 2679 }, { "epoch": 2.257158899494666, "grad_norm": 0.36780402064323425, "learning_rate": 6.6853578450753155e-06, "loss": 0.3744, "step": 2680 }, { "epoch": 2.2580011229646266, "grad_norm": 0.4243084788322449, "learning_rate": 6.682588649363857e-06, "loss": 0.4178, "step": 2681 }, { "epoch": 2.2588433464345874, "grad_norm": 0.3653521239757538, "learning_rate": 6.679818871505575e-06, "loss": 0.3938, "step": 2682 }, { "epoch": 2.259685569904548, "grad_norm": 0.44290193915367126, "learning_rate": 6.677048512458766e-06, "loss": 0.4092, "step": 2683 }, { "epoch": 2.260527793374509, "grad_norm": 0.4315260946750641, "learning_rate": 6.674277573181925e-06, "loss": 0.4087, "step": 2684 }, { "epoch": 2.2613700168444693, "grad_norm": 0.36115244030952454, "learning_rate": 6.6715060546337515e-06, "loss": 0.3621, "step": 2685 }, { "epoch": 2.26221224031443, "grad_norm": 0.4225200414657593, "learning_rate": 6.6687339577731425e-06, "loss": 0.3552, "step": 2686 }, { "epoch": 2.263054463784391, "grad_norm": 0.41556936502456665, "learning_rate": 6.665961283559197e-06, "loss": 0.4157, "step": 2687 }, { "epoch": 2.2638966872543516, "grad_norm": 0.3945252001285553, "learning_rate": 6.663188032951211e-06, "loss": 0.3879, "step": 2688 }, { "epoch": 2.2647389107243123, "grad_norm": 0.44223693013191223, "learning_rate": 6.660414206908683e-06, "loss": 0.4168, "step": 2689 }, { "epoch": 2.2655811341942727, "grad_norm": 0.3876868486404419, "learning_rate": 6.657639806391307e-06, "loss": 0.368, "step": 2690 }, { "epoch": 2.2664233576642334, "grad_norm": 0.3890511989593506, "learning_rate": 6.654864832358983e-06, "loss": 0.398, "step": 2691 }, { "epoch": 2.267265581134194, "grad_norm": 0.45858412981033325, "learning_rate": 6.652089285771799e-06, "loss": 0.4307, "step": 2692 }, { "epoch": 2.268107804604155, "grad_norm": 0.41795751452445984, "learning_rate": 6.649313167590049e-06, "loss": 0.3753, "step": 2693 }, { "epoch": 2.2689500280741157, "grad_norm": 0.397975891828537, "learning_rate": 6.646536478774221e-06, "loss": 0.4017, "step": 2694 }, { "epoch": 2.2697922515440765, "grad_norm": 0.43247178196907043, "learning_rate": 6.643759220285004e-06, "loss": 0.3984, "step": 2695 }, { "epoch": 2.2706344750140373, "grad_norm": 0.3980551064014435, "learning_rate": 6.640981393083281e-06, "loss": 0.3485, "step": 2696 }, { "epoch": 2.2714766984839976, "grad_norm": 0.42270180583000183, "learning_rate": 6.6382029981301325e-06, "loss": 0.4048, "step": 2697 }, { "epoch": 2.2723189219539583, "grad_norm": 0.3977181017398834, "learning_rate": 6.6354240363868336e-06, "loss": 0.376, "step": 2698 }, { "epoch": 2.273161145423919, "grad_norm": 0.3996807336807251, "learning_rate": 6.632644508814859e-06, "loss": 0.3837, "step": 2699 }, { "epoch": 2.27400336889388, "grad_norm": 0.5296745300292969, "learning_rate": 6.629864416375879e-06, "loss": 0.4185, "step": 2700 }, { "epoch": 2.2748455923638407, "grad_norm": 0.4463975429534912, "learning_rate": 6.627083760031755e-06, "loss": 0.3869, "step": 2701 }, { "epoch": 2.2756878158338014, "grad_norm": 0.3772948682308197, "learning_rate": 6.624302540744549e-06, "loss": 0.3689, "step": 2702 }, { "epoch": 2.2765300393037617, "grad_norm": 0.4227517247200012, "learning_rate": 6.621520759476514e-06, "loss": 0.3742, "step": 2703 }, { "epoch": 2.2773722627737225, "grad_norm": 0.41157472133636475, "learning_rate": 6.6187384171900985e-06, "loss": 0.3981, "step": 2704 }, { "epoch": 2.2782144862436833, "grad_norm": 0.33998221158981323, "learning_rate": 6.6159555148479436e-06, "loss": 0.3515, "step": 2705 }, { "epoch": 2.279056709713644, "grad_norm": 0.39768990874290466, "learning_rate": 6.613172053412891e-06, "loss": 0.4358, "step": 2706 }, { "epoch": 2.279898933183605, "grad_norm": 0.3518751859664917, "learning_rate": 6.610388033847963e-06, "loss": 0.3537, "step": 2707 }, { "epoch": 2.2807411566535656, "grad_norm": 0.41168543696403503, "learning_rate": 6.60760345711639e-06, "loss": 0.4129, "step": 2708 }, { "epoch": 2.281583380123526, "grad_norm": 0.4166014492511749, "learning_rate": 6.604818324181583e-06, "loss": 0.4362, "step": 2709 }, { "epoch": 2.2824256035934867, "grad_norm": 0.3861297369003296, "learning_rate": 6.602032636007155e-06, "loss": 0.4025, "step": 2710 }, { "epoch": 2.2832678270634474, "grad_norm": 0.39288806915283203, "learning_rate": 6.5992463935569e-06, "loss": 0.4142, "step": 2711 }, { "epoch": 2.284110050533408, "grad_norm": 0.41795215010643005, "learning_rate": 6.596459597794815e-06, "loss": 0.3823, "step": 2712 }, { "epoch": 2.284952274003369, "grad_norm": 0.412968248128891, "learning_rate": 6.593672249685082e-06, "loss": 0.3834, "step": 2713 }, { "epoch": 2.2857944974733297, "grad_norm": 0.3739345371723175, "learning_rate": 6.590884350192075e-06, "loss": 0.4035, "step": 2714 }, { "epoch": 2.2866367209432905, "grad_norm": 0.3875386416912079, "learning_rate": 6.58809590028036e-06, "loss": 0.395, "step": 2715 }, { "epoch": 2.287478944413251, "grad_norm": 0.3555160462856293, "learning_rate": 6.585306900914694e-06, "loss": 0.3861, "step": 2716 }, { "epoch": 2.2883211678832116, "grad_norm": 0.360668808221817, "learning_rate": 6.58251735306002e-06, "loss": 0.3823, "step": 2717 }, { "epoch": 2.2891633913531724, "grad_norm": 0.3788992166519165, "learning_rate": 6.579727257681476e-06, "loss": 0.3956, "step": 2718 }, { "epoch": 2.290005614823133, "grad_norm": 0.3955320119857788, "learning_rate": 6.576936615744387e-06, "loss": 0.4044, "step": 2719 }, { "epoch": 2.290847838293094, "grad_norm": 0.3864670395851135, "learning_rate": 6.574145428214267e-06, "loss": 0.3727, "step": 2720 }, { "epoch": 2.2916900617630542, "grad_norm": 0.3666762709617615, "learning_rate": 6.571353696056819e-06, "loss": 0.4053, "step": 2721 }, { "epoch": 2.292532285233015, "grad_norm": 0.4335457980632782, "learning_rate": 6.568561420237935e-06, "loss": 0.4321, "step": 2722 }, { "epoch": 2.2933745087029758, "grad_norm": 0.4066126048564911, "learning_rate": 6.565768601723695e-06, "loss": 0.3916, "step": 2723 }, { "epoch": 2.2942167321729365, "grad_norm": 0.3470166325569153, "learning_rate": 6.5629752414803625e-06, "loss": 0.3721, "step": 2724 }, { "epoch": 2.2950589556428973, "grad_norm": 0.43016311526298523, "learning_rate": 6.560181340474396e-06, "loss": 0.4468, "step": 2725 }, { "epoch": 2.295901179112858, "grad_norm": 0.4320324957370758, "learning_rate": 6.5573868996724376e-06, "loss": 0.371, "step": 2726 }, { "epoch": 2.296743402582819, "grad_norm": 0.4056777358055115, "learning_rate": 6.554591920041314e-06, "loss": 0.4261, "step": 2727 }, { "epoch": 2.297585626052779, "grad_norm": 0.38960063457489014, "learning_rate": 6.551796402548042e-06, "loss": 0.3519, "step": 2728 }, { "epoch": 2.29842784952274, "grad_norm": 0.3476868271827698, "learning_rate": 6.549000348159821e-06, "loss": 0.3807, "step": 2729 }, { "epoch": 2.2992700729927007, "grad_norm": 0.42403295636177063, "learning_rate": 6.546203757844039e-06, "loss": 0.3964, "step": 2730 }, { "epoch": 2.3001122964626615, "grad_norm": 0.376118004322052, "learning_rate": 6.543406632568266e-06, "loss": 0.4053, "step": 2731 }, { "epoch": 2.3009545199326222, "grad_norm": 0.3982013463973999, "learning_rate": 6.540608973300263e-06, "loss": 0.4095, "step": 2732 }, { "epoch": 2.301796743402583, "grad_norm": 0.4044939875602722, "learning_rate": 6.53781078100797e-06, "loss": 0.3709, "step": 2733 }, { "epoch": 2.3026389668725433, "grad_norm": 0.37282589077949524, "learning_rate": 6.535012056659514e-06, "loss": 0.3918, "step": 2734 }, { "epoch": 2.303481190342504, "grad_norm": 0.4113833010196686, "learning_rate": 6.532212801223206e-06, "loss": 0.4724, "step": 2735 }, { "epoch": 2.304323413812465, "grad_norm": 0.41512298583984375, "learning_rate": 6.52941301566754e-06, "loss": 0.3476, "step": 2736 }, { "epoch": 2.3051656372824256, "grad_norm": 0.5135369896888733, "learning_rate": 6.526612700961192e-06, "loss": 0.3935, "step": 2737 }, { "epoch": 2.3060078607523864, "grad_norm": 0.3458305895328522, "learning_rate": 6.523811858073027e-06, "loss": 0.3658, "step": 2738 }, { "epoch": 2.306850084222347, "grad_norm": 0.452080637216568, "learning_rate": 6.521010487972083e-06, "loss": 0.3993, "step": 2739 }, { "epoch": 2.3076923076923075, "grad_norm": 0.4121856093406677, "learning_rate": 6.518208591627594e-06, "loss": 0.3805, "step": 2740 }, { "epoch": 2.3085345311622683, "grad_norm": 0.3998396396636963, "learning_rate": 6.5154061700089595e-06, "loss": 0.4431, "step": 2741 }, { "epoch": 2.309376754632229, "grad_norm": 0.37048959732055664, "learning_rate": 6.512603224085775e-06, "loss": 0.3801, "step": 2742 }, { "epoch": 2.31021897810219, "grad_norm": 0.36995649337768555, "learning_rate": 6.50979975482781e-06, "loss": 0.3707, "step": 2743 }, { "epoch": 2.3110612015721506, "grad_norm": 0.3605344295501709, "learning_rate": 6.506995763205016e-06, "loss": 0.3673, "step": 2744 }, { "epoch": 2.3119034250421113, "grad_norm": 0.3623274862766266, "learning_rate": 6.504191250187527e-06, "loss": 0.3779, "step": 2745 }, { "epoch": 2.312745648512072, "grad_norm": 0.40379709005355835, "learning_rate": 6.501386216745657e-06, "loss": 0.4411, "step": 2746 }, { "epoch": 2.3135878719820324, "grad_norm": 0.38447657227516174, "learning_rate": 6.498580663849898e-06, "loss": 0.4124, "step": 2747 }, { "epoch": 2.314430095451993, "grad_norm": 0.3776076138019562, "learning_rate": 6.495774592470924e-06, "loss": 0.3372, "step": 2748 }, { "epoch": 2.315272318921954, "grad_norm": 0.347569078207016, "learning_rate": 6.492968003579587e-06, "loss": 0.4012, "step": 2749 }, { "epoch": 2.3161145423919147, "grad_norm": 0.39316457509994507, "learning_rate": 6.490160898146919e-06, "loss": 0.3768, "step": 2750 }, { "epoch": 2.3169567658618755, "grad_norm": 0.4048765301704407, "learning_rate": 6.487353277144131e-06, "loss": 0.3801, "step": 2751 }, { "epoch": 2.317798989331836, "grad_norm": 0.36316490173339844, "learning_rate": 6.484545141542611e-06, "loss": 0.3499, "step": 2752 }, { "epoch": 2.3186412128017966, "grad_norm": 0.4471409022808075, "learning_rate": 6.481736492313926e-06, "loss": 0.3918, "step": 2753 }, { "epoch": 2.3194834362717573, "grad_norm": 0.38633257150650024, "learning_rate": 6.478927330429821e-06, "loss": 0.3908, "step": 2754 }, { "epoch": 2.320325659741718, "grad_norm": 0.3803563714027405, "learning_rate": 6.476117656862215e-06, "loss": 0.3761, "step": 2755 }, { "epoch": 2.321167883211679, "grad_norm": 0.4810994565486908, "learning_rate": 6.473307472583211e-06, "loss": 0.4023, "step": 2756 }, { "epoch": 2.3220101066816397, "grad_norm": 0.3754999339580536, "learning_rate": 6.470496778565083e-06, "loss": 0.3686, "step": 2757 }, { "epoch": 2.3228523301516004, "grad_norm": 0.4319044053554535, "learning_rate": 6.467685575780281e-06, "loss": 0.417, "step": 2758 }, { "epoch": 2.3236945536215607, "grad_norm": 0.4605753421783447, "learning_rate": 6.464873865201436e-06, "loss": 0.4156, "step": 2759 }, { "epoch": 2.3245367770915215, "grad_norm": 0.37612685561180115, "learning_rate": 6.46206164780135e-06, "loss": 0.3883, "step": 2760 }, { "epoch": 2.3253790005614823, "grad_norm": 0.3810202181339264, "learning_rate": 6.459248924553002e-06, "loss": 0.412, "step": 2761 }, { "epoch": 2.326221224031443, "grad_norm": 0.397990345954895, "learning_rate": 6.456435696429546e-06, "loss": 0.3572, "step": 2762 }, { "epoch": 2.327063447501404, "grad_norm": 0.4566120505332947, "learning_rate": 6.453621964404312e-06, "loss": 0.4099, "step": 2763 }, { "epoch": 2.3279056709713646, "grad_norm": 0.3623947203159332, "learning_rate": 6.450807729450801e-06, "loss": 0.3607, "step": 2764 }, { "epoch": 2.328747894441325, "grad_norm": 0.42892295122146606, "learning_rate": 6.447992992542692e-06, "loss": 0.3752, "step": 2765 }, { "epoch": 2.3295901179112857, "grad_norm": 0.4721515476703644, "learning_rate": 6.445177754653833e-06, "loss": 0.4122, "step": 2766 }, { "epoch": 2.3304323413812464, "grad_norm": 0.3569062650203705, "learning_rate": 6.442362016758253e-06, "loss": 0.3703, "step": 2767 }, { "epoch": 2.331274564851207, "grad_norm": 0.3307408392429352, "learning_rate": 6.439545779830142e-06, "loss": 0.3335, "step": 2768 }, { "epoch": 2.332116788321168, "grad_norm": 0.47392240166664124, "learning_rate": 6.4367290448438765e-06, "loss": 0.4473, "step": 2769 }, { "epoch": 2.3329590117911287, "grad_norm": 0.37798255681991577, "learning_rate": 6.433911812773993e-06, "loss": 0.3689, "step": 2770 }, { "epoch": 2.333801235261089, "grad_norm": 0.3925400376319885, "learning_rate": 6.4310940845952095e-06, "loss": 0.389, "step": 2771 }, { "epoch": 2.33464345873105, "grad_norm": 0.38920730352401733, "learning_rate": 6.4282758612824105e-06, "loss": 0.373, "step": 2772 }, { "epoch": 2.3354856822010106, "grad_norm": 0.3760654926300049, "learning_rate": 6.425457143810652e-06, "loss": 0.3785, "step": 2773 }, { "epoch": 2.3363279056709714, "grad_norm": 0.3877929151058197, "learning_rate": 6.4226379331551625e-06, "loss": 0.4099, "step": 2774 }, { "epoch": 2.337170129140932, "grad_norm": 0.3987997770309448, "learning_rate": 6.419818230291341e-06, "loss": 0.4337, "step": 2775 }, { "epoch": 2.338012352610893, "grad_norm": 0.3891735076904297, "learning_rate": 6.4169980361947555e-06, "loss": 0.3921, "step": 2776 }, { "epoch": 2.3388545760808537, "grad_norm": 0.36770740151405334, "learning_rate": 6.414177351841145e-06, "loss": 0.3676, "step": 2777 }, { "epoch": 2.339696799550814, "grad_norm": 0.3683178424835205, "learning_rate": 6.41135617820642e-06, "loss": 0.3609, "step": 2778 }, { "epoch": 2.3405390230207748, "grad_norm": 0.40322360396385193, "learning_rate": 6.4085345162666544e-06, "loss": 0.4275, "step": 2779 }, { "epoch": 2.3413812464907355, "grad_norm": 0.3498736321926117, "learning_rate": 6.405712366998097e-06, "loss": 0.3807, "step": 2780 }, { "epoch": 2.3422234699606963, "grad_norm": 0.38304603099823, "learning_rate": 6.402889731377163e-06, "loss": 0.384, "step": 2781 }, { "epoch": 2.343065693430657, "grad_norm": 0.42978259921073914, "learning_rate": 6.400066610380437e-06, "loss": 0.4641, "step": 2782 }, { "epoch": 2.3439079169006174, "grad_norm": 0.3679755628108978, "learning_rate": 6.397243004984668e-06, "loss": 0.352, "step": 2783 }, { "epoch": 2.344750140370578, "grad_norm": 0.4492332339286804, "learning_rate": 6.3944189161667754e-06, "loss": 0.4109, "step": 2784 }, { "epoch": 2.345592363840539, "grad_norm": 0.37373659014701843, "learning_rate": 6.391594344903848e-06, "loss": 0.3226, "step": 2785 }, { "epoch": 2.3464345873104997, "grad_norm": 0.49697330594062805, "learning_rate": 6.388769292173137e-06, "loss": 0.4648, "step": 2786 }, { "epoch": 2.3472768107804605, "grad_norm": 0.3632001280784607, "learning_rate": 6.385943758952062e-06, "loss": 0.3443, "step": 2787 }, { "epoch": 2.3481190342504212, "grad_norm": 0.39843788743019104, "learning_rate": 6.383117746218211e-06, "loss": 0.3927, "step": 2788 }, { "epoch": 2.348961257720382, "grad_norm": 0.40071138739585876, "learning_rate": 6.380291254949334e-06, "loss": 0.411, "step": 2789 }, { "epoch": 2.3498034811903423, "grad_norm": 0.3593490719795227, "learning_rate": 6.377464286123349e-06, "loss": 0.3853, "step": 2790 }, { "epoch": 2.350645704660303, "grad_norm": 0.39951279759407043, "learning_rate": 6.374636840718338e-06, "loss": 0.4096, "step": 2791 }, { "epoch": 2.351487928130264, "grad_norm": 0.3751651346683502, "learning_rate": 6.37180891971255e-06, "loss": 0.3646, "step": 2792 }, { "epoch": 2.3523301516002246, "grad_norm": 0.42634081840515137, "learning_rate": 6.368980524084397e-06, "loss": 0.3903, "step": 2793 }, { "epoch": 2.3531723750701854, "grad_norm": 0.40884703397750854, "learning_rate": 6.366151654812456e-06, "loss": 0.4203, "step": 2794 }, { "epoch": 2.354014598540146, "grad_norm": 0.3910317122936249, "learning_rate": 6.3633223128754655e-06, "loss": 0.3967, "step": 2795 }, { "epoch": 2.3548568220101065, "grad_norm": 0.46886903047561646, "learning_rate": 6.360492499252331e-06, "loss": 0.4005, "step": 2796 }, { "epoch": 2.3556990454800673, "grad_norm": 0.35853269696235657, "learning_rate": 6.357662214922118e-06, "loss": 0.3303, "step": 2797 }, { "epoch": 2.356541268950028, "grad_norm": 0.4691852629184723, "learning_rate": 6.354831460864056e-06, "loss": 0.4221, "step": 2798 }, { "epoch": 2.357383492419989, "grad_norm": 0.40125924348831177, "learning_rate": 6.3520002380575395e-06, "loss": 0.3974, "step": 2799 }, { "epoch": 2.3582257158899496, "grad_norm": 0.3979969024658203, "learning_rate": 6.3491685474821215e-06, "loss": 0.3694, "step": 2800 }, { "epoch": 2.3590679393599103, "grad_norm": 0.4006594121456146, "learning_rate": 6.3463363901175205e-06, "loss": 0.4205, "step": 2801 }, { "epoch": 2.3599101628298707, "grad_norm": 0.352524995803833, "learning_rate": 6.343503766943611e-06, "loss": 0.3843, "step": 2802 }, { "epoch": 2.3607523862998314, "grad_norm": 0.3850668966770172, "learning_rate": 6.3406706789404334e-06, "loss": 0.3611, "step": 2803 }, { "epoch": 2.361594609769792, "grad_norm": 0.40259698033332825, "learning_rate": 6.337837127088189e-06, "loss": 0.4214, "step": 2804 }, { "epoch": 2.362436833239753, "grad_norm": 0.3652294874191284, "learning_rate": 6.335003112367236e-06, "loss": 0.4006, "step": 2805 }, { "epoch": 2.3632790567097137, "grad_norm": 0.3759422302246094, "learning_rate": 6.332168635758097e-06, "loss": 0.3512, "step": 2806 }, { "epoch": 2.3641212801796745, "grad_norm": 0.3912147581577301, "learning_rate": 6.329333698241451e-06, "loss": 0.3859, "step": 2807 }, { "epoch": 2.3649635036496353, "grad_norm": 0.37647417187690735, "learning_rate": 6.3264983007981385e-06, "loss": 0.3837, "step": 2808 }, { "epoch": 2.3658057271195956, "grad_norm": 0.40577325224876404, "learning_rate": 6.323662444409157e-06, "loss": 0.4023, "step": 2809 }, { "epoch": 2.3666479505895563, "grad_norm": 0.4078388214111328, "learning_rate": 6.320826130055666e-06, "loss": 0.4093, "step": 2810 }, { "epoch": 2.367490174059517, "grad_norm": 0.38976743817329407, "learning_rate": 6.317989358718981e-06, "loss": 0.4097, "step": 2811 }, { "epoch": 2.368332397529478, "grad_norm": 0.3532593846321106, "learning_rate": 6.3151521313805755e-06, "loss": 0.359, "step": 2812 }, { "epoch": 2.3691746209994387, "grad_norm": 0.3777972161769867, "learning_rate": 6.31231444902208e-06, "loss": 0.3842, "step": 2813 }, { "epoch": 2.370016844469399, "grad_norm": 0.35087236762046814, "learning_rate": 6.309476312625289e-06, "loss": 0.3482, "step": 2814 }, { "epoch": 2.3708590679393597, "grad_norm": 0.39509135484695435, "learning_rate": 6.306637723172145e-06, "loss": 0.3898, "step": 2815 }, { "epoch": 2.3717012914093205, "grad_norm": 0.41550213098526, "learning_rate": 6.3037986816447525e-06, "loss": 0.4279, "step": 2816 }, { "epoch": 2.3725435148792813, "grad_norm": 0.4350346624851227, "learning_rate": 6.30095918902537e-06, "loss": 0.3998, "step": 2817 }, { "epoch": 2.373385738349242, "grad_norm": 0.3982677459716797, "learning_rate": 6.298119246296415e-06, "loss": 0.3953, "step": 2818 }, { "epoch": 2.374227961819203, "grad_norm": 0.42834633588790894, "learning_rate": 6.295278854440458e-06, "loss": 0.3815, "step": 2819 }, { "epoch": 2.3750701852891636, "grad_norm": 0.4114750921726227, "learning_rate": 6.292438014440228e-06, "loss": 0.3797, "step": 2820 }, { "epoch": 2.375912408759124, "grad_norm": 0.3684304356575012, "learning_rate": 6.289596727278602e-06, "loss": 0.3752, "step": 2821 }, { "epoch": 2.3767546322290847, "grad_norm": 0.41250863671302795, "learning_rate": 6.286754993938622e-06, "loss": 0.3877, "step": 2822 }, { "epoch": 2.3775968556990454, "grad_norm": 0.42247962951660156, "learning_rate": 6.2839128154034745e-06, "loss": 0.3938, "step": 2823 }, { "epoch": 2.378439079169006, "grad_norm": 0.4249562621116638, "learning_rate": 6.28107019265651e-06, "loss": 0.3912, "step": 2824 }, { "epoch": 2.379281302638967, "grad_norm": 0.510134220123291, "learning_rate": 6.278227126681221e-06, "loss": 0.4014, "step": 2825 }, { "epoch": 2.3801235261089277, "grad_norm": 0.3578128218650818, "learning_rate": 6.275383618461263e-06, "loss": 0.4099, "step": 2826 }, { "epoch": 2.3809657495788885, "grad_norm": 0.37700513005256653, "learning_rate": 6.2725396689804415e-06, "loss": 0.3942, "step": 2827 }, { "epoch": 2.381807973048849, "grad_norm": 0.42603740096092224, "learning_rate": 6.2696952792227136e-06, "loss": 0.3984, "step": 2828 }, { "epoch": 2.3826501965188096, "grad_norm": 0.40220481157302856, "learning_rate": 6.266850450172188e-06, "loss": 0.4001, "step": 2829 }, { "epoch": 2.3834924199887704, "grad_norm": 0.378782719373703, "learning_rate": 6.264005182813129e-06, "loss": 0.4261, "step": 2830 }, { "epoch": 2.384334643458731, "grad_norm": 0.3831484615802765, "learning_rate": 6.261159478129949e-06, "loss": 0.3873, "step": 2831 }, { "epoch": 2.385176866928692, "grad_norm": 0.35284513235092163, "learning_rate": 6.2583133371072135e-06, "loss": 0.3758, "step": 2832 }, { "epoch": 2.3860190903986522, "grad_norm": 0.37761390209198, "learning_rate": 6.255466760729639e-06, "loss": 0.3509, "step": 2833 }, { "epoch": 2.386861313868613, "grad_norm": 0.38148465752601624, "learning_rate": 6.252619749982089e-06, "loss": 0.3695, "step": 2834 }, { "epoch": 2.3877035373385738, "grad_norm": 0.3585124909877777, "learning_rate": 6.2497723058495856e-06, "loss": 0.3773, "step": 2835 }, { "epoch": 2.3885457608085345, "grad_norm": 0.36687353253364563, "learning_rate": 6.246924429317292e-06, "loss": 0.4263, "step": 2836 }, { "epoch": 2.3893879842784953, "grad_norm": 0.4248678684234619, "learning_rate": 6.244076121370524e-06, "loss": 0.4339, "step": 2837 }, { "epoch": 2.390230207748456, "grad_norm": 0.35852041840553284, "learning_rate": 6.24122738299475e-06, "loss": 0.3473, "step": 2838 }, { "epoch": 2.391072431218417, "grad_norm": 0.3717164695262909, "learning_rate": 6.238378215175584e-06, "loss": 0.3923, "step": 2839 }, { "epoch": 2.391914654688377, "grad_norm": 0.42008987069129944, "learning_rate": 6.235528618898788e-06, "loss": 0.4268, "step": 2840 }, { "epoch": 2.392756878158338, "grad_norm": 0.33741748332977295, "learning_rate": 6.232678595150275e-06, "loss": 0.3763, "step": 2841 }, { "epoch": 2.3935991016282987, "grad_norm": 0.3912580609321594, "learning_rate": 6.229828144916101e-06, "loss": 0.3935, "step": 2842 }, { "epoch": 2.3944413250982595, "grad_norm": 0.3772770166397095, "learning_rate": 6.226977269182478e-06, "loss": 0.4049, "step": 2843 }, { "epoch": 2.3952835485682202, "grad_norm": 0.3603202998638153, "learning_rate": 6.224125968935756e-06, "loss": 0.378, "step": 2844 }, { "epoch": 2.3961257720381806, "grad_norm": 0.37874338030815125, "learning_rate": 6.221274245162439e-06, "loss": 0.4067, "step": 2845 }, { "epoch": 2.3969679955081413, "grad_norm": 0.38481539487838745, "learning_rate": 6.218422098849172e-06, "loss": 0.3819, "step": 2846 }, { "epoch": 2.397810218978102, "grad_norm": 0.38552016019821167, "learning_rate": 6.21556953098275e-06, "loss": 0.4143, "step": 2847 }, { "epoch": 2.398652442448063, "grad_norm": 0.342472106218338, "learning_rate": 6.212716542550112e-06, "loss": 0.3572, "step": 2848 }, { "epoch": 2.3994946659180236, "grad_norm": 0.41837143898010254, "learning_rate": 6.209863134538344e-06, "loss": 0.3647, "step": 2849 }, { "epoch": 2.4003368893879844, "grad_norm": 0.4151238799095154, "learning_rate": 6.207009307934675e-06, "loss": 0.4075, "step": 2850 }, { "epoch": 2.401179112857945, "grad_norm": 0.40357598662376404, "learning_rate": 6.2041550637264815e-06, "loss": 0.4167, "step": 2851 }, { "epoch": 2.4020213363279055, "grad_norm": 0.38769906759262085, "learning_rate": 6.201300402901283e-06, "loss": 0.3467, "step": 2852 }, { "epoch": 2.4028635597978663, "grad_norm": 0.4209533929824829, "learning_rate": 6.1984453264467405e-06, "loss": 0.3846, "step": 2853 }, { "epoch": 2.403705783267827, "grad_norm": 0.432134747505188, "learning_rate": 6.195589835350665e-06, "loss": 0.4082, "step": 2854 }, { "epoch": 2.404548006737788, "grad_norm": 0.4270572364330292, "learning_rate": 6.192733930601005e-06, "loss": 0.4057, "step": 2855 }, { "epoch": 2.4053902302077486, "grad_norm": 0.34482041001319885, "learning_rate": 6.189877613185857e-06, "loss": 0.3774, "step": 2856 }, { "epoch": 2.4062324536777093, "grad_norm": 0.4011114835739136, "learning_rate": 6.187020884093455e-06, "loss": 0.396, "step": 2857 }, { "epoch": 2.40707467714767, "grad_norm": 0.4324972629547119, "learning_rate": 6.1841637443121806e-06, "loss": 0.3728, "step": 2858 }, { "epoch": 2.4079169006176304, "grad_norm": 0.4193319082260132, "learning_rate": 6.181306194830553e-06, "loss": 0.4091, "step": 2859 }, { "epoch": 2.408759124087591, "grad_norm": 0.3612236976623535, "learning_rate": 6.178448236637238e-06, "loss": 0.3613, "step": 2860 }, { "epoch": 2.409601347557552, "grad_norm": 0.4153840243816376, "learning_rate": 6.175589870721037e-06, "loss": 0.3823, "step": 2861 }, { "epoch": 2.4104435710275127, "grad_norm": 0.40573742985725403, "learning_rate": 6.1727310980708985e-06, "loss": 0.4017, "step": 2862 }, { "epoch": 2.4112857944974735, "grad_norm": 0.3596777319908142, "learning_rate": 6.169871919675908e-06, "loss": 0.3705, "step": 2863 }, { "epoch": 2.412128017967434, "grad_norm": 0.39080876111984253, "learning_rate": 6.167012336525291e-06, "loss": 0.4318, "step": 2864 }, { "epoch": 2.4129702414373946, "grad_norm": 0.38161271810531616, "learning_rate": 6.164152349608415e-06, "loss": 0.3488, "step": 2865 }, { "epoch": 2.4138124649073553, "grad_norm": 0.4144876003265381, "learning_rate": 6.161291959914787e-06, "loss": 0.3901, "step": 2866 }, { "epoch": 2.414654688377316, "grad_norm": 0.35589882731437683, "learning_rate": 6.1584311684340525e-06, "loss": 0.3916, "step": 2867 }, { "epoch": 2.415496911847277, "grad_norm": 0.36626961827278137, "learning_rate": 6.155569976155995e-06, "loss": 0.3807, "step": 2868 }, { "epoch": 2.4163391353172377, "grad_norm": 0.4160865247249603, "learning_rate": 6.152708384070541e-06, "loss": 0.3811, "step": 2869 }, { "epoch": 2.4171813587871984, "grad_norm": 0.3632601797580719, "learning_rate": 6.149846393167749e-06, "loss": 0.4045, "step": 2870 }, { "epoch": 2.4180235822571587, "grad_norm": 0.37783849239349365, "learning_rate": 6.14698400443782e-06, "loss": 0.4225, "step": 2871 }, { "epoch": 2.4188658057271195, "grad_norm": 0.36298975348472595, "learning_rate": 6.144121218871092e-06, "loss": 0.3978, "step": 2872 }, { "epoch": 2.4197080291970803, "grad_norm": 0.3553410470485687, "learning_rate": 6.14125803745804e-06, "loss": 0.3664, "step": 2873 }, { "epoch": 2.420550252667041, "grad_norm": 0.4450918138027191, "learning_rate": 6.138394461189273e-06, "loss": 0.3891, "step": 2874 }, { "epoch": 2.421392476137002, "grad_norm": 0.4353533387184143, "learning_rate": 6.135530491055544e-06, "loss": 0.3895, "step": 2875 }, { "epoch": 2.422234699606962, "grad_norm": 0.4165987968444824, "learning_rate": 6.132666128047732e-06, "loss": 0.3564, "step": 2876 }, { "epoch": 2.423076923076923, "grad_norm": 0.4718574583530426, "learning_rate": 6.129801373156863e-06, "loss": 0.3979, "step": 2877 }, { "epoch": 2.4239191465468837, "grad_norm": 0.430880606174469, "learning_rate": 6.126936227374087e-06, "loss": 0.3724, "step": 2878 }, { "epoch": 2.4247613700168444, "grad_norm": 0.41224467754364014, "learning_rate": 6.124070691690701e-06, "loss": 0.3915, "step": 2879 }, { "epoch": 2.425603593486805, "grad_norm": 0.4117022454738617, "learning_rate": 6.121204767098128e-06, "loss": 0.4052, "step": 2880 }, { "epoch": 2.426445816956766, "grad_norm": 0.36771082878112793, "learning_rate": 6.118338454587931e-06, "loss": 0.3869, "step": 2881 }, { "epoch": 2.4272880404267267, "grad_norm": 0.42212536931037903, "learning_rate": 6.115471755151803e-06, "loss": 0.4048, "step": 2882 }, { "epoch": 2.428130263896687, "grad_norm": 0.3755161464214325, "learning_rate": 6.112604669781572e-06, "loss": 0.3752, "step": 2883 }, { "epoch": 2.428972487366648, "grad_norm": 0.3606360852718353, "learning_rate": 6.109737199469205e-06, "loss": 0.3633, "step": 2884 }, { "epoch": 2.4298147108366086, "grad_norm": 0.433591365814209, "learning_rate": 6.106869345206792e-06, "loss": 0.4405, "step": 2885 }, { "epoch": 2.4306569343065694, "grad_norm": 0.36389392614364624, "learning_rate": 6.104001107986565e-06, "loss": 0.3817, "step": 2886 }, { "epoch": 2.43149915777653, "grad_norm": 0.37414687871932983, "learning_rate": 6.101132488800882e-06, "loss": 0.3576, "step": 2887 }, { "epoch": 2.432341381246491, "grad_norm": 0.36145448684692383, "learning_rate": 6.09826348864224e-06, "loss": 0.3803, "step": 2888 }, { "epoch": 2.4331836047164517, "grad_norm": 0.41576087474823, "learning_rate": 6.095394108503261e-06, "loss": 0.3873, "step": 2889 }, { "epoch": 2.434025828186412, "grad_norm": 0.416775107383728, "learning_rate": 6.092524349376702e-06, "loss": 0.3807, "step": 2890 }, { "epoch": 2.4348680516563728, "grad_norm": 0.35731199383735657, "learning_rate": 6.089654212255449e-06, "loss": 0.4007, "step": 2891 }, { "epoch": 2.4357102751263335, "grad_norm": 0.4653150141239166, "learning_rate": 6.086783698132523e-06, "loss": 0.4062, "step": 2892 }, { "epoch": 2.4365524985962943, "grad_norm": 0.38334277272224426, "learning_rate": 6.083912808001071e-06, "loss": 0.3517, "step": 2893 }, { "epoch": 2.437394722066255, "grad_norm": 0.39006733894348145, "learning_rate": 6.081041542854373e-06, "loss": 0.4069, "step": 2894 }, { "epoch": 2.4382369455362154, "grad_norm": 0.4276982545852661, "learning_rate": 6.078169903685835e-06, "loss": 0.4212, "step": 2895 }, { "epoch": 2.439079169006176, "grad_norm": 0.34826382994651794, "learning_rate": 6.075297891488999e-06, "loss": 0.3642, "step": 2896 }, { "epoch": 2.439921392476137, "grad_norm": 0.42698121070861816, "learning_rate": 6.072425507257528e-06, "loss": 0.4158, "step": 2897 }, { "epoch": 2.4407636159460977, "grad_norm": 0.36941829323768616, "learning_rate": 6.069552751985219e-06, "loss": 0.3802, "step": 2898 }, { "epoch": 2.4416058394160585, "grad_norm": 0.366868257522583, "learning_rate": 6.066679626665997e-06, "loss": 0.3624, "step": 2899 }, { "epoch": 2.4424480628860192, "grad_norm": 0.39438486099243164, "learning_rate": 6.063806132293912e-06, "loss": 0.4043, "step": 2900 }, { "epoch": 2.44329028635598, "grad_norm": 0.3440054953098297, "learning_rate": 6.060932269863147e-06, "loss": 0.3581, "step": 2901 }, { "epoch": 2.4441325098259403, "grad_norm": 0.3650175929069519, "learning_rate": 6.058058040368007e-06, "loss": 0.365, "step": 2902 }, { "epoch": 2.444974733295901, "grad_norm": 0.3923560678958893, "learning_rate": 6.055183444802924e-06, "loss": 0.3612, "step": 2903 }, { "epoch": 2.445816956765862, "grad_norm": 0.45850521326065063, "learning_rate": 6.052308484162464e-06, "loss": 0.4312, "step": 2904 }, { "epoch": 2.4466591802358226, "grad_norm": 0.39832937717437744, "learning_rate": 6.049433159441311e-06, "loss": 0.3541, "step": 2905 }, { "epoch": 2.4475014037057834, "grad_norm": 0.39918017387390137, "learning_rate": 6.046557471634277e-06, "loss": 0.4135, "step": 2906 }, { "epoch": 2.4483436271757437, "grad_norm": 0.37910640239715576, "learning_rate": 6.0436814217363025e-06, "loss": 0.3723, "step": 2907 }, { "epoch": 2.4491858506457045, "grad_norm": 0.4411233067512512, "learning_rate": 6.040805010742452e-06, "loss": 0.375, "step": 2908 }, { "epoch": 2.4500280741156653, "grad_norm": 0.3870784640312195, "learning_rate": 6.037928239647912e-06, "loss": 0.3865, "step": 2909 }, { "epoch": 2.450870297585626, "grad_norm": 0.41347405314445496, "learning_rate": 6.035051109447998e-06, "loss": 0.4479, "step": 2910 }, { "epoch": 2.451712521055587, "grad_norm": 0.3857194185256958, "learning_rate": 6.032173621138146e-06, "loss": 0.3174, "step": 2911 }, { "epoch": 2.4525547445255476, "grad_norm": 0.38060563802719116, "learning_rate": 6.0292957757139205e-06, "loss": 0.4, "step": 2912 }, { "epoch": 2.4533969679955083, "grad_norm": 0.38446739315986633, "learning_rate": 6.026417574171004e-06, "loss": 0.3935, "step": 2913 }, { "epoch": 2.4542391914654687, "grad_norm": 0.46322786808013916, "learning_rate": 6.023539017505206e-06, "loss": 0.4023, "step": 2914 }, { "epoch": 2.4550814149354294, "grad_norm": 0.3813149929046631, "learning_rate": 6.020660106712457e-06, "loss": 0.3884, "step": 2915 }, { "epoch": 2.45592363840539, "grad_norm": 0.3786385953426361, "learning_rate": 6.01778084278881e-06, "loss": 0.3662, "step": 2916 }, { "epoch": 2.456765861875351, "grad_norm": 0.4219895005226135, "learning_rate": 6.014901226730444e-06, "loss": 0.3842, "step": 2917 }, { "epoch": 2.4576080853453117, "grad_norm": 0.3904012441635132, "learning_rate": 6.012021259533655e-06, "loss": 0.4014, "step": 2918 }, { "epoch": 2.4584503088152725, "grad_norm": 0.36380842328071594, "learning_rate": 6.00914094219486e-06, "loss": 0.3765, "step": 2919 }, { "epoch": 2.4592925322852333, "grad_norm": 0.4141122102737427, "learning_rate": 6.006260275710605e-06, "loss": 0.3859, "step": 2920 }, { "epoch": 2.4601347557551936, "grad_norm": 0.3878091275691986, "learning_rate": 6.003379261077545e-06, "loss": 0.3834, "step": 2921 }, { "epoch": 2.4609769792251543, "grad_norm": 0.38289251923561096, "learning_rate": 6.000497899292467e-06, "loss": 0.4337, "step": 2922 }, { "epoch": 2.461819202695115, "grad_norm": 0.38565587997436523, "learning_rate": 5.997616191352268e-06, "loss": 0.3753, "step": 2923 }, { "epoch": 2.462661426165076, "grad_norm": 0.44887575507164, "learning_rate": 5.994734138253974e-06, "loss": 0.4359, "step": 2924 }, { "epoch": 2.4635036496350367, "grad_norm": 0.37654122710227966, "learning_rate": 5.991851740994722e-06, "loss": 0.4113, "step": 2925 }, { "epoch": 2.464345873104997, "grad_norm": 0.41039392352104187, "learning_rate": 5.988969000571775e-06, "loss": 0.3263, "step": 2926 }, { "epoch": 2.4651880965749577, "grad_norm": 0.4163614511489868, "learning_rate": 5.986085917982509e-06, "loss": 0.4187, "step": 2927 }, { "epoch": 2.4660303200449185, "grad_norm": 0.35163211822509766, "learning_rate": 5.983202494224425e-06, "loss": 0.3353, "step": 2928 }, { "epoch": 2.4668725435148793, "grad_norm": 0.4342266321182251, "learning_rate": 5.9803187302951335e-06, "loss": 0.3964, "step": 2929 }, { "epoch": 2.46771476698484, "grad_norm": 0.38451331853866577, "learning_rate": 5.977434627192372e-06, "loss": 0.4107, "step": 2930 }, { "epoch": 2.468556990454801, "grad_norm": 0.3875698149204254, "learning_rate": 5.974550185913988e-06, "loss": 0.3834, "step": 2931 }, { "epoch": 2.4693992139247616, "grad_norm": 0.41777557134628296, "learning_rate": 5.971665407457949e-06, "loss": 0.3908, "step": 2932 }, { "epoch": 2.470241437394722, "grad_norm": 0.42312246561050415, "learning_rate": 5.968780292822338e-06, "loss": 0.4161, "step": 2933 }, { "epoch": 2.4710836608646827, "grad_norm": 0.3689686357975006, "learning_rate": 5.9658948430053574e-06, "loss": 0.3808, "step": 2934 }, { "epoch": 2.4719258843346434, "grad_norm": 0.3967268466949463, "learning_rate": 5.963009059005321e-06, "loss": 0.3822, "step": 2935 }, { "epoch": 2.472768107804604, "grad_norm": 0.4296450912952423, "learning_rate": 5.960122941820664e-06, "loss": 0.395, "step": 2936 }, { "epoch": 2.473610331274565, "grad_norm": 0.44347310066223145, "learning_rate": 5.9572364924499305e-06, "loss": 0.4424, "step": 2937 }, { "epoch": 2.4744525547445253, "grad_norm": 0.3720146417617798, "learning_rate": 5.954349711891783e-06, "loss": 0.3927, "step": 2938 }, { "epoch": 2.475294778214486, "grad_norm": 0.41394662857055664, "learning_rate": 5.951462601144998e-06, "loss": 0.3855, "step": 2939 }, { "epoch": 2.476137001684447, "grad_norm": 0.39930808544158936, "learning_rate": 5.948575161208468e-06, "loss": 0.3652, "step": 2940 }, { "epoch": 2.4769792251544076, "grad_norm": 0.36931201815605164, "learning_rate": 5.945687393081196e-06, "loss": 0.4195, "step": 2941 }, { "epoch": 2.4778214486243684, "grad_norm": 0.41578754782676697, "learning_rate": 5.942799297762299e-06, "loss": 0.4038, "step": 2942 }, { "epoch": 2.478663672094329, "grad_norm": 0.41841816902160645, "learning_rate": 5.939910876251012e-06, "loss": 0.3923, "step": 2943 }, { "epoch": 2.47950589556429, "grad_norm": 0.423191636800766, "learning_rate": 5.937022129546675e-06, "loss": 0.3937, "step": 2944 }, { "epoch": 2.4803481190342502, "grad_norm": 0.4078480005264282, "learning_rate": 5.934133058648751e-06, "loss": 0.3995, "step": 2945 }, { "epoch": 2.481190342504211, "grad_norm": 0.47297680377960205, "learning_rate": 5.931243664556803e-06, "loss": 0.4003, "step": 2946 }, { "epoch": 2.4820325659741718, "grad_norm": 0.3979373574256897, "learning_rate": 5.928353948270515e-06, "loss": 0.3678, "step": 2947 }, { "epoch": 2.4828747894441325, "grad_norm": 0.43114572763442993, "learning_rate": 5.925463910789677e-06, "loss": 0.4049, "step": 2948 }, { "epoch": 2.4837170129140933, "grad_norm": 0.43983784317970276, "learning_rate": 5.922573553114196e-06, "loss": 0.3532, "step": 2949 }, { "epoch": 2.484559236384054, "grad_norm": 0.447295606136322, "learning_rate": 5.919682876244081e-06, "loss": 0.4095, "step": 2950 }, { "epoch": 2.485401459854015, "grad_norm": 0.37554731965065, "learning_rate": 5.916791881179464e-06, "loss": 0.3932, "step": 2951 }, { "epoch": 2.486243683323975, "grad_norm": 0.4453498125076294, "learning_rate": 5.913900568920571e-06, "loss": 0.373, "step": 2952 }, { "epoch": 2.487085906793936, "grad_norm": 0.4603624939918518, "learning_rate": 5.911008940467753e-06, "loss": 0.4104, "step": 2953 }, { "epoch": 2.4879281302638967, "grad_norm": 0.4295567274093628, "learning_rate": 5.9081169968214615e-06, "loss": 0.3486, "step": 2954 }, { "epoch": 2.4887703537338575, "grad_norm": 0.45903676748275757, "learning_rate": 5.90522473898226e-06, "loss": 0.3967, "step": 2955 }, { "epoch": 2.4896125772038182, "grad_norm": 0.46221959590911865, "learning_rate": 5.902332167950818e-06, "loss": 0.3883, "step": 2956 }, { "epoch": 2.4904548006737786, "grad_norm": 0.3932395279407501, "learning_rate": 5.899439284727919e-06, "loss": 0.3752, "step": 2957 }, { "epoch": 2.4912970241437393, "grad_norm": 0.4220406711101532, "learning_rate": 5.896546090314448e-06, "loss": 0.3957, "step": 2958 }, { "epoch": 2.4921392476137, "grad_norm": 0.3905918300151825, "learning_rate": 5.893652585711402e-06, "loss": 0.3882, "step": 2959 }, { "epoch": 2.492981471083661, "grad_norm": 0.44832712411880493, "learning_rate": 5.890758771919885e-06, "loss": 0.3806, "step": 2960 }, { "epoch": 2.4938236945536216, "grad_norm": 0.4459664523601532, "learning_rate": 5.887864649941105e-06, "loss": 0.422, "step": 2961 }, { "epoch": 2.4946659180235824, "grad_norm": 0.3669045567512512, "learning_rate": 5.884970220776379e-06, "loss": 0.3411, "step": 2962 }, { "epoch": 2.495508141493543, "grad_norm": 0.4066373407840729, "learning_rate": 5.8820754854271315e-06, "loss": 0.3956, "step": 2963 }, { "epoch": 2.4963503649635035, "grad_norm": 0.509726881980896, "learning_rate": 5.879180444894889e-06, "loss": 0.3964, "step": 2964 }, { "epoch": 2.4971925884334643, "grad_norm": 0.39185142517089844, "learning_rate": 5.876285100181287e-06, "loss": 0.3649, "step": 2965 }, { "epoch": 2.498034811903425, "grad_norm": 0.40244945883750916, "learning_rate": 5.8733894522880656e-06, "loss": 0.3843, "step": 2966 }, { "epoch": 2.498877035373386, "grad_norm": 0.4632101058959961, "learning_rate": 5.870493502217069e-06, "loss": 0.3566, "step": 2967 }, { "epoch": 2.4997192588433466, "grad_norm": 0.39297157526016235, "learning_rate": 5.867597250970245e-06, "loss": 0.3866, "step": 2968 }, { "epoch": 2.500561482313307, "grad_norm": 0.4257384240627289, "learning_rate": 5.864700699549648e-06, "loss": 0.4117, "step": 2969 }, { "epoch": 2.501403705783268, "grad_norm": 0.4061444103717804, "learning_rate": 5.861803848957434e-06, "loss": 0.3958, "step": 2970 }, { "epoch": 2.5022459292532284, "grad_norm": 0.3913414180278778, "learning_rate": 5.858906700195868e-06, "loss": 0.4088, "step": 2971 }, { "epoch": 2.503088152723189, "grad_norm": 0.4034712314605713, "learning_rate": 5.856009254267306e-06, "loss": 0.4002, "step": 2972 }, { "epoch": 2.50393037619315, "grad_norm": 0.353137344121933, "learning_rate": 5.853111512174223e-06, "loss": 0.3192, "step": 2973 }, { "epoch": 2.5047725996631107, "grad_norm": 0.3940814733505249, "learning_rate": 5.850213474919182e-06, "loss": 0.4349, "step": 2974 }, { "epoch": 2.5056148231330715, "grad_norm": 0.3553839921951294, "learning_rate": 5.847315143504859e-06, "loss": 0.3715, "step": 2975 }, { "epoch": 2.506457046603032, "grad_norm": 0.3574545085430145, "learning_rate": 5.844416518934021e-06, "loss": 0.3537, "step": 2976 }, { "epoch": 2.5072992700729926, "grad_norm": 0.3672139048576355, "learning_rate": 5.841517602209549e-06, "loss": 0.3774, "step": 2977 }, { "epoch": 2.5081414935429533, "grad_norm": 0.3733980357646942, "learning_rate": 5.838618394334412e-06, "loss": 0.3815, "step": 2978 }, { "epoch": 2.508983717012914, "grad_norm": 0.35536476969718933, "learning_rate": 5.835718896311692e-06, "loss": 0.3784, "step": 2979 }, { "epoch": 2.509825940482875, "grad_norm": 0.34747329354286194, "learning_rate": 5.832819109144561e-06, "loss": 0.3452, "step": 2980 }, { "epoch": 2.5106681639528357, "grad_norm": 0.4101678431034088, "learning_rate": 5.8299190338363e-06, "loss": 0.4341, "step": 2981 }, { "epoch": 2.5115103874227964, "grad_norm": 0.38497430086135864, "learning_rate": 5.827018671390281e-06, "loss": 0.4052, "step": 2982 }, { "epoch": 2.5123526108927567, "grad_norm": 0.3735649287700653, "learning_rate": 5.824118022809984e-06, "loss": 0.3485, "step": 2983 }, { "epoch": 2.5131948343627175, "grad_norm": 0.386465847492218, "learning_rate": 5.821217089098978e-06, "loss": 0.4019, "step": 2984 }, { "epoch": 2.5140370578326783, "grad_norm": 0.3724789023399353, "learning_rate": 5.818315871260941e-06, "loss": 0.3995, "step": 2985 }, { "epoch": 2.514879281302639, "grad_norm": 0.37907934188842773, "learning_rate": 5.815414370299644e-06, "loss": 0.3941, "step": 2986 }, { "epoch": 2.5157215047726, "grad_norm": 0.3845129907131195, "learning_rate": 5.812512587218956e-06, "loss": 0.3881, "step": 2987 }, { "epoch": 2.51656372824256, "grad_norm": 0.34884464740753174, "learning_rate": 5.809610523022844e-06, "loss": 0.3676, "step": 2988 }, { "epoch": 2.517405951712521, "grad_norm": 0.3748292028903961, "learning_rate": 5.806708178715372e-06, "loss": 0.3937, "step": 2989 }, { "epoch": 2.5182481751824817, "grad_norm": 0.33079707622528076, "learning_rate": 5.803805555300702e-06, "loss": 0.3721, "step": 2990 }, { "epoch": 2.5190903986524424, "grad_norm": 0.3665636479854584, "learning_rate": 5.800902653783093e-06, "loss": 0.3989, "step": 2991 }, { "epoch": 2.519932622122403, "grad_norm": 0.3741171956062317, "learning_rate": 5.797999475166897e-06, "loss": 0.3711, "step": 2992 }, { "epoch": 2.520774845592364, "grad_norm": 0.3749902844429016, "learning_rate": 5.7950960204565645e-06, "loss": 0.3857, "step": 2993 }, { "epoch": 2.5216170690623247, "grad_norm": 0.37113621830940247, "learning_rate": 5.792192290656643e-06, "loss": 0.3617, "step": 2994 }, { "epoch": 2.522459292532285, "grad_norm": 0.539384126663208, "learning_rate": 5.7892882867717705e-06, "loss": 0.4186, "step": 2995 }, { "epoch": 2.523301516002246, "grad_norm": 0.3763026297092438, "learning_rate": 5.786384009806685e-06, "loss": 0.3665, "step": 2996 }, { "epoch": 2.5241437394722066, "grad_norm": 0.4478371739387512, "learning_rate": 5.7834794607662135e-06, "loss": 0.3887, "step": 2997 }, { "epoch": 2.5249859629421674, "grad_norm": 0.43526655435562134, "learning_rate": 5.7805746406552855e-06, "loss": 0.3354, "step": 2998 }, { "epoch": 2.525828186412128, "grad_norm": 0.4430396258831024, "learning_rate": 5.777669550478911e-06, "loss": 0.38, "step": 2999 }, { "epoch": 2.5266704098820885, "grad_norm": 0.4721396267414093, "learning_rate": 5.77476419124221e-06, "loss": 0.4142, "step": 3000 }, { "epoch": 2.5275126333520497, "grad_norm": 0.42492103576660156, "learning_rate": 5.77185856395038e-06, "loss": 0.3976, "step": 3001 }, { "epoch": 2.52835485682201, "grad_norm": 0.456470251083374, "learning_rate": 5.768952669608724e-06, "loss": 0.3936, "step": 3002 }, { "epoch": 2.5291970802919708, "grad_norm": 0.4306612014770508, "learning_rate": 5.766046509222626e-06, "loss": 0.4009, "step": 3003 }, { "epoch": 2.5300393037619315, "grad_norm": 0.365220308303833, "learning_rate": 5.763140083797573e-06, "loss": 0.3766, "step": 3004 }, { "epoch": 2.5308815272318923, "grad_norm": 0.3650074899196625, "learning_rate": 5.760233394339132e-06, "loss": 0.3677, "step": 3005 }, { "epoch": 2.531723750701853, "grad_norm": 0.4445051848888397, "learning_rate": 5.757326441852974e-06, "loss": 0.375, "step": 3006 }, { "epoch": 2.5325659741718134, "grad_norm": 0.4065120220184326, "learning_rate": 5.7544192273448516e-06, "loss": 0.4149, "step": 3007 }, { "epoch": 2.533408197641774, "grad_norm": 0.46158796548843384, "learning_rate": 5.751511751820612e-06, "loss": 0.4025, "step": 3008 }, { "epoch": 2.534250421111735, "grad_norm": 0.3789588510990143, "learning_rate": 5.748604016286193e-06, "loss": 0.3859, "step": 3009 }, { "epoch": 2.5350926445816957, "grad_norm": 0.3844226002693176, "learning_rate": 5.745696021747617e-06, "loss": 0.4118, "step": 3010 }, { "epoch": 2.5359348680516565, "grad_norm": 0.4050949215888977, "learning_rate": 5.742787769211005e-06, "loss": 0.3795, "step": 3011 }, { "epoch": 2.5367770915216172, "grad_norm": 0.4401671886444092, "learning_rate": 5.7398792596825605e-06, "loss": 0.4125, "step": 3012 }, { "epoch": 2.537619314991578, "grad_norm": 0.39571118354797363, "learning_rate": 5.736970494168577e-06, "loss": 0.4307, "step": 3013 }, { "epoch": 2.5384615384615383, "grad_norm": 0.4175921082496643, "learning_rate": 5.7340614736754395e-06, "loss": 0.3897, "step": 3014 }, { "epoch": 2.539303761931499, "grad_norm": 0.39563316106796265, "learning_rate": 5.731152199209619e-06, "loss": 0.3935, "step": 3015 }, { "epoch": 2.54014598540146, "grad_norm": 0.3585171103477478, "learning_rate": 5.728242671777672e-06, "loss": 0.3614, "step": 3016 }, { "epoch": 2.5409882088714206, "grad_norm": 0.48127251863479614, "learning_rate": 5.725332892386248e-06, "loss": 0.4298, "step": 3017 }, { "epoch": 2.5418304323413814, "grad_norm": 0.39607399702072144, "learning_rate": 5.7224228620420795e-06, "loss": 0.395, "step": 3018 }, { "epoch": 2.5426726558113417, "grad_norm": 0.43835321068763733, "learning_rate": 5.719512581751988e-06, "loss": 0.3881, "step": 3019 }, { "epoch": 2.5435148792813025, "grad_norm": 0.3913244903087616, "learning_rate": 5.716602052522879e-06, "loss": 0.3741, "step": 3020 }, { "epoch": 2.5443571027512633, "grad_norm": 0.3401782810688019, "learning_rate": 5.713691275361746e-06, "loss": 0.4043, "step": 3021 }, { "epoch": 2.545199326221224, "grad_norm": 0.39362087845802307, "learning_rate": 5.710780251275671e-06, "loss": 0.3873, "step": 3022 }, { "epoch": 2.546041549691185, "grad_norm": 0.3383714556694031, "learning_rate": 5.707868981271815e-06, "loss": 0.3554, "step": 3023 }, { "epoch": 2.5468837731611456, "grad_norm": 0.3880877196788788, "learning_rate": 5.704957466357428e-06, "loss": 0.4365, "step": 3024 }, { "epoch": 2.5477259966311063, "grad_norm": 0.3583623766899109, "learning_rate": 5.702045707539844e-06, "loss": 0.3904, "step": 3025 }, { "epoch": 2.5485682201010667, "grad_norm": 0.3455004394054413, "learning_rate": 5.699133705826485e-06, "loss": 0.344, "step": 3026 }, { "epoch": 2.5494104435710274, "grad_norm": 0.3816172480583191, "learning_rate": 5.696221462224848e-06, "loss": 0.3899, "step": 3027 }, { "epoch": 2.550252667040988, "grad_norm": 0.3781954050064087, "learning_rate": 5.693308977742525e-06, "loss": 0.4241, "step": 3028 }, { "epoch": 2.551094890510949, "grad_norm": 0.3922063410282135, "learning_rate": 5.690396253387182e-06, "loss": 0.3392, "step": 3029 }, { "epoch": 2.5519371139809097, "grad_norm": 0.4144710898399353, "learning_rate": 5.687483290166574e-06, "loss": 0.3925, "step": 3030 }, { "epoch": 2.55277933745087, "grad_norm": 0.3982730507850647, "learning_rate": 5.684570089088533e-06, "loss": 0.3781, "step": 3031 }, { "epoch": 2.5536215609208313, "grad_norm": 0.405192106962204, "learning_rate": 5.681656651160981e-06, "loss": 0.3625, "step": 3032 }, { "epoch": 2.5544637843907916, "grad_norm": 0.3838033080101013, "learning_rate": 5.678742977391912e-06, "loss": 0.3824, "step": 3033 }, { "epoch": 2.5553060078607523, "grad_norm": 0.41642579436302185, "learning_rate": 5.6758290687894135e-06, "loss": 0.4442, "step": 3034 }, { "epoch": 2.556148231330713, "grad_norm": 0.40523770451545715, "learning_rate": 5.672914926361644e-06, "loss": 0.3827, "step": 3035 }, { "epoch": 2.556990454800674, "grad_norm": 0.43969064950942993, "learning_rate": 5.670000551116847e-06, "loss": 0.466, "step": 3036 }, { "epoch": 2.5578326782706347, "grad_norm": 0.41155844926834106, "learning_rate": 5.667085944063349e-06, "loss": 0.3855, "step": 3037 }, { "epoch": 2.558674901740595, "grad_norm": 0.4095722734928131, "learning_rate": 5.664171106209551e-06, "loss": 0.4004, "step": 3038 }, { "epoch": 2.5595171252105557, "grad_norm": 0.40058058500289917, "learning_rate": 5.661256038563937e-06, "loss": 0.3863, "step": 3039 }, { "epoch": 2.5603593486805165, "grad_norm": 0.3895190358161926, "learning_rate": 5.658340742135073e-06, "loss": 0.3842, "step": 3040 }, { "epoch": 2.5612015721504773, "grad_norm": 0.4184803366661072, "learning_rate": 5.6554252179315994e-06, "loss": 0.4082, "step": 3041 }, { "epoch": 2.562043795620438, "grad_norm": 0.36171776056289673, "learning_rate": 5.6525094669622395e-06, "loss": 0.3787, "step": 3042 }, { "epoch": 2.562886019090399, "grad_norm": 0.3943457007408142, "learning_rate": 5.649593490235789e-06, "loss": 0.419, "step": 3043 }, { "epoch": 2.5637282425603596, "grad_norm": 0.39112892746925354, "learning_rate": 5.646677288761132e-06, "loss": 0.3511, "step": 3044 }, { "epoch": 2.56457046603032, "grad_norm": 0.382711797952652, "learning_rate": 5.64376086354722e-06, "loss": 0.3922, "step": 3045 }, { "epoch": 2.5654126895002807, "grad_norm": 0.37876996397972107, "learning_rate": 5.640844215603085e-06, "loss": 0.3884, "step": 3046 }, { "epoch": 2.5662549129702414, "grad_norm": 0.39426007866859436, "learning_rate": 5.637927345937843e-06, "loss": 0.397, "step": 3047 }, { "epoch": 2.567097136440202, "grad_norm": 0.4113256633281708, "learning_rate": 5.635010255560673e-06, "loss": 0.396, "step": 3048 }, { "epoch": 2.567939359910163, "grad_norm": 0.3657740652561188, "learning_rate": 5.632092945480847e-06, "loss": 0.3943, "step": 3049 }, { "epoch": 2.5687815833801233, "grad_norm": 0.41256630420684814, "learning_rate": 5.629175416707696e-06, "loss": 0.3982, "step": 3050 }, { "epoch": 2.569623806850084, "grad_norm": 0.3764817416667938, "learning_rate": 5.626257670250641e-06, "loss": 0.4004, "step": 3051 }, { "epoch": 2.570466030320045, "grad_norm": 0.4086979627609253, "learning_rate": 5.6233397071191675e-06, "loss": 0.3746, "step": 3052 }, { "epoch": 2.5713082537900056, "grad_norm": 0.36587581038475037, "learning_rate": 5.6204215283228455e-06, "loss": 0.3639, "step": 3053 }, { "epoch": 2.5721504772599664, "grad_norm": 0.428960919380188, "learning_rate": 5.61750313487131e-06, "loss": 0.3918, "step": 3054 }, { "epoch": 2.572992700729927, "grad_norm": 0.39079415798187256, "learning_rate": 5.61458452777428e-06, "loss": 0.3918, "step": 3055 }, { "epoch": 2.573834924199888, "grad_norm": 0.3480561673641205, "learning_rate": 5.611665708041538e-06, "loss": 0.3832, "step": 3056 }, { "epoch": 2.5746771476698482, "grad_norm": 0.40771323442459106, "learning_rate": 5.608746676682952e-06, "loss": 0.3881, "step": 3057 }, { "epoch": 2.575519371139809, "grad_norm": 0.4093663990497589, "learning_rate": 5.605827434708451e-06, "loss": 0.3906, "step": 3058 }, { "epoch": 2.5763615946097698, "grad_norm": 0.3954639434814453, "learning_rate": 5.602907983128045e-06, "loss": 0.3786, "step": 3059 }, { "epoch": 2.5772038180797305, "grad_norm": 0.4054628908634186, "learning_rate": 5.5999883229518155e-06, "loss": 0.3805, "step": 3060 }, { "epoch": 2.5780460415496913, "grad_norm": 0.3492980897426605, "learning_rate": 5.597068455189914e-06, "loss": 0.3567, "step": 3061 }, { "epoch": 2.5788882650196516, "grad_norm": 0.3909687399864197, "learning_rate": 5.594148380852563e-06, "loss": 0.4177, "step": 3062 }, { "epoch": 2.579730488489613, "grad_norm": 0.3912852704524994, "learning_rate": 5.59122810095006e-06, "loss": 0.4124, "step": 3063 }, { "epoch": 2.580572711959573, "grad_norm": 0.3870551586151123, "learning_rate": 5.588307616492771e-06, "loss": 0.4071, "step": 3064 }, { "epoch": 2.581414935429534, "grad_norm": 0.4188952147960663, "learning_rate": 5.585386928491134e-06, "loss": 0.3919, "step": 3065 }, { "epoch": 2.5822571588994947, "grad_norm": 0.3753717541694641, "learning_rate": 5.582466037955657e-06, "loss": 0.4008, "step": 3066 }, { "epoch": 2.5830993823694555, "grad_norm": 0.43499618768692017, "learning_rate": 5.579544945896918e-06, "loss": 0.4138, "step": 3067 }, { "epoch": 2.5839416058394162, "grad_norm": 0.3667522966861725, "learning_rate": 5.576623653325563e-06, "loss": 0.349, "step": 3068 }, { "epoch": 2.5847838293093766, "grad_norm": 0.3791707158088684, "learning_rate": 5.573702161252313e-06, "loss": 0.376, "step": 3069 }, { "epoch": 2.5856260527793373, "grad_norm": 0.3904006779193878, "learning_rate": 5.57078047068795e-06, "loss": 0.3872, "step": 3070 }, { "epoch": 2.586468276249298, "grad_norm": 0.39937901496887207, "learning_rate": 5.56785858264333e-06, "loss": 0.384, "step": 3071 }, { "epoch": 2.587310499719259, "grad_norm": 0.37881705164909363, "learning_rate": 5.564936498129378e-06, "loss": 0.384, "step": 3072 }, { "epoch": 2.5881527231892196, "grad_norm": 0.34295156598091125, "learning_rate": 5.562014218157085e-06, "loss": 0.3743, "step": 3073 }, { "epoch": 2.5889949466591804, "grad_norm": 0.3662683367729187, "learning_rate": 5.559091743737508e-06, "loss": 0.3704, "step": 3074 }, { "epoch": 2.589837170129141, "grad_norm": 0.38300392031669617, "learning_rate": 5.556169075881774e-06, "loss": 0.411, "step": 3075 }, { "epoch": 2.5906793935991015, "grad_norm": 0.3559247851371765, "learning_rate": 5.553246215601076e-06, "loss": 0.3882, "step": 3076 }, { "epoch": 2.5915216170690623, "grad_norm": 0.3867385983467102, "learning_rate": 5.550323163906672e-06, "loss": 0.401, "step": 3077 }, { "epoch": 2.592363840539023, "grad_norm": 0.3551448583602905, "learning_rate": 5.5473999218098885e-06, "loss": 0.3582, "step": 3078 }, { "epoch": 2.593206064008984, "grad_norm": 0.36177685856819153, "learning_rate": 5.544476490322121e-06, "loss": 0.3758, "step": 3079 }, { "epoch": 2.5940482874789446, "grad_norm": 0.38491183519363403, "learning_rate": 5.54155287045482e-06, "loss": 0.4155, "step": 3080 }, { "epoch": 2.594890510948905, "grad_norm": 0.34598803520202637, "learning_rate": 5.5386290632195135e-06, "loss": 0.35, "step": 3081 }, { "epoch": 2.5957327344188657, "grad_norm": 0.3992313742637634, "learning_rate": 5.535705069627785e-06, "loss": 0.4226, "step": 3082 }, { "epoch": 2.5965749578888264, "grad_norm": 0.35024377703666687, "learning_rate": 5.5327808906912895e-06, "loss": 0.3546, "step": 3083 }, { "epoch": 2.597417181358787, "grad_norm": 0.3785564601421356, "learning_rate": 5.529856527421738e-06, "loss": 0.3687, "step": 3084 }, { "epoch": 2.598259404828748, "grad_norm": 0.40748465061187744, "learning_rate": 5.526931980830916e-06, "loss": 0.4018, "step": 3085 }, { "epoch": 2.5991016282987087, "grad_norm": 0.37709474563598633, "learning_rate": 5.524007251930661e-06, "loss": 0.4145, "step": 3086 }, { "epoch": 2.5999438517686695, "grad_norm": 0.39273905754089355, "learning_rate": 5.521082341732883e-06, "loss": 0.4059, "step": 3087 }, { "epoch": 2.60078607523863, "grad_norm": 0.3337588310241699, "learning_rate": 5.518157251249548e-06, "loss": 0.3541, "step": 3088 }, { "epoch": 2.6016282987085906, "grad_norm": 0.390048623085022, "learning_rate": 5.51523198149269e-06, "loss": 0.4058, "step": 3089 }, { "epoch": 2.6024705221785513, "grad_norm": 0.3355844020843506, "learning_rate": 5.512306533474398e-06, "loss": 0.3879, "step": 3090 }, { "epoch": 2.603312745648512, "grad_norm": 0.40150943398475647, "learning_rate": 5.509380908206831e-06, "loss": 0.3942, "step": 3091 }, { "epoch": 2.604154969118473, "grad_norm": 0.3397875428199768, "learning_rate": 5.506455106702203e-06, "loss": 0.3727, "step": 3092 }, { "epoch": 2.604997192588433, "grad_norm": 0.369993656873703, "learning_rate": 5.503529129972792e-06, "loss": 0.4066, "step": 3093 }, { "epoch": 2.6058394160583944, "grad_norm": 0.4002041518688202, "learning_rate": 5.5006029790309356e-06, "loss": 0.3698, "step": 3094 }, { "epoch": 2.6066816395283547, "grad_norm": 0.37943804264068604, "learning_rate": 5.497676654889032e-06, "loss": 0.4484, "step": 3095 }, { "epoch": 2.6075238629983155, "grad_norm": 0.3716532588005066, "learning_rate": 5.494750158559538e-06, "loss": 0.3714, "step": 3096 }, { "epoch": 2.6083660864682763, "grad_norm": 0.40470871329307556, "learning_rate": 5.4918234910549736e-06, "loss": 0.417, "step": 3097 }, { "epoch": 2.609208309938237, "grad_norm": 0.34361937642097473, "learning_rate": 5.488896653387913e-06, "loss": 0.3461, "step": 3098 }, { "epoch": 2.610050533408198, "grad_norm": 0.4504651725292206, "learning_rate": 5.485969646570992e-06, "loss": 0.4235, "step": 3099 }, { "epoch": 2.610892756878158, "grad_norm": 0.370505690574646, "learning_rate": 5.483042471616909e-06, "loss": 0.3927, "step": 3100 }, { "epoch": 2.611734980348119, "grad_norm": 0.3409562408924103, "learning_rate": 5.480115129538409e-06, "loss": 0.3897, "step": 3101 }, { "epoch": 2.6125772038180797, "grad_norm": 0.4113667607307434, "learning_rate": 5.477187621348309e-06, "loss": 0.3716, "step": 3102 }, { "epoch": 2.6134194272880404, "grad_norm": 0.38605421781539917, "learning_rate": 5.474259948059471e-06, "loss": 0.377, "step": 3103 }, { "epoch": 2.614261650758001, "grad_norm": 0.38146403431892395, "learning_rate": 5.471332110684826e-06, "loss": 0.3693, "step": 3104 }, { "epoch": 2.615103874227962, "grad_norm": 0.3647834062576294, "learning_rate": 5.4684041102373495e-06, "loss": 0.366, "step": 3105 }, { "epoch": 2.6159460976979227, "grad_norm": 0.43951794505119324, "learning_rate": 5.4654759477300845e-06, "loss": 0.4014, "step": 3106 }, { "epoch": 2.616788321167883, "grad_norm": 0.38851335644721985, "learning_rate": 5.46254762417612e-06, "loss": 0.4182, "step": 3107 }, { "epoch": 2.617630544637844, "grad_norm": 0.42629769444465637, "learning_rate": 5.4596191405886114e-06, "loss": 0.411, "step": 3108 }, { "epoch": 2.6184727681078046, "grad_norm": 0.43121039867401123, "learning_rate": 5.456690497980758e-06, "loss": 0.4097, "step": 3109 }, { "epoch": 2.6193149915777654, "grad_norm": 0.3703431487083435, "learning_rate": 5.453761697365825e-06, "loss": 0.3473, "step": 3110 }, { "epoch": 2.620157215047726, "grad_norm": 0.39413881301879883, "learning_rate": 5.450832739757121e-06, "loss": 0.3689, "step": 3111 }, { "epoch": 2.6209994385176865, "grad_norm": 0.4629417061805725, "learning_rate": 5.447903626168022e-06, "loss": 0.3965, "step": 3112 }, { "epoch": 2.6218416619876472, "grad_norm": 0.37741607427597046, "learning_rate": 5.444974357611947e-06, "loss": 0.3744, "step": 3113 }, { "epoch": 2.622683885457608, "grad_norm": 0.3974893093109131, "learning_rate": 5.442044935102376e-06, "loss": 0.3912, "step": 3114 }, { "epoch": 2.6235261089275688, "grad_norm": 0.39115428924560547, "learning_rate": 5.439115359652834e-06, "loss": 0.3797, "step": 3115 }, { "epoch": 2.6243683323975295, "grad_norm": 0.40451374650001526, "learning_rate": 5.436185632276908e-06, "loss": 0.3918, "step": 3116 }, { "epoch": 2.6252105558674903, "grad_norm": 0.3677242398262024, "learning_rate": 5.433255753988232e-06, "loss": 0.3763, "step": 3117 }, { "epoch": 2.626052779337451, "grad_norm": 0.3624890148639679, "learning_rate": 5.430325725800492e-06, "loss": 0.4031, "step": 3118 }, { "epoch": 2.6268950028074114, "grad_norm": 0.45376747846603394, "learning_rate": 5.427395548727432e-06, "loss": 0.4177, "step": 3119 }, { "epoch": 2.627737226277372, "grad_norm": 0.3797759711742401, "learning_rate": 5.424465223782839e-06, "loss": 0.371, "step": 3120 }, { "epoch": 2.628579449747333, "grad_norm": 0.4049317538738251, "learning_rate": 5.421534751980556e-06, "loss": 0.4026, "step": 3121 }, { "epoch": 2.6294216732172937, "grad_norm": 0.3794504702091217, "learning_rate": 5.4186041343344764e-06, "loss": 0.3626, "step": 3122 }, { "epoch": 2.6302638966872545, "grad_norm": 0.41850709915161133, "learning_rate": 5.415673371858544e-06, "loss": 0.3818, "step": 3123 }, { "epoch": 2.631106120157215, "grad_norm": 0.3612701892852783, "learning_rate": 5.4127424655667515e-06, "loss": 0.3592, "step": 3124 }, { "epoch": 2.631948343627176, "grad_norm": 0.3870510756969452, "learning_rate": 5.409811416473144e-06, "loss": 0.4442, "step": 3125 }, { "epoch": 2.6327905670971363, "grad_norm": 0.3946583569049835, "learning_rate": 5.406880225591812e-06, "loss": 0.3774, "step": 3126 }, { "epoch": 2.633632790567097, "grad_norm": 0.39812400937080383, "learning_rate": 5.403948893936899e-06, "loss": 0.4196, "step": 3127 }, { "epoch": 2.634475014037058, "grad_norm": 0.34786084294319153, "learning_rate": 5.401017422522594e-06, "loss": 0.4074, "step": 3128 }, { "epoch": 2.6353172375070186, "grad_norm": 0.3542838990688324, "learning_rate": 5.398085812363136e-06, "loss": 0.3591, "step": 3129 }, { "epoch": 2.6361594609769794, "grad_norm": 0.3497769832611084, "learning_rate": 5.395154064472814e-06, "loss": 0.3661, "step": 3130 }, { "epoch": 2.6370016844469397, "grad_norm": 0.3513258099555969, "learning_rate": 5.39222217986596e-06, "loss": 0.3957, "step": 3131 }, { "epoch": 2.6378439079169005, "grad_norm": 0.39762082695961, "learning_rate": 5.389290159556958e-06, "loss": 0.4053, "step": 3132 }, { "epoch": 2.6386861313868613, "grad_norm": 0.4580541253089905, "learning_rate": 5.386358004560234e-06, "loss": 0.4328, "step": 3133 }, { "epoch": 2.639528354856822, "grad_norm": 0.3332023620605469, "learning_rate": 5.383425715890266e-06, "loss": 0.3719, "step": 3134 }, { "epoch": 2.640370578326783, "grad_norm": 0.4008234739303589, "learning_rate": 5.380493294561573e-06, "loss": 0.3686, "step": 3135 }, { "epoch": 2.6412128017967436, "grad_norm": 0.39120978116989136, "learning_rate": 5.377560741588727e-06, "loss": 0.4102, "step": 3136 }, { "epoch": 2.6420550252667043, "grad_norm": 0.38271060585975647, "learning_rate": 5.374628057986334e-06, "loss": 0.3985, "step": 3137 }, { "epoch": 2.6428972487366647, "grad_norm": 0.3737398087978363, "learning_rate": 5.371695244769059e-06, "loss": 0.3907, "step": 3138 }, { "epoch": 2.6437394722066254, "grad_norm": 0.39865002036094666, "learning_rate": 5.368762302951601e-06, "loss": 0.3718, "step": 3139 }, { "epoch": 2.644581695676586, "grad_norm": 0.37358734011650085, "learning_rate": 5.3658292335487105e-06, "loss": 0.3726, "step": 3140 }, { "epoch": 2.645423919146547, "grad_norm": 0.3682592511177063, "learning_rate": 5.362896037575176e-06, "loss": 0.4132, "step": 3141 }, { "epoch": 2.6462661426165077, "grad_norm": 0.4024572968482971, "learning_rate": 5.359962716045836e-06, "loss": 0.3747, "step": 3142 }, { "epoch": 2.647108366086468, "grad_norm": 0.3973121643066406, "learning_rate": 5.3570292699755675e-06, "loss": 0.3809, "step": 3143 }, { "epoch": 2.647950589556429, "grad_norm": 0.3686797022819519, "learning_rate": 5.354095700379294e-06, "loss": 0.3955, "step": 3144 }, { "epoch": 2.6487928130263896, "grad_norm": 0.37866440415382385, "learning_rate": 5.351162008271978e-06, "loss": 0.3874, "step": 3145 }, { "epoch": 2.6496350364963503, "grad_norm": 0.41592052578926086, "learning_rate": 5.3482281946686295e-06, "loss": 0.382, "step": 3146 }, { "epoch": 2.650477259966311, "grad_norm": 0.4094146192073822, "learning_rate": 5.345294260584296e-06, "loss": 0.4215, "step": 3147 }, { "epoch": 2.651319483436272, "grad_norm": 0.3455742597579956, "learning_rate": 5.3423602070340686e-06, "loss": 0.3792, "step": 3148 }, { "epoch": 2.6521617069062327, "grad_norm": 0.3974559009075165, "learning_rate": 5.33942603503308e-06, "loss": 0.4112, "step": 3149 }, { "epoch": 2.653003930376193, "grad_norm": 0.3913099765777588, "learning_rate": 5.3364917455965014e-06, "loss": 0.4018, "step": 3150 }, { "epoch": 2.6538461538461537, "grad_norm": 0.37142035365104675, "learning_rate": 5.33355733973955e-06, "loss": 0.3729, "step": 3151 }, { "epoch": 2.6546883773161145, "grad_norm": 0.37174728512763977, "learning_rate": 5.3306228184774765e-06, "loss": 0.4191, "step": 3152 }, { "epoch": 2.6555306007860753, "grad_norm": 0.38738808035850525, "learning_rate": 5.327688182825579e-06, "loss": 0.3943, "step": 3153 }, { "epoch": 2.656372824256036, "grad_norm": 0.41029322147369385, "learning_rate": 5.324753433799187e-06, "loss": 0.4068, "step": 3154 }, { "epoch": 2.6572150477259964, "grad_norm": 0.37115633487701416, "learning_rate": 5.321818572413676e-06, "loss": 0.3581, "step": 3155 }, { "epoch": 2.6580572711959576, "grad_norm": 0.4003068506717682, "learning_rate": 5.3188835996844555e-06, "loss": 0.3914, "step": 3156 }, { "epoch": 2.658899494665918, "grad_norm": 0.4772467017173767, "learning_rate": 5.31594851662698e-06, "loss": 0.4141, "step": 3157 }, { "epoch": 2.6597417181358787, "grad_norm": 0.37931397557258606, "learning_rate": 5.313013324256732e-06, "loss": 0.3808, "step": 3158 }, { "epoch": 2.6605839416058394, "grad_norm": 0.38873109221458435, "learning_rate": 5.3100780235892425e-06, "loss": 0.3855, "step": 3159 }, { "epoch": 2.6614261650758, "grad_norm": 0.38812899589538574, "learning_rate": 5.307142615640072e-06, "loss": 0.3863, "step": 3160 }, { "epoch": 2.662268388545761, "grad_norm": 0.3902297019958496, "learning_rate": 5.3042071014248245e-06, "loss": 0.3721, "step": 3161 }, { "epoch": 2.6631106120157213, "grad_norm": 0.556327223777771, "learning_rate": 5.301271481959134e-06, "loss": 0.4008, "step": 3162 }, { "epoch": 2.663952835485682, "grad_norm": 0.3599003553390503, "learning_rate": 5.298335758258678e-06, "loss": 0.3829, "step": 3163 }, { "epoch": 2.664795058955643, "grad_norm": 0.4147457778453827, "learning_rate": 5.295399931339162e-06, "loss": 0.3831, "step": 3164 }, { "epoch": 2.6656372824256036, "grad_norm": 0.41180235147476196, "learning_rate": 5.2924640022163375e-06, "loss": 0.3627, "step": 3165 }, { "epoch": 2.6664795058955644, "grad_norm": 0.3867041766643524, "learning_rate": 5.289527971905982e-06, "loss": 0.3992, "step": 3166 }, { "epoch": 2.667321729365525, "grad_norm": 0.4360514283180237, "learning_rate": 5.286591841423913e-06, "loss": 0.3949, "step": 3167 }, { "epoch": 2.668163952835486, "grad_norm": 0.3877964913845062, "learning_rate": 5.28365561178598e-06, "loss": 0.4048, "step": 3168 }, { "epoch": 2.6690061763054462, "grad_norm": 0.38870421051979065, "learning_rate": 5.280719284008069e-06, "loss": 0.4183, "step": 3169 }, { "epoch": 2.669848399775407, "grad_norm": 0.4055136740207672, "learning_rate": 5.277782859106099e-06, "loss": 0.3989, "step": 3170 }, { "epoch": 2.6706906232453678, "grad_norm": 0.36197206377983093, "learning_rate": 5.274846338096022e-06, "loss": 0.3498, "step": 3171 }, { "epoch": 2.6715328467153285, "grad_norm": 0.36664846539497375, "learning_rate": 5.271909721993826e-06, "loss": 0.3759, "step": 3172 }, { "epoch": 2.6723750701852893, "grad_norm": 0.43300771713256836, "learning_rate": 5.268973011815526e-06, "loss": 0.4182, "step": 3173 }, { "epoch": 2.6732172936552496, "grad_norm": 0.3705335855484009, "learning_rate": 5.266036208577178e-06, "loss": 0.4041, "step": 3174 }, { "epoch": 2.6740595171252104, "grad_norm": 0.3514716327190399, "learning_rate": 5.2630993132948615e-06, "loss": 0.3787, "step": 3175 }, { "epoch": 2.674901740595171, "grad_norm": 0.42734426259994507, "learning_rate": 5.260162326984694e-06, "loss": 0.4289, "step": 3176 }, { "epoch": 2.675743964065132, "grad_norm": 0.3766149580478668, "learning_rate": 5.257225250662823e-06, "loss": 0.3607, "step": 3177 }, { "epoch": 2.6765861875350927, "grad_norm": 0.3946704864501953, "learning_rate": 5.254288085345426e-06, "loss": 0.3652, "step": 3178 }, { "epoch": 2.6774284110050535, "grad_norm": 0.36888572573661804, "learning_rate": 5.251350832048711e-06, "loss": 0.3717, "step": 3179 }, { "epoch": 2.6782706344750142, "grad_norm": 0.3710102438926697, "learning_rate": 5.2484134917889175e-06, "loss": 0.4124, "step": 3180 }, { "epoch": 2.6791128579449746, "grad_norm": 0.35768911242485046, "learning_rate": 5.245476065582315e-06, "loss": 0.3759, "step": 3181 }, { "epoch": 2.6799550814149353, "grad_norm": 0.40674057602882385, "learning_rate": 5.242538554445202e-06, "loss": 0.3984, "step": 3182 }, { "epoch": 2.680797304884896, "grad_norm": 0.37219923734664917, "learning_rate": 5.239600959393909e-06, "loss": 0.4036, "step": 3183 }, { "epoch": 2.681639528354857, "grad_norm": 0.36389583349227905, "learning_rate": 5.236663281444792e-06, "loss": 0.3674, "step": 3184 }, { "epoch": 2.6824817518248176, "grad_norm": 0.3803427219390869, "learning_rate": 5.233725521614237e-06, "loss": 0.3698, "step": 3185 }, { "epoch": 2.683323975294778, "grad_norm": 0.3597753942012787, "learning_rate": 5.230787680918657e-06, "loss": 0.385, "step": 3186 }, { "epoch": 2.684166198764739, "grad_norm": 0.38453593850135803, "learning_rate": 5.227849760374501e-06, "loss": 0.4084, "step": 3187 }, { "epoch": 2.6850084222346995, "grad_norm": 0.3709598779678345, "learning_rate": 5.22491176099823e-06, "loss": 0.3812, "step": 3188 }, { "epoch": 2.6858506457046603, "grad_norm": 0.36196935176849365, "learning_rate": 5.22197368380635e-06, "loss": 0.3974, "step": 3189 }, { "epoch": 2.686692869174621, "grad_norm": 0.38163137435913086, "learning_rate": 5.2190355298153784e-06, "loss": 0.3647, "step": 3190 }, { "epoch": 2.687535092644582, "grad_norm": 0.36286211013793945, "learning_rate": 5.21609730004187e-06, "loss": 0.3912, "step": 3191 }, { "epoch": 2.6883773161145426, "grad_norm": 0.39574795961380005, "learning_rate": 5.213158995502402e-06, "loss": 0.3985, "step": 3192 }, { "epoch": 2.689219539584503, "grad_norm": 0.33197519183158875, "learning_rate": 5.210220617213575e-06, "loss": 0.3537, "step": 3193 }, { "epoch": 2.6900617630544637, "grad_norm": 0.3796265125274658, "learning_rate": 5.207282166192019e-06, "loss": 0.4051, "step": 3194 }, { "epoch": 2.6909039865244244, "grad_norm": 0.37042126059532166, "learning_rate": 5.204343643454388e-06, "loss": 0.3645, "step": 3195 }, { "epoch": 2.691746209994385, "grad_norm": 0.33985665440559387, "learning_rate": 5.201405050017361e-06, "loss": 0.3925, "step": 3196 }, { "epoch": 2.692588433464346, "grad_norm": 0.36096906661987305, "learning_rate": 5.198466386897638e-06, "loss": 0.4091, "step": 3197 }, { "epoch": 2.6934306569343067, "grad_norm": 0.35122644901275635, "learning_rate": 5.1955276551119495e-06, "loss": 0.3702, "step": 3198 }, { "epoch": 2.6942728804042675, "grad_norm": 0.360970139503479, "learning_rate": 5.192588855677044e-06, "loss": 0.3854, "step": 3199 }, { "epoch": 2.695115103874228, "grad_norm": 0.372591108083725, "learning_rate": 5.189649989609697e-06, "loss": 0.3814, "step": 3200 }, { "epoch": 2.6959573273441886, "grad_norm": 0.34262174367904663, "learning_rate": 5.186711057926705e-06, "loss": 0.3679, "step": 3201 }, { "epoch": 2.6967995508141493, "grad_norm": 0.35509973764419556, "learning_rate": 5.18377206164489e-06, "loss": 0.4297, "step": 3202 }, { "epoch": 2.69764177428411, "grad_norm": 0.3610772490501404, "learning_rate": 5.180833001781091e-06, "loss": 0.3828, "step": 3203 }, { "epoch": 2.698483997754071, "grad_norm": 0.3429718017578125, "learning_rate": 5.177893879352173e-06, "loss": 0.3402, "step": 3204 }, { "epoch": 2.699326221224031, "grad_norm": 0.33700406551361084, "learning_rate": 5.174954695375023e-06, "loss": 0.3859, "step": 3205 }, { "epoch": 2.700168444693992, "grad_norm": 0.3637140691280365, "learning_rate": 5.17201545086655e-06, "loss": 0.4122, "step": 3206 }, { "epoch": 2.7010106681639527, "grad_norm": 0.3163653612136841, "learning_rate": 5.169076146843679e-06, "loss": 0.3668, "step": 3207 }, { "epoch": 2.7018528916339135, "grad_norm": 0.3610011339187622, "learning_rate": 5.166136784323362e-06, "loss": 0.3792, "step": 3208 }, { "epoch": 2.7026951151038743, "grad_norm": 0.38508179783821106, "learning_rate": 5.163197364322563e-06, "loss": 0.4319, "step": 3209 }, { "epoch": 2.703537338573835, "grad_norm": 0.3577013611793518, "learning_rate": 5.160257887858278e-06, "loss": 0.3988, "step": 3210 }, { "epoch": 2.704379562043796, "grad_norm": 0.34796565771102905, "learning_rate": 5.157318355947507e-06, "loss": 0.3803, "step": 3211 }, { "epoch": 2.705221785513756, "grad_norm": 0.386869877576828, "learning_rate": 5.154378769607287e-06, "loss": 0.3766, "step": 3212 }, { "epoch": 2.706064008983717, "grad_norm": 0.3618468940258026, "learning_rate": 5.151439129854655e-06, "loss": 0.364, "step": 3213 }, { "epoch": 2.7069062324536777, "grad_norm": 0.45711106061935425, "learning_rate": 5.148499437706684e-06, "loss": 0.4039, "step": 3214 }, { "epoch": 2.7077484559236384, "grad_norm": 0.3537876605987549, "learning_rate": 5.145559694180452e-06, "loss": 0.3623, "step": 3215 }, { "epoch": 2.708590679393599, "grad_norm": 0.3212081491947174, "learning_rate": 5.142619900293064e-06, "loss": 0.3755, "step": 3216 }, { "epoch": 2.7094329028635595, "grad_norm": 0.3734576404094696, "learning_rate": 5.139680057061634e-06, "loss": 0.3941, "step": 3217 }, { "epoch": 2.7102751263335207, "grad_norm": 0.37947675585746765, "learning_rate": 5.136740165503298e-06, "loss": 0.4026, "step": 3218 }, { "epoch": 2.711117349803481, "grad_norm": 0.3476470112800598, "learning_rate": 5.133800226635211e-06, "loss": 0.378, "step": 3219 }, { "epoch": 2.711959573273442, "grad_norm": 0.39289024472236633, "learning_rate": 5.130860241474539e-06, "loss": 0.4, "step": 3220 }, { "epoch": 2.7128017967434026, "grad_norm": 0.3927766978740692, "learning_rate": 5.127920211038467e-06, "loss": 0.3908, "step": 3221 }, { "epoch": 2.7136440202133634, "grad_norm": 0.40153294801712036, "learning_rate": 5.124980136344196e-06, "loss": 0.4014, "step": 3222 }, { "epoch": 2.714486243683324, "grad_norm": 0.3925326466560364, "learning_rate": 5.122040018408941e-06, "loss": 0.3593, "step": 3223 }, { "epoch": 2.7153284671532845, "grad_norm": 0.4521726667881012, "learning_rate": 5.119099858249931e-06, "loss": 0.3875, "step": 3224 }, { "epoch": 2.7161706906232452, "grad_norm": 0.457263320684433, "learning_rate": 5.116159656884414e-06, "loss": 0.4094, "step": 3225 }, { "epoch": 2.717012914093206, "grad_norm": 0.3893144726753235, "learning_rate": 5.113219415329645e-06, "loss": 0.3924, "step": 3226 }, { "epoch": 2.7178551375631668, "grad_norm": 0.4042649269104004, "learning_rate": 5.110279134602901e-06, "loss": 0.3522, "step": 3227 }, { "epoch": 2.7186973610331275, "grad_norm": 0.43926742672920227, "learning_rate": 5.107338815721468e-06, "loss": 0.4106, "step": 3228 }, { "epoch": 2.7195395845030883, "grad_norm": 0.48955562710762024, "learning_rate": 5.1043984597026446e-06, "loss": 0.4011, "step": 3229 }, { "epoch": 2.720381807973049, "grad_norm": 0.3846514821052551, "learning_rate": 5.101458067563743e-06, "loss": 0.4174, "step": 3230 }, { "epoch": 2.7212240314430094, "grad_norm": 0.3860557973384857, "learning_rate": 5.09851764032209e-06, "loss": 0.3949, "step": 3231 }, { "epoch": 2.72206625491297, "grad_norm": 0.42849665880203247, "learning_rate": 5.095577178995022e-06, "loss": 0.3911, "step": 3232 }, { "epoch": 2.722908478382931, "grad_norm": 0.4091935157775879, "learning_rate": 5.092636684599891e-06, "loss": 0.3717, "step": 3233 }, { "epoch": 2.7237507018528917, "grad_norm": 0.35414057970046997, "learning_rate": 5.089696158154054e-06, "loss": 0.3506, "step": 3234 }, { "epoch": 2.7245929253228525, "grad_norm": 0.39574792981147766, "learning_rate": 5.0867556006748845e-06, "loss": 0.376, "step": 3235 }, { "epoch": 2.725435148792813, "grad_norm": 0.37956681847572327, "learning_rate": 5.083815013179765e-06, "loss": 0.3832, "step": 3236 }, { "epoch": 2.7262773722627736, "grad_norm": 0.3764835000038147, "learning_rate": 5.080874396686087e-06, "loss": 0.4041, "step": 3237 }, { "epoch": 2.7271195957327343, "grad_norm": 0.3597446382045746, "learning_rate": 5.077933752211255e-06, "loss": 0.3557, "step": 3238 }, { "epoch": 2.727961819202695, "grad_norm": 0.38447996973991394, "learning_rate": 5.0749930807726795e-06, "loss": 0.4015, "step": 3239 }, { "epoch": 2.728804042672656, "grad_norm": 0.3611484467983246, "learning_rate": 5.072052383387787e-06, "loss": 0.347, "step": 3240 }, { "epoch": 2.7296462661426166, "grad_norm": 0.39225009083747864, "learning_rate": 5.069111661074003e-06, "loss": 0.4074, "step": 3241 }, { "epoch": 2.7304884896125774, "grad_norm": 0.37258288264274597, "learning_rate": 5.066170914848769e-06, "loss": 0.3997, "step": 3242 }, { "epoch": 2.7313307130825377, "grad_norm": 0.37461987137794495, "learning_rate": 5.0632301457295356e-06, "loss": 0.3856, "step": 3243 }, { "epoch": 2.7321729365524985, "grad_norm": 0.3335910141468048, "learning_rate": 5.060289354733753e-06, "loss": 0.3666, "step": 3244 }, { "epoch": 2.7330151600224593, "grad_norm": 0.3433711528778076, "learning_rate": 5.057348542878889e-06, "loss": 0.3594, "step": 3245 }, { "epoch": 2.73385738349242, "grad_norm": 0.3783188462257385, "learning_rate": 5.054407711182412e-06, "loss": 0.3803, "step": 3246 }, { "epoch": 2.734699606962381, "grad_norm": 0.38391411304473877, "learning_rate": 5.0514668606618e-06, "loss": 0.4149, "step": 3247 }, { "epoch": 2.735541830432341, "grad_norm": 0.34899774193763733, "learning_rate": 5.048525992334537e-06, "loss": 0.3693, "step": 3248 }, { "epoch": 2.7363840539023023, "grad_norm": 0.3249429166316986, "learning_rate": 5.045585107218112e-06, "loss": 0.3513, "step": 3249 }, { "epoch": 2.7372262773722627, "grad_norm": 0.42736122012138367, "learning_rate": 5.042644206330022e-06, "loss": 0.3953, "step": 3250 }, { "epoch": 2.7380685008422234, "grad_norm": 0.4307456612586975, "learning_rate": 5.039703290687767e-06, "loss": 0.417, "step": 3251 }, { "epoch": 2.738910724312184, "grad_norm": 0.34132078289985657, "learning_rate": 5.036762361308854e-06, "loss": 0.3509, "step": 3252 }, { "epoch": 2.739752947782145, "grad_norm": 0.37428924441337585, "learning_rate": 5.033821419210796e-06, "loss": 0.3604, "step": 3253 }, { "epoch": 2.7405951712521057, "grad_norm": 0.36881107091903687, "learning_rate": 5.030880465411105e-06, "loss": 0.3969, "step": 3254 }, { "epoch": 2.741437394722066, "grad_norm": 0.3787890076637268, "learning_rate": 5.027939500927303e-06, "loss": 0.396, "step": 3255 }, { "epoch": 2.742279618192027, "grad_norm": 0.3476279675960541, "learning_rate": 5.024998526776914e-06, "loss": 0.4122, "step": 3256 }, { "epoch": 2.7431218416619876, "grad_norm": 0.36552414298057556, "learning_rate": 5.022057543977462e-06, "loss": 0.3752, "step": 3257 }, { "epoch": 2.7439640651319483, "grad_norm": 0.3623371422290802, "learning_rate": 5.019116553546478e-06, "loss": 0.4162, "step": 3258 }, { "epoch": 2.744806288601909, "grad_norm": 0.36667168140411377, "learning_rate": 5.016175556501495e-06, "loss": 0.3663, "step": 3259 }, { "epoch": 2.74564851207187, "grad_norm": 0.3829328119754791, "learning_rate": 5.013234553860046e-06, "loss": 0.3851, "step": 3260 }, { "epoch": 2.7464907355418307, "grad_norm": 0.4154627025127411, "learning_rate": 5.010293546639669e-06, "loss": 0.3815, "step": 3261 }, { "epoch": 2.747332959011791, "grad_norm": 0.38452231884002686, "learning_rate": 5.0073525358579e-06, "loss": 0.3927, "step": 3262 }, { "epoch": 2.7481751824817517, "grad_norm": 0.3348240256309509, "learning_rate": 5.004411522532282e-06, "loss": 0.3559, "step": 3263 }, { "epoch": 2.7490174059517125, "grad_norm": 0.353033185005188, "learning_rate": 5.0014705076803506e-06, "loss": 0.3892, "step": 3264 }, { "epoch": 2.7498596294216733, "grad_norm": 0.3825405538082123, "learning_rate": 4.998529492319651e-06, "loss": 0.353, "step": 3265 }, { "epoch": 2.750701852891634, "grad_norm": 0.4127037227153778, "learning_rate": 4.99558847746772e-06, "loss": 0.3994, "step": 3266 }, { "epoch": 2.7515440763615944, "grad_norm": 0.3470294177532196, "learning_rate": 4.9926474641421004e-06, "loss": 0.3724, "step": 3267 }, { "epoch": 2.7523862998315556, "grad_norm": 0.398176372051239, "learning_rate": 4.9897064533603315e-06, "loss": 0.4091, "step": 3268 }, { "epoch": 2.753228523301516, "grad_norm": 0.4389145076274872, "learning_rate": 4.986765446139956e-06, "loss": 0.3797, "step": 3269 }, { "epoch": 2.7540707467714767, "grad_norm": 0.35522156953811646, "learning_rate": 4.983824443498507e-06, "loss": 0.3776, "step": 3270 }, { "epoch": 2.7549129702414374, "grad_norm": 0.34461742639541626, "learning_rate": 4.980883446453523e-06, "loss": 0.3762, "step": 3271 }, { "epoch": 2.755755193711398, "grad_norm": 0.3601379990577698, "learning_rate": 4.97794245602254e-06, "loss": 0.3979, "step": 3272 }, { "epoch": 2.756597417181359, "grad_norm": 0.3703101575374603, "learning_rate": 4.975001473223087e-06, "loss": 0.3757, "step": 3273 }, { "epoch": 2.7574396406513193, "grad_norm": 0.39139309525489807, "learning_rate": 4.9720604990726985e-06, "loss": 0.4031, "step": 3274 }, { "epoch": 2.75828186412128, "grad_norm": 0.3938475251197815, "learning_rate": 4.969119534588896e-06, "loss": 0.4033, "step": 3275 }, { "epoch": 2.759124087591241, "grad_norm": 0.35373279452323914, "learning_rate": 4.966178580789206e-06, "loss": 0.3711, "step": 3276 }, { "epoch": 2.7599663110612016, "grad_norm": 0.3386717140674591, "learning_rate": 4.963237638691147e-06, "loss": 0.346, "step": 3277 }, { "epoch": 2.7608085345311624, "grad_norm": 0.3637336194515228, "learning_rate": 4.960296709312235e-06, "loss": 0.4062, "step": 3278 }, { "epoch": 2.7616507580011227, "grad_norm": 0.3853926658630371, "learning_rate": 4.957355793669979e-06, "loss": 0.3966, "step": 3279 }, { "epoch": 2.762492981471084, "grad_norm": 0.375365674495697, "learning_rate": 4.95441489278189e-06, "loss": 0.3889, "step": 3280 }, { "epoch": 2.7633352049410442, "grad_norm": 0.34903812408447266, "learning_rate": 4.951474007665465e-06, "loss": 0.3737, "step": 3281 }, { "epoch": 2.764177428411005, "grad_norm": 0.37843239307403564, "learning_rate": 4.948533139338203e-06, "loss": 0.4273, "step": 3282 }, { "epoch": 2.7650196518809658, "grad_norm": 0.344363272190094, "learning_rate": 4.945592288817588e-06, "loss": 0.3711, "step": 3283 }, { "epoch": 2.7658618753509265, "grad_norm": 0.3693193793296814, "learning_rate": 4.942651457121113e-06, "loss": 0.3844, "step": 3284 }, { "epoch": 2.7667040988208873, "grad_norm": 0.37740299105644226, "learning_rate": 4.939710645266248e-06, "loss": 0.3929, "step": 3285 }, { "epoch": 2.7675463222908476, "grad_norm": 0.33349597454071045, "learning_rate": 4.936769854270467e-06, "loss": 0.3538, "step": 3286 }, { "epoch": 2.7683885457608084, "grad_norm": 0.3963141143321991, "learning_rate": 4.9338290851512306e-06, "loss": 0.4135, "step": 3287 }, { "epoch": 2.769230769230769, "grad_norm": 0.393900990486145, "learning_rate": 4.930888338925999e-06, "loss": 0.3524, "step": 3288 }, { "epoch": 2.77007299270073, "grad_norm": 0.3808240294456482, "learning_rate": 4.927947616612216e-06, "loss": 0.3954, "step": 3289 }, { "epoch": 2.7709152161706907, "grad_norm": 0.33060190081596375, "learning_rate": 4.92500691922732e-06, "loss": 0.383, "step": 3290 }, { "epoch": 2.7717574396406515, "grad_norm": 0.3743993639945984, "learning_rate": 4.922066247788746e-06, "loss": 0.4067, "step": 3291 }, { "epoch": 2.7725996631106122, "grad_norm": 0.39137643575668335, "learning_rate": 4.919125603313914e-06, "loss": 0.4108, "step": 3292 }, { "epoch": 2.7734418865805726, "grad_norm": 0.3895357847213745, "learning_rate": 4.916184986820238e-06, "loss": 0.387, "step": 3293 }, { "epoch": 2.7742841100505333, "grad_norm": 0.3506632149219513, "learning_rate": 4.913244399325117e-06, "loss": 0.3901, "step": 3294 }, { "epoch": 2.775126333520494, "grad_norm": 0.3889302909374237, "learning_rate": 4.910303841845947e-06, "loss": 0.3861, "step": 3295 }, { "epoch": 2.775968556990455, "grad_norm": 0.3557712733745575, "learning_rate": 4.90736331540011e-06, "loss": 0.3999, "step": 3296 }, { "epoch": 2.7768107804604156, "grad_norm": 0.36833539605140686, "learning_rate": 4.904422821004979e-06, "loss": 0.3986, "step": 3297 }, { "epoch": 2.777653003930376, "grad_norm": 0.31886833906173706, "learning_rate": 4.9014823596779114e-06, "loss": 0.3489, "step": 3298 }, { "epoch": 2.778495227400337, "grad_norm": 0.3486751616001129, "learning_rate": 4.898541932436259e-06, "loss": 0.4069, "step": 3299 }, { "epoch": 2.7793374508702975, "grad_norm": 0.3436608612537384, "learning_rate": 4.895601540297358e-06, "loss": 0.3736, "step": 3300 }, { "epoch": 2.7801796743402583, "grad_norm": 0.3806122839450836, "learning_rate": 4.8926611842785345e-06, "loss": 0.3799, "step": 3301 }, { "epoch": 2.781021897810219, "grad_norm": 0.3670487701892853, "learning_rate": 4.889720865397099e-06, "loss": 0.3928, "step": 3302 }, { "epoch": 2.78186412128018, "grad_norm": 0.33658260107040405, "learning_rate": 4.886780584670356e-06, "loss": 0.3434, "step": 3303 }, { "epoch": 2.7827063447501406, "grad_norm": 0.41673147678375244, "learning_rate": 4.883840343115588e-06, "loss": 0.4247, "step": 3304 }, { "epoch": 2.783548568220101, "grad_norm": 0.4022274911403656, "learning_rate": 4.880900141750069e-06, "loss": 0.3925, "step": 3305 }, { "epoch": 2.7843907916900617, "grad_norm": 0.5091547966003418, "learning_rate": 4.87795998159106e-06, "loss": 0.413, "step": 3306 }, { "epoch": 2.7852330151600224, "grad_norm": 0.34983500838279724, "learning_rate": 4.875019863655805e-06, "loss": 0.357, "step": 3307 }, { "epoch": 2.786075238629983, "grad_norm": 0.3496151864528656, "learning_rate": 4.8720797889615334e-06, "loss": 0.368, "step": 3308 }, { "epoch": 2.786917462099944, "grad_norm": 0.4048369228839874, "learning_rate": 4.869139758525462e-06, "loss": 0.4054, "step": 3309 }, { "epoch": 2.7877596855699043, "grad_norm": 0.38814833760261536, "learning_rate": 4.866199773364789e-06, "loss": 0.3467, "step": 3310 }, { "epoch": 2.7886019090398655, "grad_norm": 0.3891519010066986, "learning_rate": 4.863259834496703e-06, "loss": 0.3906, "step": 3311 }, { "epoch": 2.789444132509826, "grad_norm": 0.372099906206131, "learning_rate": 4.860319942938369e-06, "loss": 0.3544, "step": 3312 }, { "epoch": 2.7902863559797866, "grad_norm": 0.3983099162578583, "learning_rate": 4.857380099706939e-06, "loss": 0.4215, "step": 3313 }, { "epoch": 2.7911285794497473, "grad_norm": 0.39714524149894714, "learning_rate": 4.854440305819548e-06, "loss": 0.404, "step": 3314 }, { "epoch": 2.791970802919708, "grad_norm": 0.39656686782836914, "learning_rate": 4.851500562293317e-06, "loss": 0.3562, "step": 3315 }, { "epoch": 2.792813026389669, "grad_norm": 0.43281349539756775, "learning_rate": 4.848560870145346e-06, "loss": 0.4241, "step": 3316 }, { "epoch": 2.793655249859629, "grad_norm": 0.3837519884109497, "learning_rate": 4.845621230392717e-06, "loss": 0.3992, "step": 3317 }, { "epoch": 2.79449747332959, "grad_norm": 0.395278662443161, "learning_rate": 4.8426816440524925e-06, "loss": 0.3712, "step": 3318 }, { "epoch": 2.7953396967995507, "grad_norm": 0.3467051386833191, "learning_rate": 4.839742112141725e-06, "loss": 0.3562, "step": 3319 }, { "epoch": 2.7961819202695115, "grad_norm": 0.39265191555023193, "learning_rate": 4.836802635677439e-06, "loss": 0.4365, "step": 3320 }, { "epoch": 2.7970241437394723, "grad_norm": 0.37114933133125305, "learning_rate": 4.833863215676641e-06, "loss": 0.3727, "step": 3321 }, { "epoch": 2.797866367209433, "grad_norm": 0.3501802384853363, "learning_rate": 4.830923853156321e-06, "loss": 0.375, "step": 3322 }, { "epoch": 2.798708590679394, "grad_norm": 0.38778460025787354, "learning_rate": 4.827984549133451e-06, "loss": 0.4051, "step": 3323 }, { "epoch": 2.799550814149354, "grad_norm": 0.3693714439868927, "learning_rate": 4.825045304624979e-06, "loss": 0.3535, "step": 3324 }, { "epoch": 2.800393037619315, "grad_norm": 0.36160045862197876, "learning_rate": 4.8221061206478296e-06, "loss": 0.4023, "step": 3325 }, { "epoch": 2.8012352610892757, "grad_norm": 0.3930370807647705, "learning_rate": 4.819166998218912e-06, "loss": 0.421, "step": 3326 }, { "epoch": 2.8020774845592364, "grad_norm": 0.35952308773994446, "learning_rate": 4.816227938355113e-06, "loss": 0.3574, "step": 3327 }, { "epoch": 2.802919708029197, "grad_norm": 0.40605443716049194, "learning_rate": 4.8132889420732955e-06, "loss": 0.3945, "step": 3328 }, { "epoch": 2.8037619314991575, "grad_norm": 0.4092300236225128, "learning_rate": 4.8103500103903054e-06, "loss": 0.3952, "step": 3329 }, { "epoch": 2.8046041549691187, "grad_norm": 0.3934885859489441, "learning_rate": 4.807411144322957e-06, "loss": 0.3893, "step": 3330 }, { "epoch": 2.805446378439079, "grad_norm": 0.4013734757900238, "learning_rate": 4.804472344888052e-06, "loss": 0.4096, "step": 3331 }, { "epoch": 2.80628860190904, "grad_norm": 0.3820874094963074, "learning_rate": 4.801533613102363e-06, "loss": 0.3889, "step": 3332 }, { "epoch": 2.8071308253790006, "grad_norm": 0.3735508322715759, "learning_rate": 4.7985949499826415e-06, "loss": 0.3983, "step": 3333 }, { "epoch": 2.8079730488489614, "grad_norm": 0.3642943799495697, "learning_rate": 4.795656356545611e-06, "loss": 0.3539, "step": 3334 }, { "epoch": 2.808815272318922, "grad_norm": 0.41332969069480896, "learning_rate": 4.792717833807982e-06, "loss": 0.4153, "step": 3335 }, { "epoch": 2.8096574957888825, "grad_norm": 0.3448829650878906, "learning_rate": 4.789779382786426e-06, "loss": 0.3758, "step": 3336 }, { "epoch": 2.8104997192588432, "grad_norm": 0.3552119731903076, "learning_rate": 4.786841004497601e-06, "loss": 0.3854, "step": 3337 }, { "epoch": 2.811341942728804, "grad_norm": 0.35045966506004333, "learning_rate": 4.78390269995813e-06, "loss": 0.3966, "step": 3338 }, { "epoch": 2.8121841661987648, "grad_norm": 0.3772416412830353, "learning_rate": 4.780964470184623e-06, "loss": 0.3939, "step": 3339 }, { "epoch": 2.8130263896687255, "grad_norm": 0.35485920310020447, "learning_rate": 4.778026316193652e-06, "loss": 0.3669, "step": 3340 }, { "epoch": 2.813868613138686, "grad_norm": 0.3820016086101532, "learning_rate": 4.775088239001769e-06, "loss": 0.395, "step": 3341 }, { "epoch": 2.814710836608647, "grad_norm": 0.3758009076118469, "learning_rate": 4.772150239625501e-06, "loss": 0.4004, "step": 3342 }, { "epoch": 2.8155530600786074, "grad_norm": 0.37317588925361633, "learning_rate": 4.7692123190813434e-06, "loss": 0.3926, "step": 3343 }, { "epoch": 2.816395283548568, "grad_norm": 0.41338276863098145, "learning_rate": 4.7662744783857654e-06, "loss": 0.4263, "step": 3344 }, { "epoch": 2.817237507018529, "grad_norm": 0.3830922245979309, "learning_rate": 4.76333671855521e-06, "loss": 0.3839, "step": 3345 }, { "epoch": 2.8180797304884897, "grad_norm": 0.4112812876701355, "learning_rate": 4.760399040606093e-06, "loss": 0.3884, "step": 3346 }, { "epoch": 2.8189219539584505, "grad_norm": 0.3565369248390198, "learning_rate": 4.757461445554799e-06, "loss": 0.3292, "step": 3347 }, { "epoch": 2.819764177428411, "grad_norm": 0.399267315864563, "learning_rate": 4.754523934417688e-06, "loss": 0.4026, "step": 3348 }, { "epoch": 2.8206064008983716, "grad_norm": 0.3992313742637634, "learning_rate": 4.751586508211085e-06, "loss": 0.4055, "step": 3349 }, { "epoch": 2.8214486243683323, "grad_norm": 0.37861233949661255, "learning_rate": 4.748649167951291e-06, "loss": 0.3658, "step": 3350 }, { "epoch": 2.822290847838293, "grad_norm": 0.3987797498703003, "learning_rate": 4.745711914654576e-06, "loss": 0.4125, "step": 3351 }, { "epoch": 2.823133071308254, "grad_norm": 0.4217543601989746, "learning_rate": 4.742774749337179e-06, "loss": 0.3992, "step": 3352 }, { "epoch": 2.8239752947782146, "grad_norm": 0.3598562180995941, "learning_rate": 4.7398376730153056e-06, "loss": 0.3901, "step": 3353 }, { "epoch": 2.8248175182481754, "grad_norm": 0.34645864367485046, "learning_rate": 4.73690068670514e-06, "loss": 0.3443, "step": 3354 }, { "epoch": 2.8256597417181357, "grad_norm": 0.3611631393432617, "learning_rate": 4.733963791422824e-06, "loss": 0.3821, "step": 3355 }, { "epoch": 2.8265019651880965, "grad_norm": 0.370447039604187, "learning_rate": 4.731026988184476e-06, "loss": 0.3893, "step": 3356 }, { "epoch": 2.8273441886580573, "grad_norm": 0.35088613629341125, "learning_rate": 4.728090278006175e-06, "loss": 0.3893, "step": 3357 }, { "epoch": 2.828186412128018, "grad_norm": 0.366792231798172, "learning_rate": 4.7251536619039794e-06, "loss": 0.4, "step": 3358 }, { "epoch": 2.829028635597979, "grad_norm": 0.35955408215522766, "learning_rate": 4.722217140893903e-06, "loss": 0.3714, "step": 3359 }, { "epoch": 2.829870859067939, "grad_norm": 0.35453200340270996, "learning_rate": 4.719280715991933e-06, "loss": 0.3706, "step": 3360 }, { "epoch": 2.8307130825379003, "grad_norm": 0.3622379004955292, "learning_rate": 4.716344388214021e-06, "loss": 0.383, "step": 3361 }, { "epoch": 2.8315553060078607, "grad_norm": 0.35662004351615906, "learning_rate": 4.71340815857609e-06, "loss": 0.3753, "step": 3362 }, { "epoch": 2.8323975294778214, "grad_norm": 0.41121333837509155, "learning_rate": 4.71047202809402e-06, "loss": 0.414, "step": 3363 }, { "epoch": 2.833239752947782, "grad_norm": 0.37091460824012756, "learning_rate": 4.707535997783664e-06, "loss": 0.3766, "step": 3364 }, { "epoch": 2.834081976417743, "grad_norm": 0.3845052719116211, "learning_rate": 4.704600068660837e-06, "loss": 0.387, "step": 3365 }, { "epoch": 2.8349241998877037, "grad_norm": 0.4112201929092407, "learning_rate": 4.701664241741324e-06, "loss": 0.4044, "step": 3366 }, { "epoch": 2.835766423357664, "grad_norm": 0.406110018491745, "learning_rate": 4.6987285180408676e-06, "loss": 0.4266, "step": 3367 }, { "epoch": 2.836608646827625, "grad_norm": 0.3916867673397064, "learning_rate": 4.695792898575178e-06, "loss": 0.3627, "step": 3368 }, { "epoch": 2.8374508702975856, "grad_norm": 0.4837842583656311, "learning_rate": 4.692857384359929e-06, "loss": 0.4251, "step": 3369 }, { "epoch": 2.8382930937675463, "grad_norm": 0.37299010157585144, "learning_rate": 4.689921976410758e-06, "loss": 0.3622, "step": 3370 }, { "epoch": 2.839135317237507, "grad_norm": 0.40570783615112305, "learning_rate": 4.68698667574327e-06, "loss": 0.3751, "step": 3371 }, { "epoch": 2.8399775407074674, "grad_norm": 0.4303179979324341, "learning_rate": 4.684051483373022e-06, "loss": 0.4087, "step": 3372 }, { "epoch": 2.8408197641774287, "grad_norm": 0.3578893840312958, "learning_rate": 4.681116400315544e-06, "loss": 0.3805, "step": 3373 }, { "epoch": 2.841661987647389, "grad_norm": 0.33763808012008667, "learning_rate": 4.678181427586325e-06, "loss": 0.3505, "step": 3374 }, { "epoch": 2.8425042111173497, "grad_norm": 0.4951254427433014, "learning_rate": 4.675246566200815e-06, "loss": 0.4628, "step": 3375 }, { "epoch": 2.8433464345873105, "grad_norm": 0.3352121114730835, "learning_rate": 4.672311817174423e-06, "loss": 0.3533, "step": 3376 }, { "epoch": 2.8441886580572713, "grad_norm": 0.3898068368434906, "learning_rate": 4.6693771815225235e-06, "loss": 0.3437, "step": 3377 }, { "epoch": 2.845030881527232, "grad_norm": 0.41643980145454407, "learning_rate": 4.666442660260451e-06, "loss": 0.4191, "step": 3378 }, { "epoch": 2.8458731049971924, "grad_norm": 0.33636415004730225, "learning_rate": 4.663508254403499e-06, "loss": 0.3703, "step": 3379 }, { "epoch": 2.846715328467153, "grad_norm": 0.3583265542984009, "learning_rate": 4.660573964966924e-06, "loss": 0.4013, "step": 3380 }, { "epoch": 2.847557551937114, "grad_norm": 0.404694527387619, "learning_rate": 4.657639792965933e-06, "loss": 0.4374, "step": 3381 }, { "epoch": 2.8483997754070747, "grad_norm": 0.36417537927627563, "learning_rate": 4.654705739415705e-06, "loss": 0.3752, "step": 3382 }, { "epoch": 2.8492419988770354, "grad_norm": 0.34344208240509033, "learning_rate": 4.651771805331372e-06, "loss": 0.3461, "step": 3383 }, { "epoch": 2.850084222346996, "grad_norm": 0.43420127034187317, "learning_rate": 4.648837991728024e-06, "loss": 0.3976, "step": 3384 }, { "epoch": 2.850926445816957, "grad_norm": 0.38066279888153076, "learning_rate": 4.645904299620707e-06, "loss": 0.4262, "step": 3385 }, { "epoch": 2.8517686692869173, "grad_norm": 0.4080761671066284, "learning_rate": 4.642970730024433e-06, "loss": 0.4046, "step": 3386 }, { "epoch": 2.852610892756878, "grad_norm": 0.3516145646572113, "learning_rate": 4.640037283954165e-06, "loss": 0.3748, "step": 3387 }, { "epoch": 2.853453116226839, "grad_norm": 0.35951608419418335, "learning_rate": 4.637103962424826e-06, "loss": 0.3999, "step": 3388 }, { "epoch": 2.8542953396967996, "grad_norm": 0.37325355410575867, "learning_rate": 4.63417076645129e-06, "loss": 0.3612, "step": 3389 }, { "epoch": 2.8551375631667604, "grad_norm": 0.3542419373989105, "learning_rate": 4.6312376970484e-06, "loss": 0.3942, "step": 3390 }, { "epoch": 2.8559797866367207, "grad_norm": 0.3786540925502777, "learning_rate": 4.628304755230943e-06, "loss": 0.3963, "step": 3391 }, { "epoch": 2.856822010106682, "grad_norm": 0.3793127238750458, "learning_rate": 4.625371942013666e-06, "loss": 0.3777, "step": 3392 }, { "epoch": 2.8576642335766422, "grad_norm": 0.34970179200172424, "learning_rate": 4.622439258411275e-06, "loss": 0.3667, "step": 3393 }, { "epoch": 2.858506457046603, "grad_norm": 0.31596073508262634, "learning_rate": 4.619506705438428e-06, "loss": 0.3823, "step": 3394 }, { "epoch": 2.8593486805165638, "grad_norm": 0.35486894845962524, "learning_rate": 4.616574284109737e-06, "loss": 0.3818, "step": 3395 }, { "epoch": 2.8601909039865245, "grad_norm": 0.5438322424888611, "learning_rate": 4.613641995439767e-06, "loss": 0.3846, "step": 3396 }, { "epoch": 2.8610331274564853, "grad_norm": 0.36255455017089844, "learning_rate": 4.610709840443044e-06, "loss": 0.3822, "step": 3397 }, { "epoch": 2.8618753509264456, "grad_norm": 0.3483372926712036, "learning_rate": 4.607777820134041e-06, "loss": 0.3935, "step": 3398 }, { "epoch": 2.8627175743964064, "grad_norm": 0.32457348704338074, "learning_rate": 4.604845935527189e-06, "loss": 0.3805, "step": 3399 }, { "epoch": 2.863559797866367, "grad_norm": 0.3459053635597229, "learning_rate": 4.6019141876368644e-06, "loss": 0.3725, "step": 3400 }, { "epoch": 2.864402021336328, "grad_norm": 0.3585667014122009, "learning_rate": 4.598982577477407e-06, "loss": 0.3782, "step": 3401 }, { "epoch": 2.8652442448062887, "grad_norm": 0.4370427429676056, "learning_rate": 4.596051106063103e-06, "loss": 0.4578, "step": 3402 }, { "epoch": 2.866086468276249, "grad_norm": 0.3412383198738098, "learning_rate": 4.59311977440819e-06, "loss": 0.3654, "step": 3403 }, { "epoch": 2.8669286917462102, "grad_norm": 0.409564733505249, "learning_rate": 4.590188583526858e-06, "loss": 0.3925, "step": 3404 }, { "epoch": 2.8677709152161706, "grad_norm": 0.374717652797699, "learning_rate": 4.587257534433249e-06, "loss": 0.3659, "step": 3405 }, { "epoch": 2.8686131386861313, "grad_norm": 0.3659243881702423, "learning_rate": 4.584326628141457e-06, "loss": 0.3896, "step": 3406 }, { "epoch": 2.869455362156092, "grad_norm": 0.3434193730354309, "learning_rate": 4.581395865665526e-06, "loss": 0.3596, "step": 3407 }, { "epoch": 2.870297585626053, "grad_norm": 0.42939457297325134, "learning_rate": 4.578465248019445e-06, "loss": 0.4009, "step": 3408 }, { "epoch": 2.8711398090960136, "grad_norm": 0.3444993495941162, "learning_rate": 4.575534776217163e-06, "loss": 0.371, "step": 3409 }, { "epoch": 2.871982032565974, "grad_norm": 0.3433338403701782, "learning_rate": 4.5726044512725695e-06, "loss": 0.4111, "step": 3410 }, { "epoch": 2.8728242560359347, "grad_norm": 0.3380384147167206, "learning_rate": 4.5696742741995086e-06, "loss": 0.3943, "step": 3411 }, { "epoch": 2.8736664795058955, "grad_norm": 0.4371155798435211, "learning_rate": 4.566744246011769e-06, "loss": 0.4025, "step": 3412 }, { "epoch": 2.8745087029758563, "grad_norm": 0.3194515109062195, "learning_rate": 4.563814367723094e-06, "loss": 0.3555, "step": 3413 }, { "epoch": 2.875350926445817, "grad_norm": 0.32909831404685974, "learning_rate": 4.5608846403471675e-06, "loss": 0.3629, "step": 3414 }, { "epoch": 2.876193149915778, "grad_norm": 0.3443525433540344, "learning_rate": 4.557955064897626e-06, "loss": 0.375, "step": 3415 }, { "epoch": 2.8770353733857386, "grad_norm": 0.36552128195762634, "learning_rate": 4.555025642388052e-06, "loss": 0.4301, "step": 3416 }, { "epoch": 2.877877596855699, "grad_norm": 0.3260931074619293, "learning_rate": 4.552096373831979e-06, "loss": 0.3861, "step": 3417 }, { "epoch": 2.8787198203256597, "grad_norm": 0.3591124415397644, "learning_rate": 4.54916726024288e-06, "loss": 0.3657, "step": 3418 }, { "epoch": 2.8795620437956204, "grad_norm": 0.358987420797348, "learning_rate": 4.546238302634179e-06, "loss": 0.4173, "step": 3419 }, { "epoch": 2.880404267265581, "grad_norm": 0.35178810358047485, "learning_rate": 4.543309502019243e-06, "loss": 0.3783, "step": 3420 }, { "epoch": 2.881246490735542, "grad_norm": 0.33616045117378235, "learning_rate": 4.540380859411391e-06, "loss": 0.3444, "step": 3421 }, { "epoch": 2.8820887142055023, "grad_norm": 0.3856150209903717, "learning_rate": 4.537452375823881e-06, "loss": 0.4028, "step": 3422 }, { "epoch": 2.8829309376754635, "grad_norm": 0.3714916706085205, "learning_rate": 4.534524052269918e-06, "loss": 0.3861, "step": 3423 }, { "epoch": 2.883773161145424, "grad_norm": 0.3448173701763153, "learning_rate": 4.5315958897626504e-06, "loss": 0.3763, "step": 3424 }, { "epoch": 2.8846153846153846, "grad_norm": 0.35262489318847656, "learning_rate": 4.528667889315175e-06, "loss": 0.3775, "step": 3425 }, { "epoch": 2.8854576080853453, "grad_norm": 0.3744317889213562, "learning_rate": 4.5257400519405296e-06, "loss": 0.4096, "step": 3426 }, { "epoch": 2.886299831555306, "grad_norm": 0.35317638516426086, "learning_rate": 4.5228123786516935e-06, "loss": 0.4153, "step": 3427 }, { "epoch": 2.887142055025267, "grad_norm": 0.37539616227149963, "learning_rate": 4.5198848704615915e-06, "loss": 0.4031, "step": 3428 }, { "epoch": 2.887984278495227, "grad_norm": 0.3549993336200714, "learning_rate": 4.516957528383094e-06, "loss": 0.3437, "step": 3429 }, { "epoch": 2.888826501965188, "grad_norm": 0.3770570456981659, "learning_rate": 4.514030353429009e-06, "loss": 0.4238, "step": 3430 }, { "epoch": 2.8896687254351487, "grad_norm": 0.35150107741355896, "learning_rate": 4.51110334661209e-06, "loss": 0.3881, "step": 3431 }, { "epoch": 2.8905109489051095, "grad_norm": 0.3420199453830719, "learning_rate": 4.508176508945028e-06, "loss": 0.3779, "step": 3432 }, { "epoch": 2.8913531723750703, "grad_norm": 0.381055623292923, "learning_rate": 4.5052498414404626e-06, "loss": 0.4127, "step": 3433 }, { "epoch": 2.892195395845031, "grad_norm": 0.35072922706604004, "learning_rate": 4.502323345110969e-06, "loss": 0.3585, "step": 3434 }, { "epoch": 2.893037619314992, "grad_norm": 0.4031289517879486, "learning_rate": 4.499397020969067e-06, "loss": 0.4016, "step": 3435 }, { "epoch": 2.893879842784952, "grad_norm": 0.39417752623558044, "learning_rate": 4.496470870027209e-06, "loss": 0.3791, "step": 3436 }, { "epoch": 2.894722066254913, "grad_norm": 0.38716426491737366, "learning_rate": 4.4935448932977985e-06, "loss": 0.3798, "step": 3437 }, { "epoch": 2.8955642897248737, "grad_norm": 0.4067290723323822, "learning_rate": 4.49061909179317e-06, "loss": 0.428, "step": 3438 }, { "epoch": 2.8964065131948344, "grad_norm": 0.30207526683807373, "learning_rate": 4.487693466525604e-06, "loss": 0.3453, "step": 3439 }, { "epoch": 2.897248736664795, "grad_norm": 0.3850337266921997, "learning_rate": 4.484768018507311e-06, "loss": 0.3972, "step": 3440 }, { "epoch": 2.8980909601347555, "grad_norm": 0.4676113724708557, "learning_rate": 4.481842748750453e-06, "loss": 0.4379, "step": 3441 }, { "epoch": 2.8989331836047163, "grad_norm": 0.3709852993488312, "learning_rate": 4.478917658267119e-06, "loss": 0.3769, "step": 3442 }, { "epoch": 2.899775407074677, "grad_norm": 0.3523048162460327, "learning_rate": 4.475992748069339e-06, "loss": 0.3803, "step": 3443 }, { "epoch": 2.900617630544638, "grad_norm": 0.34478995203971863, "learning_rate": 4.473068019169085e-06, "loss": 0.3425, "step": 3444 }, { "epoch": 2.9014598540145986, "grad_norm": 0.37726929783821106, "learning_rate": 4.4701434725782625e-06, "loss": 0.4127, "step": 3445 }, { "epoch": 2.9023020774845594, "grad_norm": 0.3838403820991516, "learning_rate": 4.467219109308713e-06, "loss": 0.3636, "step": 3446 }, { "epoch": 2.90314430095452, "grad_norm": 0.3735176920890808, "learning_rate": 4.464294930372215e-06, "loss": 0.3978, "step": 3447 }, { "epoch": 2.9039865244244805, "grad_norm": 0.36596742272377014, "learning_rate": 4.461370936780487e-06, "loss": 0.3985, "step": 3448 }, { "epoch": 2.9048287478944412, "grad_norm": 0.40170106291770935, "learning_rate": 4.4584471295451815e-06, "loss": 0.3986, "step": 3449 }, { "epoch": 2.905670971364402, "grad_norm": 0.3590144217014313, "learning_rate": 4.455523509677882e-06, "loss": 0.3913, "step": 3450 }, { "epoch": 2.9065131948343628, "grad_norm": 0.3746359944343567, "learning_rate": 4.4526000781901115e-06, "loss": 0.3912, "step": 3451 }, { "epoch": 2.9073554183043235, "grad_norm": 0.34565290808677673, "learning_rate": 4.44967683609333e-06, "loss": 0.3469, "step": 3452 }, { "epoch": 2.908197641774284, "grad_norm": 0.43344736099243164, "learning_rate": 4.446753784398926e-06, "loss": 0.4174, "step": 3453 }, { "epoch": 2.909039865244245, "grad_norm": 0.3440655469894409, "learning_rate": 4.4438309241182285e-06, "loss": 0.3451, "step": 3454 }, { "epoch": 2.9098820887142054, "grad_norm": 0.3583086133003235, "learning_rate": 4.440908256262493e-06, "loss": 0.376, "step": 3455 }, { "epoch": 2.910724312184166, "grad_norm": 0.3772081136703491, "learning_rate": 4.437985781842916e-06, "loss": 0.3847, "step": 3456 }, { "epoch": 2.911566535654127, "grad_norm": 0.3909258544445038, "learning_rate": 4.4350635018706226e-06, "loss": 0.3978, "step": 3457 }, { "epoch": 2.9124087591240877, "grad_norm": 0.3816378712654114, "learning_rate": 4.432141417356672e-06, "loss": 0.3942, "step": 3458 }, { "epoch": 2.9132509825940485, "grad_norm": 0.3340766429901123, "learning_rate": 4.429219529312051e-06, "loss": 0.3883, "step": 3459 }, { "epoch": 2.914093206064009, "grad_norm": 0.3301750421524048, "learning_rate": 4.42629783874769e-06, "loss": 0.3512, "step": 3460 }, { "epoch": 2.9149354295339696, "grad_norm": 0.37822258472442627, "learning_rate": 4.423376346674438e-06, "loss": 0.3791, "step": 3461 }, { "epoch": 2.9157776530039303, "grad_norm": 0.39141935110092163, "learning_rate": 4.420455054103086e-06, "loss": 0.4106, "step": 3462 }, { "epoch": 2.916619876473891, "grad_norm": 0.338744193315506, "learning_rate": 4.417533962044344e-06, "loss": 0.3875, "step": 3463 }, { "epoch": 2.917462099943852, "grad_norm": 0.3611360192298889, "learning_rate": 4.4146130715088675e-06, "loss": 0.3644, "step": 3464 }, { "epoch": 2.9183043234138126, "grad_norm": 0.3930618464946747, "learning_rate": 4.41169238350723e-06, "loss": 0.4502, "step": 3465 }, { "epoch": 2.9191465468837734, "grad_norm": 0.37907007336616516, "learning_rate": 4.408771899049942e-06, "loss": 0.3782, "step": 3466 }, { "epoch": 2.9199887703537337, "grad_norm": 0.3457649052143097, "learning_rate": 4.4058516191474375e-06, "loss": 0.3873, "step": 3467 }, { "epoch": 2.9208309938236945, "grad_norm": 0.3970572054386139, "learning_rate": 4.402931544810088e-06, "loss": 0.404, "step": 3468 }, { "epoch": 2.9216732172936553, "grad_norm": 0.44512972235679626, "learning_rate": 4.400011677048185e-06, "loss": 0.4335, "step": 3469 }, { "epoch": 2.922515440763616, "grad_norm": 0.38319161534309387, "learning_rate": 4.397092016871956e-06, "loss": 0.3512, "step": 3470 }, { "epoch": 2.923357664233577, "grad_norm": 0.35547494888305664, "learning_rate": 4.394172565291549e-06, "loss": 0.4075, "step": 3471 }, { "epoch": 2.924199887703537, "grad_norm": 0.37775808572769165, "learning_rate": 4.391253323317049e-06, "loss": 0.3931, "step": 3472 }, { "epoch": 2.925042111173498, "grad_norm": 0.3838792145252228, "learning_rate": 4.388334291958463e-06, "loss": 0.4007, "step": 3473 }, { "epoch": 2.9258843346434587, "grad_norm": 0.37166842818260193, "learning_rate": 4.385415472225722e-06, "loss": 0.3732, "step": 3474 }, { "epoch": 2.9267265581134194, "grad_norm": 0.370702862739563, "learning_rate": 4.3824968651286905e-06, "loss": 0.3579, "step": 3475 }, { "epoch": 2.92756878158338, "grad_norm": 0.3584776818752289, "learning_rate": 4.379578471677156e-06, "loss": 0.3862, "step": 3476 }, { "epoch": 2.928411005053341, "grad_norm": 0.3390013873577118, "learning_rate": 4.376660292880834e-06, "loss": 0.3897, "step": 3477 }, { "epoch": 2.9292532285233017, "grad_norm": 0.3573378324508667, "learning_rate": 4.373742329749362e-06, "loss": 0.3629, "step": 3478 }, { "epoch": 2.930095451993262, "grad_norm": 0.36938679218292236, "learning_rate": 4.370824583292305e-06, "loss": 0.3922, "step": 3479 }, { "epoch": 2.930937675463223, "grad_norm": 0.38143593072891235, "learning_rate": 4.367907054519155e-06, "loss": 0.3994, "step": 3480 }, { "epoch": 2.9317798989331836, "grad_norm": 0.36056673526763916, "learning_rate": 4.364989744439328e-06, "loss": 0.3723, "step": 3481 }, { "epoch": 2.9326221224031443, "grad_norm": 0.3665173351764679, "learning_rate": 4.36207265406216e-06, "loss": 0.3739, "step": 3482 }, { "epoch": 2.933464345873105, "grad_norm": 0.37079668045043945, "learning_rate": 4.359155784396915e-06, "loss": 0.4277, "step": 3483 }, { "epoch": 2.9343065693430654, "grad_norm": 0.35325831174850464, "learning_rate": 4.356239136452782e-06, "loss": 0.3957, "step": 3484 }, { "epoch": 2.9351487928130267, "grad_norm": 0.3273209035396576, "learning_rate": 4.3533227112388694e-06, "loss": 0.3475, "step": 3485 }, { "epoch": 2.935991016282987, "grad_norm": 0.33608072996139526, "learning_rate": 4.350406509764212e-06, "loss": 0.3486, "step": 3486 }, { "epoch": 2.9368332397529477, "grad_norm": 0.3555096685886383, "learning_rate": 4.347490533037762e-06, "loss": 0.3941, "step": 3487 }, { "epoch": 2.9376754632229085, "grad_norm": 0.34013667702674866, "learning_rate": 4.344574782068402e-06, "loss": 0.3741, "step": 3488 }, { "epoch": 2.9385176866928693, "grad_norm": 0.3670366704463959, "learning_rate": 4.341659257864928e-06, "loss": 0.3906, "step": 3489 }, { "epoch": 2.93935991016283, "grad_norm": 0.35386982560157776, "learning_rate": 4.338743961436063e-06, "loss": 0.3757, "step": 3490 }, { "epoch": 2.9402021336327904, "grad_norm": 0.35840052366256714, "learning_rate": 4.335828893790452e-06, "loss": 0.4084, "step": 3491 }, { "epoch": 2.941044357102751, "grad_norm": 0.3767787218093872, "learning_rate": 4.332914055936654e-06, "loss": 0.3535, "step": 3492 }, { "epoch": 2.941886580572712, "grad_norm": 0.3485800623893738, "learning_rate": 4.329999448883154e-06, "loss": 0.3866, "step": 3493 }, { "epoch": 2.9427288040426727, "grad_norm": 0.3460424840450287, "learning_rate": 4.327085073638357e-06, "loss": 0.3845, "step": 3494 }, { "epoch": 2.9435710275126334, "grad_norm": 0.4264163672924042, "learning_rate": 4.324170931210587e-06, "loss": 0.3735, "step": 3495 }, { "epoch": 2.944413250982594, "grad_norm": 0.3548147976398468, "learning_rate": 4.32125702260809e-06, "loss": 0.3719, "step": 3496 }, { "epoch": 2.945255474452555, "grad_norm": 0.3726392388343811, "learning_rate": 4.3183433488390225e-06, "loss": 0.3609, "step": 3497 }, { "epoch": 2.9460976979225153, "grad_norm": 0.38176876306533813, "learning_rate": 4.315429910911468e-06, "loss": 0.3854, "step": 3498 }, { "epoch": 2.946939921392476, "grad_norm": 0.35401636362075806, "learning_rate": 4.312516709833429e-06, "loss": 0.3934, "step": 3499 }, { "epoch": 2.947782144862437, "grad_norm": 0.3570374548435211, "learning_rate": 4.309603746612821e-06, "loss": 0.4252, "step": 3500 }, { "epoch": 2.9486243683323976, "grad_norm": 0.34709882736206055, "learning_rate": 4.306691022257477e-06, "loss": 0.3531, "step": 3501 }, { "epoch": 2.9494665918023584, "grad_norm": 0.4132761061191559, "learning_rate": 4.303778537775153e-06, "loss": 0.4177, "step": 3502 }, { "epoch": 2.9503088152723187, "grad_norm": 0.3630152940750122, "learning_rate": 4.300866294173517e-06, "loss": 0.3821, "step": 3503 }, { "epoch": 2.9511510387422795, "grad_norm": 0.37796467542648315, "learning_rate": 4.297954292460157e-06, "loss": 0.3864, "step": 3504 }, { "epoch": 2.9519932622122402, "grad_norm": 0.3930572271347046, "learning_rate": 4.295042533642574e-06, "loss": 0.3873, "step": 3505 }, { "epoch": 2.952835485682201, "grad_norm": 0.3514994978904724, "learning_rate": 4.292131018728187e-06, "loss": 0.3864, "step": 3506 }, { "epoch": 2.9536777091521618, "grad_norm": 0.34604310989379883, "learning_rate": 4.2892197487243305e-06, "loss": 0.3686, "step": 3507 }, { "epoch": 2.9545199326221225, "grad_norm": 0.37372148036956787, "learning_rate": 4.286308724638254e-06, "loss": 0.3985, "step": 3508 }, { "epoch": 2.9553621560920833, "grad_norm": 0.3509046733379364, "learning_rate": 4.283397947477123e-06, "loss": 0.3807, "step": 3509 }, { "epoch": 2.9562043795620436, "grad_norm": 0.3683563768863678, "learning_rate": 4.280487418248014e-06, "loss": 0.4089, "step": 3510 }, { "epoch": 2.9570466030320044, "grad_norm": 0.37716740369796753, "learning_rate": 4.277577137957922e-06, "loss": 0.4049, "step": 3511 }, { "epoch": 2.957888826501965, "grad_norm": 0.32556504011154175, "learning_rate": 4.2746671076137534e-06, "loss": 0.3513, "step": 3512 }, { "epoch": 2.958731049971926, "grad_norm": 0.3594387173652649, "learning_rate": 4.27175732822233e-06, "loss": 0.3823, "step": 3513 }, { "epoch": 2.9595732734418867, "grad_norm": 0.3730241060256958, "learning_rate": 4.268847800790382e-06, "loss": 0.4398, "step": 3514 }, { "epoch": 2.960415496911847, "grad_norm": 0.3384752571582794, "learning_rate": 4.265938526324562e-06, "loss": 0.3654, "step": 3515 }, { "epoch": 2.9612577203818082, "grad_norm": 0.3597032129764557, "learning_rate": 4.263029505831424e-06, "loss": 0.3956, "step": 3516 }, { "epoch": 2.9620999438517686, "grad_norm": 0.3970354497432709, "learning_rate": 4.260120740317443e-06, "loss": 0.4018, "step": 3517 }, { "epoch": 2.9629421673217293, "grad_norm": 0.3462546467781067, "learning_rate": 4.257212230788995e-06, "loss": 0.3725, "step": 3518 }, { "epoch": 2.96378439079169, "grad_norm": 0.36012592911720276, "learning_rate": 4.254303978252384e-06, "loss": 0.4165, "step": 3519 }, { "epoch": 2.964626614261651, "grad_norm": 0.3670101463794708, "learning_rate": 4.25139598371381e-06, "loss": 0.3595, "step": 3520 }, { "epoch": 2.9654688377316116, "grad_norm": 0.3570830523967743, "learning_rate": 4.2484882481793895e-06, "loss": 0.3735, "step": 3521 }, { "epoch": 2.966311061201572, "grad_norm": 0.36583441495895386, "learning_rate": 4.245580772655148e-06, "loss": 0.3801, "step": 3522 }, { "epoch": 2.9671532846715327, "grad_norm": 0.39011871814727783, "learning_rate": 4.242673558147027e-06, "loss": 0.4043, "step": 3523 }, { "epoch": 2.9679955081414935, "grad_norm": 0.4172125458717346, "learning_rate": 4.239766605660869e-06, "loss": 0.3759, "step": 3524 }, { "epoch": 2.9688377316114543, "grad_norm": 0.36064642667770386, "learning_rate": 4.236859916202431e-06, "loss": 0.3768, "step": 3525 }, { "epoch": 2.969679955081415, "grad_norm": 0.4034394919872284, "learning_rate": 4.233953490777375e-06, "loss": 0.3676, "step": 3526 }, { "epoch": 2.970522178551376, "grad_norm": 0.42775920033454895, "learning_rate": 4.231047330391278e-06, "loss": 0.396, "step": 3527 }, { "epoch": 2.9713644020213366, "grad_norm": 0.3727170526981354, "learning_rate": 4.228141436049621e-06, "loss": 0.3867, "step": 3528 }, { "epoch": 2.972206625491297, "grad_norm": 0.38267409801483154, "learning_rate": 4.225235808757792e-06, "loss": 0.3637, "step": 3529 }, { "epoch": 2.9730488489612577, "grad_norm": 0.42474135756492615, "learning_rate": 4.222330449521088e-06, "loss": 0.4074, "step": 3530 }, { "epoch": 2.9738910724312184, "grad_norm": 0.3576635718345642, "learning_rate": 4.219425359344716e-06, "loss": 0.3757, "step": 3531 }, { "epoch": 2.974733295901179, "grad_norm": 0.38514816761016846, "learning_rate": 4.216520539233787e-06, "loss": 0.3868, "step": 3532 }, { "epoch": 2.97557551937114, "grad_norm": 0.337303102016449, "learning_rate": 4.213615990193317e-06, "loss": 0.3638, "step": 3533 }, { "epoch": 2.9764177428411003, "grad_norm": 0.3491435945034027, "learning_rate": 4.21071171322823e-06, "loss": 0.4168, "step": 3534 }, { "epoch": 2.977259966311061, "grad_norm": 0.32285502552986145, "learning_rate": 4.207807709343358e-06, "loss": 0.3348, "step": 3535 }, { "epoch": 2.978102189781022, "grad_norm": 0.3480975031852722, "learning_rate": 4.204903979543437e-06, "loss": 0.408, "step": 3536 }, { "epoch": 2.9789444132509826, "grad_norm": 0.3222920298576355, "learning_rate": 4.2020005248331056e-06, "loss": 0.3742, "step": 3537 }, { "epoch": 2.9797866367209433, "grad_norm": 0.36344534158706665, "learning_rate": 4.199097346216909e-06, "loss": 0.4206, "step": 3538 }, { "epoch": 2.980628860190904, "grad_norm": 0.35345157980918884, "learning_rate": 4.196194444699299e-06, "loss": 0.396, "step": 3539 }, { "epoch": 2.981471083660865, "grad_norm": 0.32129764556884766, "learning_rate": 4.193291821284629e-06, "loss": 0.3508, "step": 3540 }, { "epoch": 2.982313307130825, "grad_norm": 0.36073607206344604, "learning_rate": 4.190389476977157e-06, "loss": 0.3352, "step": 3541 }, { "epoch": 2.983155530600786, "grad_norm": 0.4163888096809387, "learning_rate": 4.1874874127810455e-06, "loss": 0.4334, "step": 3542 }, { "epoch": 2.9839977540707467, "grad_norm": 0.3317297101020813, "learning_rate": 4.184585629700357e-06, "loss": 0.3732, "step": 3543 }, { "epoch": 2.9848399775407075, "grad_norm": 0.36670857667922974, "learning_rate": 4.18168412873906e-06, "loss": 0.4215, "step": 3544 }, { "epoch": 2.9856822010106683, "grad_norm": 0.5680432319641113, "learning_rate": 4.178782910901023e-06, "loss": 0.3533, "step": 3545 }, { "epoch": 2.9865244244806286, "grad_norm": 0.3893413543701172, "learning_rate": 4.1758819771900186e-06, "loss": 0.3975, "step": 3546 }, { "epoch": 2.98736664795059, "grad_norm": 0.3605082035064697, "learning_rate": 4.172981328609721e-06, "loss": 0.3497, "step": 3547 }, { "epoch": 2.98820887142055, "grad_norm": 0.3714129328727722, "learning_rate": 4.1700809661637025e-06, "loss": 0.4126, "step": 3548 }, { "epoch": 2.989051094890511, "grad_norm": 0.37081167101860046, "learning_rate": 4.167180890855439e-06, "loss": 0.4178, "step": 3549 }, { "epoch": 2.9898933183604717, "grad_norm": 0.36814579367637634, "learning_rate": 4.164281103688309e-06, "loss": 0.3733, "step": 3550 }, { "epoch": 2.9907355418304324, "grad_norm": 0.47460606694221497, "learning_rate": 4.16138160566559e-06, "loss": 0.3613, "step": 3551 }, { "epoch": 2.991577765300393, "grad_norm": 0.4040283262729645, "learning_rate": 4.158482397790454e-06, "loss": 0.4084, "step": 3552 }, { "epoch": 2.9924199887703535, "grad_norm": 0.35584935545921326, "learning_rate": 4.155583481065979e-06, "loss": 0.4005, "step": 3553 }, { "epoch": 2.9932622122403143, "grad_norm": 0.36283445358276367, "learning_rate": 4.152684856495143e-06, "loss": 0.3424, "step": 3554 }, { "epoch": 2.994104435710275, "grad_norm": 0.3890397846698761, "learning_rate": 4.149786525080819e-06, "loss": 0.3777, "step": 3555 }, { "epoch": 2.994946659180236, "grad_norm": 0.41362252831459045, "learning_rate": 4.146888487825779e-06, "loss": 0.4308, "step": 3556 }, { "epoch": 2.9957888826501966, "grad_norm": 0.3721930682659149, "learning_rate": 4.143990745732693e-06, "loss": 0.3709, "step": 3557 }, { "epoch": 2.9966311061201574, "grad_norm": 0.36227431893348694, "learning_rate": 4.1410932998041345e-06, "loss": 0.3545, "step": 3558 }, { "epoch": 2.997473329590118, "grad_norm": 0.39524856209754944, "learning_rate": 4.1381961510425665e-06, "loss": 0.4002, "step": 3559 }, { "epoch": 2.9983155530600785, "grad_norm": 0.3264089822769165, "learning_rate": 4.135299300450355e-06, "loss": 0.3641, "step": 3560 }, { "epoch": 2.9991577765300392, "grad_norm": 0.36838218569755554, "learning_rate": 4.132402749029757e-06, "loss": 0.3914, "step": 3561 }, { "epoch": 3.0, "grad_norm": 0.6602257490158081, "learning_rate": 4.129506497782934e-06, "loss": 0.6057, "step": 3562 }, { "epoch": 3.0008422234699608, "grad_norm": 0.35544371604919434, "learning_rate": 4.126610547711936e-06, "loss": 0.3584, "step": 3563 }, { "epoch": 3.0016844469399215, "grad_norm": 0.37927478551864624, "learning_rate": 4.123714899818716e-06, "loss": 0.3714, "step": 3564 }, { "epoch": 3.0025266704098823, "grad_norm": 0.35972797870635986, "learning_rate": 4.120819555105112e-06, "loss": 0.3568, "step": 3565 }, { "epoch": 3.0033688938798426, "grad_norm": 0.36843010783195496, "learning_rate": 4.11792451457287e-06, "loss": 0.3704, "step": 3566 }, { "epoch": 3.0042111173498034, "grad_norm": 0.37994384765625, "learning_rate": 4.115029779223622e-06, "loss": 0.3771, "step": 3567 }, { "epoch": 3.005053340819764, "grad_norm": 0.40069103240966797, "learning_rate": 4.112135350058898e-06, "loss": 0.3538, "step": 3568 }, { "epoch": 3.005895564289725, "grad_norm": 0.3873416483402252, "learning_rate": 4.109241228080115e-06, "loss": 0.3622, "step": 3569 }, { "epoch": 3.0067377877596857, "grad_norm": 0.38649439811706543, "learning_rate": 4.106347414288599e-06, "loss": 0.3662, "step": 3570 }, { "epoch": 3.0075800112296465, "grad_norm": 0.4012371003627777, "learning_rate": 4.103453909685553e-06, "loss": 0.3467, "step": 3571 }, { "epoch": 3.008422234699607, "grad_norm": 0.37814468145370483, "learning_rate": 4.100560715272083e-06, "loss": 0.3415, "step": 3572 }, { "epoch": 3.0092644581695676, "grad_norm": 0.39964503049850464, "learning_rate": 4.097667832049182e-06, "loss": 0.3647, "step": 3573 }, { "epoch": 3.0101066816395283, "grad_norm": 0.4000207781791687, "learning_rate": 4.094775261017742e-06, "loss": 0.3511, "step": 3574 }, { "epoch": 3.010948905109489, "grad_norm": 0.39206433296203613, "learning_rate": 4.09188300317854e-06, "loss": 0.3669, "step": 3575 }, { "epoch": 3.01179112857945, "grad_norm": 0.4193984270095825, "learning_rate": 4.088991059532248e-06, "loss": 0.3472, "step": 3576 }, { "epoch": 3.0126333520494106, "grad_norm": 0.41627153754234314, "learning_rate": 4.086099431079429e-06, "loss": 0.3805, "step": 3577 }, { "epoch": 3.013475575519371, "grad_norm": 0.3983106315135956, "learning_rate": 4.083208118820538e-06, "loss": 0.3704, "step": 3578 }, { "epoch": 3.0143177989893317, "grad_norm": 0.3598789870738983, "learning_rate": 4.080317123755919e-06, "loss": 0.3579, "step": 3579 }, { "epoch": 3.0151600224592925, "grad_norm": 0.3839639723300934, "learning_rate": 4.0774264468858064e-06, "loss": 0.3679, "step": 3580 }, { "epoch": 3.0160022459292533, "grad_norm": 0.3644137680530548, "learning_rate": 4.074536089210323e-06, "loss": 0.3556, "step": 3581 }, { "epoch": 3.016844469399214, "grad_norm": 0.36483025550842285, "learning_rate": 4.0716460517294855e-06, "loss": 0.3655, "step": 3582 }, { "epoch": 3.017686692869175, "grad_norm": 0.3448069095611572, "learning_rate": 4.0687563354431986e-06, "loss": 0.361, "step": 3583 }, { "epoch": 3.018528916339135, "grad_norm": 0.35989972949028015, "learning_rate": 4.065866941351251e-06, "loss": 0.3605, "step": 3584 }, { "epoch": 3.019371139809096, "grad_norm": 0.3878214657306671, "learning_rate": 4.062977870453324e-06, "loss": 0.3436, "step": 3585 }, { "epoch": 3.0202133632790567, "grad_norm": 0.35377413034439087, "learning_rate": 4.060089123748989e-06, "loss": 0.3759, "step": 3586 }, { "epoch": 3.0210555867490174, "grad_norm": 0.3399575352668762, "learning_rate": 4.057200702237703e-06, "loss": 0.3532, "step": 3587 }, { "epoch": 3.021897810218978, "grad_norm": 0.4218409061431885, "learning_rate": 4.054312606918807e-06, "loss": 0.3518, "step": 3588 }, { "epoch": 3.022740033688939, "grad_norm": 0.3409987688064575, "learning_rate": 4.051424838791534e-06, "loss": 0.3625, "step": 3589 }, { "epoch": 3.0235822571588993, "grad_norm": 0.386404812335968, "learning_rate": 4.048537398855003e-06, "loss": 0.3475, "step": 3590 }, { "epoch": 3.02442448062886, "grad_norm": 0.4272609353065491, "learning_rate": 4.045650288108219e-06, "loss": 0.3799, "step": 3591 }, { "epoch": 3.025266704098821, "grad_norm": 0.33367684483528137, "learning_rate": 4.04276350755007e-06, "loss": 0.3471, "step": 3592 }, { "epoch": 3.0261089275687816, "grad_norm": 0.38943544030189514, "learning_rate": 4.039877058179338e-06, "loss": 0.3682, "step": 3593 }, { "epoch": 3.0269511510387423, "grad_norm": 0.409046471118927, "learning_rate": 4.0369909409946795e-06, "loss": 0.3532, "step": 3594 }, { "epoch": 3.027793374508703, "grad_norm": 0.3440558612346649, "learning_rate": 4.034105156994644e-06, "loss": 0.3496, "step": 3595 }, { "epoch": 3.028635597978664, "grad_norm": 0.38133475184440613, "learning_rate": 4.031219707177663e-06, "loss": 0.368, "step": 3596 }, { "epoch": 3.029477821448624, "grad_norm": 0.3708812892436981, "learning_rate": 4.028334592542054e-06, "loss": 0.3587, "step": 3597 }, { "epoch": 3.030320044918585, "grad_norm": 0.3824940621852875, "learning_rate": 4.025449814086015e-06, "loss": 0.3489, "step": 3598 }, { "epoch": 3.0311622683885457, "grad_norm": 0.39374464750289917, "learning_rate": 4.022565372807629e-06, "loss": 0.3601, "step": 3599 }, { "epoch": 3.0320044918585065, "grad_norm": 0.34313464164733887, "learning_rate": 4.0196812697048665e-06, "loss": 0.3566, "step": 3600 }, { "epoch": 3.0328467153284673, "grad_norm": 0.40061238408088684, "learning_rate": 4.0167975057755755e-06, "loss": 0.3519, "step": 3601 }, { "epoch": 3.033688938798428, "grad_norm": 0.3314759135246277, "learning_rate": 4.013914082017492e-06, "loss": 0.3568, "step": 3602 }, { "epoch": 3.0345311622683884, "grad_norm": 0.35567280650138855, "learning_rate": 4.011030999428227e-06, "loss": 0.3686, "step": 3603 }, { "epoch": 3.035373385738349, "grad_norm": 0.38111621141433716, "learning_rate": 4.008148259005279e-06, "loss": 0.3479, "step": 3604 }, { "epoch": 3.03621560920831, "grad_norm": 0.3542640209197998, "learning_rate": 4.005265861746028e-06, "loss": 0.3334, "step": 3605 }, { "epoch": 3.0370578326782707, "grad_norm": 1.0732508897781372, "learning_rate": 4.0023838086477334e-06, "loss": 0.3543, "step": 3606 }, { "epoch": 3.0379000561482314, "grad_norm": 0.393280953168869, "learning_rate": 3.999502100707536e-06, "loss": 0.3726, "step": 3607 }, { "epoch": 3.038742279618192, "grad_norm": 0.4093681573867798, "learning_rate": 3.996620738922456e-06, "loss": 0.3623, "step": 3608 }, { "epoch": 3.0395845030881525, "grad_norm": 0.3498075604438782, "learning_rate": 3.9937397242893965e-06, "loss": 0.3446, "step": 3609 }, { "epoch": 3.0404267265581133, "grad_norm": 0.3440095782279968, "learning_rate": 3.990859057805141e-06, "loss": 0.3558, "step": 3610 }, { "epoch": 3.041268950028074, "grad_norm": 0.3574011027812958, "learning_rate": 3.987978740466348e-06, "loss": 0.3724, "step": 3611 }, { "epoch": 3.042111173498035, "grad_norm": 0.3915237486362457, "learning_rate": 3.985098773269557e-06, "loss": 0.3492, "step": 3612 }, { "epoch": 3.0429533969679956, "grad_norm": 0.3410849869251251, "learning_rate": 3.98221915721119e-06, "loss": 0.3558, "step": 3613 }, { "epoch": 3.0437956204379564, "grad_norm": 0.3562261760234833, "learning_rate": 3.979339893287545e-06, "loss": 0.3706, "step": 3614 }, { "epoch": 3.0446378439079167, "grad_norm": 0.3682781755924225, "learning_rate": 3.976460982494797e-06, "loss": 0.3694, "step": 3615 }, { "epoch": 3.0454800673778775, "grad_norm": 0.3641355037689209, "learning_rate": 3.9735824258289975e-06, "loss": 0.3416, "step": 3616 }, { "epoch": 3.0463222908478382, "grad_norm": 0.3885120749473572, "learning_rate": 3.970704224286081e-06, "loss": 0.3641, "step": 3617 }, { "epoch": 3.047164514317799, "grad_norm": 0.3706432282924652, "learning_rate": 3.9678263788618545e-06, "loss": 0.3579, "step": 3618 }, { "epoch": 3.0480067377877598, "grad_norm": 0.3444303572177887, "learning_rate": 3.964948890552005e-06, "loss": 0.3523, "step": 3619 }, { "epoch": 3.0488489612577205, "grad_norm": 0.3500204384326935, "learning_rate": 3.962071760352088e-06, "loss": 0.3378, "step": 3620 }, { "epoch": 3.0496911847276813, "grad_norm": 0.3427300453186035, "learning_rate": 3.95919498925755e-06, "loss": 0.353, "step": 3621 }, { "epoch": 3.0505334081976416, "grad_norm": 0.35766637325286865, "learning_rate": 3.956318578263698e-06, "loss": 0.3871, "step": 3622 }, { "epoch": 3.0513756316676024, "grad_norm": 0.374696284532547, "learning_rate": 3.953442528365725e-06, "loss": 0.3763, "step": 3623 }, { "epoch": 3.052217855137563, "grad_norm": 0.3460894227027893, "learning_rate": 3.95056684055869e-06, "loss": 0.3526, "step": 3624 }, { "epoch": 3.053060078607524, "grad_norm": 0.3703653812408447, "learning_rate": 3.947691515837537e-06, "loss": 0.354, "step": 3625 }, { "epoch": 3.0539023020774847, "grad_norm": 0.387119859457016, "learning_rate": 3.9448165551970765e-06, "loss": 0.3621, "step": 3626 }, { "epoch": 3.0547445255474455, "grad_norm": 0.3542439341545105, "learning_rate": 3.941941959631996e-06, "loss": 0.3822, "step": 3627 }, { "epoch": 3.055586749017406, "grad_norm": 0.33287665247917175, "learning_rate": 3.939067730136854e-06, "loss": 0.3535, "step": 3628 }, { "epoch": 3.0564289724873666, "grad_norm": 0.3756121098995209, "learning_rate": 3.936193867706089e-06, "loss": 0.3594, "step": 3629 }, { "epoch": 3.0572711959573273, "grad_norm": 0.387335866689682, "learning_rate": 3.933320373334006e-06, "loss": 0.3607, "step": 3630 }, { "epoch": 3.058113419427288, "grad_norm": 0.34517335891723633, "learning_rate": 3.930447248014782e-06, "loss": 0.3673, "step": 3631 }, { "epoch": 3.058955642897249, "grad_norm": 0.37602949142456055, "learning_rate": 3.927574492742473e-06, "loss": 0.3679, "step": 3632 }, { "epoch": 3.0597978663672096, "grad_norm": 0.37320369482040405, "learning_rate": 3.924702108511002e-06, "loss": 0.3413, "step": 3633 }, { "epoch": 3.06064008983717, "grad_norm": 0.40494853258132935, "learning_rate": 3.9218300963141656e-06, "loss": 0.3418, "step": 3634 }, { "epoch": 3.0614823133071307, "grad_norm": 0.3786449730396271, "learning_rate": 3.918958457145629e-06, "loss": 0.3712, "step": 3635 }, { "epoch": 3.0623245367770915, "grad_norm": 0.3244706690311432, "learning_rate": 3.91608719199893e-06, "loss": 0.3315, "step": 3636 }, { "epoch": 3.0631667602470523, "grad_norm": 0.42366740107536316, "learning_rate": 3.913216301867478e-06, "loss": 0.361, "step": 3637 }, { "epoch": 3.064008983717013, "grad_norm": 0.36416223645210266, "learning_rate": 3.910345787744553e-06, "loss": 0.3371, "step": 3638 }, { "epoch": 3.064851207186974, "grad_norm": 0.37673014402389526, "learning_rate": 3.907475650623299e-06, "loss": 0.3818, "step": 3639 }, { "epoch": 3.065693430656934, "grad_norm": 0.34256190061569214, "learning_rate": 3.904605891496741e-06, "loss": 0.3524, "step": 3640 }, { "epoch": 3.066535654126895, "grad_norm": 0.35163572430610657, "learning_rate": 3.9017365113577616e-06, "loss": 0.3687, "step": 3641 }, { "epoch": 3.0673778775968557, "grad_norm": 0.3439660668373108, "learning_rate": 3.89886751119912e-06, "loss": 0.3517, "step": 3642 }, { "epoch": 3.0682201010668164, "grad_norm": 0.36781206727027893, "learning_rate": 3.895998892013436e-06, "loss": 0.3397, "step": 3643 }, { "epoch": 3.069062324536777, "grad_norm": 0.32478561997413635, "learning_rate": 3.89313065479321e-06, "loss": 0.3662, "step": 3644 }, { "epoch": 3.069904548006738, "grad_norm": 0.3287387490272522, "learning_rate": 3.890262800530797e-06, "loss": 0.3465, "step": 3645 }, { "epoch": 3.0707467714766983, "grad_norm": 0.3573455810546875, "learning_rate": 3.887395330218429e-06, "loss": 0.3564, "step": 3646 }, { "epoch": 3.071588994946659, "grad_norm": 0.3417898416519165, "learning_rate": 3.884528244848198e-06, "loss": 0.3427, "step": 3647 }, { "epoch": 3.07243121841662, "grad_norm": 0.3492777943611145, "learning_rate": 3.881661545412071e-06, "loss": 0.3709, "step": 3648 }, { "epoch": 3.0732734418865806, "grad_norm": 0.33717161417007446, "learning_rate": 3.878795232901873e-06, "loss": 0.3408, "step": 3649 }, { "epoch": 3.0741156653565413, "grad_norm": 0.38804158568382263, "learning_rate": 3.8759293083093005e-06, "loss": 0.3674, "step": 3650 }, { "epoch": 3.074957888826502, "grad_norm": 0.34814155101776123, "learning_rate": 3.873063772625914e-06, "loss": 0.3552, "step": 3651 }, { "epoch": 3.075800112296463, "grad_norm": 0.331232488155365, "learning_rate": 3.870198626843139e-06, "loss": 0.3471, "step": 3652 }, { "epoch": 3.076642335766423, "grad_norm": 0.34916186332702637, "learning_rate": 3.86733387195227e-06, "loss": 0.3573, "step": 3653 }, { "epoch": 3.077484559236384, "grad_norm": 0.35803014039993286, "learning_rate": 3.864469508944459e-06, "loss": 0.3374, "step": 3654 }, { "epoch": 3.0783267827063447, "grad_norm": 0.3368188440799713, "learning_rate": 3.861605538810727e-06, "loss": 0.3383, "step": 3655 }, { "epoch": 3.0791690061763055, "grad_norm": 0.3523022532463074, "learning_rate": 3.858741962541962e-06, "loss": 0.3267, "step": 3656 }, { "epoch": 3.0800112296462663, "grad_norm": 0.38356778025627136, "learning_rate": 3.85587878112891e-06, "loss": 0.3312, "step": 3657 }, { "epoch": 3.080853453116227, "grad_norm": 0.34285274147987366, "learning_rate": 3.853015995562182e-06, "loss": 0.3487, "step": 3658 }, { "epoch": 3.0816956765861874, "grad_norm": 0.3500497341156006, "learning_rate": 3.850153606832252e-06, "loss": 0.3832, "step": 3659 }, { "epoch": 3.082537900056148, "grad_norm": 0.34739524126052856, "learning_rate": 3.8472916159294604e-06, "loss": 0.3611, "step": 3660 }, { "epoch": 3.083380123526109, "grad_norm": 0.368439644575119, "learning_rate": 3.844430023844007e-06, "loss": 0.3681, "step": 3661 }, { "epoch": 3.0842223469960697, "grad_norm": 0.33926922082901, "learning_rate": 3.841568831565949e-06, "loss": 0.3604, "step": 3662 }, { "epoch": 3.0850645704660304, "grad_norm": 0.3222300112247467, "learning_rate": 3.838708040085214e-06, "loss": 0.3449, "step": 3663 }, { "epoch": 3.085906793935991, "grad_norm": 0.3402547836303711, "learning_rate": 3.8358476503915855e-06, "loss": 0.3419, "step": 3664 }, { "epoch": 3.0867490174059515, "grad_norm": 0.33858782052993774, "learning_rate": 3.83298766347471e-06, "loss": 0.3735, "step": 3665 }, { "epoch": 3.0875912408759123, "grad_norm": 0.3329651653766632, "learning_rate": 3.8301280803240945e-06, "loss": 0.3582, "step": 3666 }, { "epoch": 3.088433464345873, "grad_norm": 0.35738807916641235, "learning_rate": 3.827268901929102e-06, "loss": 0.3435, "step": 3667 }, { "epoch": 3.089275687815834, "grad_norm": 0.3382053077220917, "learning_rate": 3.8244101292789635e-06, "loss": 0.3609, "step": 3668 }, { "epoch": 3.0901179112857946, "grad_norm": 0.34214380383491516, "learning_rate": 3.821551763362764e-06, "loss": 0.3647, "step": 3669 }, { "epoch": 3.0909601347557554, "grad_norm": 0.3626634478569031, "learning_rate": 3.81869380516945e-06, "loss": 0.3508, "step": 3670 }, { "epoch": 3.0918023582257157, "grad_norm": 0.3381366729736328, "learning_rate": 3.815836255687821e-06, "loss": 0.3866, "step": 3671 }, { "epoch": 3.0926445816956765, "grad_norm": 0.3806320130825043, "learning_rate": 3.8129791159065467e-06, "loss": 0.3742, "step": 3672 }, { "epoch": 3.0934868051656372, "grad_norm": 0.3487924635410309, "learning_rate": 3.810122386814145e-06, "loss": 0.3567, "step": 3673 }, { "epoch": 3.094329028635598, "grad_norm": 0.3764026463031769, "learning_rate": 3.807266069398997e-06, "loss": 0.3539, "step": 3674 }, { "epoch": 3.0951712521055588, "grad_norm": 0.35273852944374084, "learning_rate": 3.8044101646493354e-06, "loss": 0.3588, "step": 3675 }, { "epoch": 3.0960134755755195, "grad_norm": 0.36708831787109375, "learning_rate": 3.8015546735532603e-06, "loss": 0.354, "step": 3676 }, { "epoch": 3.09685569904548, "grad_norm": 0.32196754217147827, "learning_rate": 3.7986995970987194e-06, "loss": 0.373, "step": 3677 }, { "epoch": 3.0976979225154406, "grad_norm": 0.36280587315559387, "learning_rate": 3.7958449362735193e-06, "loss": 0.3678, "step": 3678 }, { "epoch": 3.0985401459854014, "grad_norm": 0.37086722254753113, "learning_rate": 3.792990692065325e-06, "loss": 0.3684, "step": 3679 }, { "epoch": 3.099382369455362, "grad_norm": 0.3370133340358734, "learning_rate": 3.7901368654616577e-06, "loss": 0.3729, "step": 3680 }, { "epoch": 3.100224592925323, "grad_norm": 0.3605077266693115, "learning_rate": 3.7872834574498894e-06, "loss": 0.3583, "step": 3681 }, { "epoch": 3.1010668163952837, "grad_norm": 0.3595060408115387, "learning_rate": 3.7844304690172515e-06, "loss": 0.3756, "step": 3682 }, { "epoch": 3.1019090398652445, "grad_norm": 0.39103755354881287, "learning_rate": 3.781577901150829e-06, "loss": 0.356, "step": 3683 }, { "epoch": 3.102751263335205, "grad_norm": 0.3920332193374634, "learning_rate": 3.7787257548375635e-06, "loss": 0.3369, "step": 3684 }, { "epoch": 3.1035934868051656, "grad_norm": 0.4231106638908386, "learning_rate": 3.7758740310642454e-06, "loss": 0.3547, "step": 3685 }, { "epoch": 3.1044357102751263, "grad_norm": 0.37069451808929443, "learning_rate": 3.7730227308175242e-06, "loss": 0.3668, "step": 3686 }, { "epoch": 3.105277933745087, "grad_norm": 0.37163716554641724, "learning_rate": 3.7701718550838995e-06, "loss": 0.3518, "step": 3687 }, { "epoch": 3.106120157215048, "grad_norm": 0.35397908091545105, "learning_rate": 3.7673214048497274e-06, "loss": 0.3623, "step": 3688 }, { "epoch": 3.1069623806850086, "grad_norm": 0.3761151134967804, "learning_rate": 3.7644713811012145e-06, "loss": 0.3639, "step": 3689 }, { "epoch": 3.107804604154969, "grad_norm": 0.3620738983154297, "learning_rate": 3.7616217848244175e-06, "loss": 0.3721, "step": 3690 }, { "epoch": 3.1086468276249297, "grad_norm": 0.35935211181640625, "learning_rate": 3.758772617005251e-06, "loss": 0.3598, "step": 3691 }, { "epoch": 3.1094890510948905, "grad_norm": 0.3368039131164551, "learning_rate": 3.755923878629477e-06, "loss": 0.3652, "step": 3692 }, { "epoch": 3.1103312745648513, "grad_norm": 0.36376747488975525, "learning_rate": 3.7530755706827117e-06, "loss": 0.3391, "step": 3693 }, { "epoch": 3.111173498034812, "grad_norm": 0.3443908989429474, "learning_rate": 3.7502276941504157e-06, "loss": 0.3369, "step": 3694 }, { "epoch": 3.112015721504773, "grad_norm": 0.36841315031051636, "learning_rate": 3.747380250017912e-06, "loss": 0.3567, "step": 3695 }, { "epoch": 3.112857944974733, "grad_norm": 0.3613170385360718, "learning_rate": 3.744533239270364e-06, "loss": 0.3561, "step": 3696 }, { "epoch": 3.113700168444694, "grad_norm": 0.3501397371292114, "learning_rate": 3.741686662892789e-06, "loss": 0.326, "step": 3697 }, { "epoch": 3.1145423919146547, "grad_norm": 0.37171459197998047, "learning_rate": 3.7388405218700515e-06, "loss": 0.3748, "step": 3698 }, { "epoch": 3.1153846153846154, "grad_norm": 0.34873515367507935, "learning_rate": 3.735994817186872e-06, "loss": 0.3743, "step": 3699 }, { "epoch": 3.116226838854576, "grad_norm": 0.3764384388923645, "learning_rate": 3.733149549827813e-06, "loss": 0.3497, "step": 3700 }, { "epoch": 3.117069062324537, "grad_norm": 0.33856409788131714, "learning_rate": 3.730304720777288e-06, "loss": 0.3541, "step": 3701 }, { "epoch": 3.1179112857944973, "grad_norm": 0.34737029671669006, "learning_rate": 3.727460331019559e-06, "loss": 0.3519, "step": 3702 }, { "epoch": 3.118753509264458, "grad_norm": 0.33516451716423035, "learning_rate": 3.7246163815387383e-06, "loss": 0.3608, "step": 3703 }, { "epoch": 3.119595732734419, "grad_norm": 0.3532470464706421, "learning_rate": 3.7217728733187807e-06, "loss": 0.3801, "step": 3704 }, { "epoch": 3.1204379562043796, "grad_norm": 0.3239833116531372, "learning_rate": 3.7189298073434925e-06, "loss": 0.364, "step": 3705 }, { "epoch": 3.1212801796743403, "grad_norm": 0.3625008761882782, "learning_rate": 3.716087184596525e-06, "loss": 0.3446, "step": 3706 }, { "epoch": 3.122122403144301, "grad_norm": 0.3541141152381897, "learning_rate": 3.713245006061379e-06, "loss": 0.3436, "step": 3707 }, { "epoch": 3.1229646266142614, "grad_norm": 0.35260581970214844, "learning_rate": 3.710403272721399e-06, "loss": 0.3515, "step": 3708 }, { "epoch": 3.123806850084222, "grad_norm": 0.3394695818424225, "learning_rate": 3.7075619855597744e-06, "loss": 0.3286, "step": 3709 }, { "epoch": 3.124649073554183, "grad_norm": 0.3646598756313324, "learning_rate": 3.704721145559542e-06, "loss": 0.3415, "step": 3710 }, { "epoch": 3.1254912970241437, "grad_norm": 0.3365234136581421, "learning_rate": 3.701880753703585e-06, "loss": 0.3671, "step": 3711 }, { "epoch": 3.1263335204941045, "grad_norm": 0.36472997069358826, "learning_rate": 3.6990408109746314e-06, "loss": 0.385, "step": 3712 }, { "epoch": 3.1271757439640653, "grad_norm": 0.3284488320350647, "learning_rate": 3.6962013183552496e-06, "loss": 0.3489, "step": 3713 }, { "epoch": 3.128017967434026, "grad_norm": 0.3651183545589447, "learning_rate": 3.693362276827856e-06, "loss": 0.3672, "step": 3714 }, { "epoch": 3.1288601909039864, "grad_norm": 0.3655654489994049, "learning_rate": 3.690523687374712e-06, "loss": 0.3663, "step": 3715 }, { "epoch": 3.129702414373947, "grad_norm": 0.3372897803783417, "learning_rate": 3.687685550977921e-06, "loss": 0.3633, "step": 3716 }, { "epoch": 3.130544637843908, "grad_norm": 0.3364836871623993, "learning_rate": 3.684847868619428e-06, "loss": 0.3741, "step": 3717 }, { "epoch": 3.1313868613138687, "grad_norm": 0.3565663695335388, "learning_rate": 3.682010641281021e-06, "loss": 0.3463, "step": 3718 }, { "epoch": 3.1322290847838294, "grad_norm": 0.4008064270019531, "learning_rate": 3.6791738699443357e-06, "loss": 0.3742, "step": 3719 }, { "epoch": 3.13307130825379, "grad_norm": 0.33681344985961914, "learning_rate": 3.6763375555908443e-06, "loss": 0.3607, "step": 3720 }, { "epoch": 3.1339135317237505, "grad_norm": 0.3956893980503082, "learning_rate": 3.673501699201864e-06, "loss": 0.3797, "step": 3721 }, { "epoch": 3.1347557551937113, "grad_norm": 0.37154367566108704, "learning_rate": 3.6706663017585497e-06, "loss": 0.3384, "step": 3722 }, { "epoch": 3.135597978663672, "grad_norm": 0.3240630328655243, "learning_rate": 3.667831364241904e-06, "loss": 0.3492, "step": 3723 }, { "epoch": 3.136440202133633, "grad_norm": 0.33993226289749146, "learning_rate": 3.6649968876327644e-06, "loss": 0.3628, "step": 3724 }, { "epoch": 3.1372824256035936, "grad_norm": 0.3347960412502289, "learning_rate": 3.6621628729118135e-06, "loss": 0.3428, "step": 3725 }, { "epoch": 3.1381246490735544, "grad_norm": 0.3654036223888397, "learning_rate": 3.659329321059567e-06, "loss": 0.35, "step": 3726 }, { "epoch": 3.1389668725435147, "grad_norm": 0.33539989590644836, "learning_rate": 3.6564962330563906e-06, "loss": 0.3438, "step": 3727 }, { "epoch": 3.1398090960134755, "grad_norm": 0.36965152621269226, "learning_rate": 3.653663609882482e-06, "loss": 0.3639, "step": 3728 }, { "epoch": 3.1406513194834362, "grad_norm": 0.3650558888912201, "learning_rate": 3.65083145251788e-06, "loss": 0.3473, "step": 3729 }, { "epoch": 3.141493542953397, "grad_norm": 0.3295755684375763, "learning_rate": 3.6479997619424605e-06, "loss": 0.3697, "step": 3730 }, { "epoch": 3.1423357664233578, "grad_norm": 0.3562260568141937, "learning_rate": 3.6451685391359447e-06, "loss": 0.3633, "step": 3731 }, { "epoch": 3.1431779898933185, "grad_norm": 0.310283362865448, "learning_rate": 3.6423377850778843e-06, "loss": 0.3388, "step": 3732 }, { "epoch": 3.144020213363279, "grad_norm": 0.34845542907714844, "learning_rate": 3.639507500747671e-06, "loss": 0.3703, "step": 3733 }, { "epoch": 3.1448624368332396, "grad_norm": 0.35033535957336426, "learning_rate": 3.6366776871245345e-06, "loss": 0.3492, "step": 3734 }, { "epoch": 3.1457046603032004, "grad_norm": 0.3608390986919403, "learning_rate": 3.6338483451875456e-06, "loss": 0.3618, "step": 3735 }, { "epoch": 3.146546883773161, "grad_norm": 0.33405011892318726, "learning_rate": 3.6310194759156037e-06, "loss": 0.3411, "step": 3736 }, { "epoch": 3.147389107243122, "grad_norm": 0.33917078375816345, "learning_rate": 3.628191080287451e-06, "loss": 0.3757, "step": 3737 }, { "epoch": 3.1482313307130827, "grad_norm": 0.32163599133491516, "learning_rate": 3.625363159281663e-06, "loss": 0.3714, "step": 3738 }, { "epoch": 3.149073554183043, "grad_norm": 0.35519978404045105, "learning_rate": 3.6225357138766527e-06, "loss": 0.3605, "step": 3739 }, { "epoch": 3.149915777653004, "grad_norm": 0.3578715920448303, "learning_rate": 3.6197087450506684e-06, "loss": 0.3554, "step": 3740 }, { "epoch": 3.1507580011229646, "grad_norm": 0.3645786941051483, "learning_rate": 3.6168822537817906e-06, "loss": 0.3615, "step": 3741 }, { "epoch": 3.1516002245929253, "grad_norm": 0.3737828731536865, "learning_rate": 3.6140562410479385e-06, "loss": 0.3682, "step": 3742 }, { "epoch": 3.152442448062886, "grad_norm": 0.36485737562179565, "learning_rate": 3.611230707826864e-06, "loss": 0.3549, "step": 3743 }, { "epoch": 3.153284671532847, "grad_norm": 0.3139091432094574, "learning_rate": 3.6084056550961542e-06, "loss": 0.3352, "step": 3744 }, { "epoch": 3.1541268950028076, "grad_norm": 0.3602890372276306, "learning_rate": 3.6055810838332254e-06, "loss": 0.3494, "step": 3745 }, { "epoch": 3.154969118472768, "grad_norm": 0.39666301012039185, "learning_rate": 3.602756995015334e-06, "loss": 0.3387, "step": 3746 }, { "epoch": 3.1558113419427287, "grad_norm": 0.38664448261260986, "learning_rate": 3.5999333896195656e-06, "loss": 0.3766, "step": 3747 }, { "epoch": 3.1566535654126895, "grad_norm": 0.33268630504608154, "learning_rate": 3.5971102686228392e-06, "loss": 0.3677, "step": 3748 }, { "epoch": 3.1574957888826503, "grad_norm": 0.33481884002685547, "learning_rate": 3.5942876330019033e-06, "loss": 0.3518, "step": 3749 }, { "epoch": 3.158338012352611, "grad_norm": 0.3368445634841919, "learning_rate": 3.5914654837333472e-06, "loss": 0.3613, "step": 3750 }, { "epoch": 3.159180235822572, "grad_norm": 0.3745618760585785, "learning_rate": 3.588643821793582e-06, "loss": 0.3591, "step": 3751 }, { "epoch": 3.160022459292532, "grad_norm": 0.33904701471328735, "learning_rate": 3.5858226481588556e-06, "loss": 0.3297, "step": 3752 }, { "epoch": 3.160864682762493, "grad_norm": 0.37260785698890686, "learning_rate": 3.583001963805245e-06, "loss": 0.3662, "step": 3753 }, { "epoch": 3.1617069062324537, "grad_norm": 0.37788546085357666, "learning_rate": 3.5801817697086607e-06, "loss": 0.3559, "step": 3754 }, { "epoch": 3.1625491297024144, "grad_norm": 0.3544933497905731, "learning_rate": 3.5773620668448384e-06, "loss": 0.3613, "step": 3755 }, { "epoch": 3.163391353172375, "grad_norm": 0.37414270639419556, "learning_rate": 3.574542856189349e-06, "loss": 0.3655, "step": 3756 }, { "epoch": 3.164233576642336, "grad_norm": 0.3731330335140228, "learning_rate": 3.57172413871759e-06, "loss": 0.3658, "step": 3757 }, { "epoch": 3.1650758001122963, "grad_norm": 0.34042882919311523, "learning_rate": 3.5689059154047918e-06, "loss": 0.335, "step": 3758 }, { "epoch": 3.165918023582257, "grad_norm": 0.3460594415664673, "learning_rate": 3.566088187226008e-06, "loss": 0.3488, "step": 3759 }, { "epoch": 3.166760247052218, "grad_norm": 0.38330578804016113, "learning_rate": 3.563270955156126e-06, "loss": 0.359, "step": 3760 }, { "epoch": 3.1676024705221786, "grad_norm": 0.33506953716278076, "learning_rate": 3.5604542201698585e-06, "loss": 0.3418, "step": 3761 }, { "epoch": 3.1684446939921393, "grad_norm": 0.40221813321113586, "learning_rate": 3.557637983241749e-06, "loss": 0.357, "step": 3762 }, { "epoch": 3.1692869174621, "grad_norm": 0.36362424492836, "learning_rate": 3.554822245346168e-06, "loss": 0.3415, "step": 3763 }, { "epoch": 3.1701291409320604, "grad_norm": 0.37749430537223816, "learning_rate": 3.5520070074573103e-06, "loss": 0.3373, "step": 3764 }, { "epoch": 3.170971364402021, "grad_norm": 0.4104670584201813, "learning_rate": 3.5491922705492e-06, "loss": 0.3571, "step": 3765 }, { "epoch": 3.171813587871982, "grad_norm": 0.3986697196960449, "learning_rate": 3.546378035595689e-06, "loss": 0.3405, "step": 3766 }, { "epoch": 3.1726558113419427, "grad_norm": 0.34233662486076355, "learning_rate": 3.543564303570456e-06, "loss": 0.3578, "step": 3767 }, { "epoch": 3.1734980348119035, "grad_norm": 0.4340866506099701, "learning_rate": 3.5407510754469997e-06, "loss": 0.3567, "step": 3768 }, { "epoch": 3.1743402582818643, "grad_norm": 0.37797194719314575, "learning_rate": 3.537938352198651e-06, "loss": 0.3813, "step": 3769 }, { "epoch": 3.1751824817518246, "grad_norm": 0.3423570990562439, "learning_rate": 3.535126134798565e-06, "loss": 0.3529, "step": 3770 }, { "epoch": 3.1760247052217854, "grad_norm": 0.34908613562583923, "learning_rate": 3.5323144242197206e-06, "loss": 0.3536, "step": 3771 }, { "epoch": 3.176866928691746, "grad_norm": 0.37663257122039795, "learning_rate": 3.52950322143492e-06, "loss": 0.3635, "step": 3772 }, { "epoch": 3.177709152161707, "grad_norm": 0.3916175961494446, "learning_rate": 3.5266925274167903e-06, "loss": 0.3486, "step": 3773 }, { "epoch": 3.1785513756316677, "grad_norm": 0.31102195382118225, "learning_rate": 3.5238823431377854e-06, "loss": 0.3451, "step": 3774 }, { "epoch": 3.1793935991016284, "grad_norm": 0.3588477373123169, "learning_rate": 3.5210726695701814e-06, "loss": 0.3496, "step": 3775 }, { "epoch": 3.180235822571589, "grad_norm": 0.3541274070739746, "learning_rate": 3.5182635076860765e-06, "loss": 0.3473, "step": 3776 }, { "epoch": 3.1810780460415495, "grad_norm": 0.3668762743473053, "learning_rate": 3.5154548584573906e-06, "loss": 0.3534, "step": 3777 }, { "epoch": 3.1819202695115103, "grad_norm": 0.3604443073272705, "learning_rate": 3.5126467228558704e-06, "loss": 0.3533, "step": 3778 }, { "epoch": 3.182762492981471, "grad_norm": 0.3670567274093628, "learning_rate": 3.509839101853082e-06, "loss": 0.3472, "step": 3779 }, { "epoch": 3.183604716451432, "grad_norm": 0.3774792551994324, "learning_rate": 3.507031996420416e-06, "loss": 0.3759, "step": 3780 }, { "epoch": 3.1844469399213926, "grad_norm": 0.3404044806957245, "learning_rate": 3.5042254075290773e-06, "loss": 0.3574, "step": 3781 }, { "epoch": 3.1852891633913534, "grad_norm": 0.3592541217803955, "learning_rate": 3.5014193361501034e-06, "loss": 0.3926, "step": 3782 }, { "epoch": 3.1861313868613137, "grad_norm": 0.3874666392803192, "learning_rate": 3.498613783254345e-06, "loss": 0.3637, "step": 3783 }, { "epoch": 3.1869736103312745, "grad_norm": 0.3462994694709778, "learning_rate": 3.4958087498124748e-06, "loss": 0.3477, "step": 3784 }, { "epoch": 3.1878158338012352, "grad_norm": 0.32542794942855835, "learning_rate": 3.4930042367949844e-06, "loss": 0.3652, "step": 3785 }, { "epoch": 3.188658057271196, "grad_norm": 0.357170432806015, "learning_rate": 3.4902002451721916e-06, "loss": 0.3661, "step": 3786 }, { "epoch": 3.1895002807411568, "grad_norm": 0.3365301787853241, "learning_rate": 3.4873967759142265e-06, "loss": 0.346, "step": 3787 }, { "epoch": 3.1903425042111175, "grad_norm": 0.32911238074302673, "learning_rate": 3.4845938299910413e-06, "loss": 0.3387, "step": 3788 }, { "epoch": 3.191184727681078, "grad_norm": 0.3740141987800598, "learning_rate": 3.4817914083724074e-06, "loss": 0.3633, "step": 3789 }, { "epoch": 3.1920269511510386, "grad_norm": 0.4003942310810089, "learning_rate": 3.478989512027917e-06, "loss": 0.3704, "step": 3790 }, { "epoch": 3.1928691746209994, "grad_norm": 0.33969688415527344, "learning_rate": 3.4761881419269754e-06, "loss": 0.3364, "step": 3791 }, { "epoch": 3.19371139809096, "grad_norm": 0.3580355644226074, "learning_rate": 3.4733872990388083e-06, "loss": 0.3683, "step": 3792 }, { "epoch": 3.194553621560921, "grad_norm": 0.348638117313385, "learning_rate": 3.4705869843324614e-06, "loss": 0.3441, "step": 3793 }, { "epoch": 3.1953958450308817, "grad_norm": 0.4592438042163849, "learning_rate": 3.4677871987767953e-06, "loss": 0.3595, "step": 3794 }, { "epoch": 3.196238068500842, "grad_norm": 0.3424679636955261, "learning_rate": 3.4649879433404876e-06, "loss": 0.3672, "step": 3795 }, { "epoch": 3.197080291970803, "grad_norm": 0.3482770621776581, "learning_rate": 3.4621892189920307e-06, "loss": 0.3623, "step": 3796 }, { "epoch": 3.1979225154407636, "grad_norm": 0.3504347503185272, "learning_rate": 3.459391026699738e-06, "loss": 0.3483, "step": 3797 }, { "epoch": 3.1987647389107243, "grad_norm": 0.3628599941730499, "learning_rate": 3.4565933674317346e-06, "loss": 0.3641, "step": 3798 }, { "epoch": 3.199606962380685, "grad_norm": 0.378157377243042, "learning_rate": 3.4537962421559646e-06, "loss": 0.3584, "step": 3799 }, { "epoch": 3.200449185850646, "grad_norm": 0.34549403190612793, "learning_rate": 3.4509996518401794e-06, "loss": 0.3753, "step": 3800 }, { "epoch": 3.201291409320606, "grad_norm": 0.35378220677375793, "learning_rate": 3.4482035974519595e-06, "loss": 0.3906, "step": 3801 }, { "epoch": 3.202133632790567, "grad_norm": 0.3359163999557495, "learning_rate": 3.4454080799586866e-06, "loss": 0.3518, "step": 3802 }, { "epoch": 3.2029758562605277, "grad_norm": 0.34684810042381287, "learning_rate": 3.442613100327564e-06, "loss": 0.376, "step": 3803 }, { "epoch": 3.2038180797304885, "grad_norm": 0.37217870354652405, "learning_rate": 3.4398186595256035e-06, "loss": 0.3546, "step": 3804 }, { "epoch": 3.2046603032004493, "grad_norm": 0.32630985975265503, "learning_rate": 3.4370247585196387e-06, "loss": 0.3278, "step": 3805 }, { "epoch": 3.20550252667041, "grad_norm": 0.3434779942035675, "learning_rate": 3.4342313982763075e-06, "loss": 0.3506, "step": 3806 }, { "epoch": 3.206344750140371, "grad_norm": 0.34799695014953613, "learning_rate": 3.4314385797620664e-06, "loss": 0.3718, "step": 3807 }, { "epoch": 3.207186973610331, "grad_norm": 0.49243393540382385, "learning_rate": 3.428646303943181e-06, "loss": 0.3469, "step": 3808 }, { "epoch": 3.208029197080292, "grad_norm": 0.34673845767974854, "learning_rate": 3.4258545717857338e-06, "loss": 0.3474, "step": 3809 }, { "epoch": 3.2088714205502527, "grad_norm": 0.36602917313575745, "learning_rate": 3.4230633842556143e-06, "loss": 0.3497, "step": 3810 }, { "epoch": 3.2097136440202134, "grad_norm": 0.3561411499977112, "learning_rate": 3.4202727423185256e-06, "loss": 0.3453, "step": 3811 }, { "epoch": 3.210555867490174, "grad_norm": 0.3472278416156769, "learning_rate": 3.4174826469399807e-06, "loss": 0.3534, "step": 3812 }, { "epoch": 3.211398090960135, "grad_norm": 0.3283720314502716, "learning_rate": 3.414693099085308e-06, "loss": 0.3592, "step": 3813 }, { "epoch": 3.2122403144300953, "grad_norm": 0.34560295939445496, "learning_rate": 3.4119040997196418e-06, "loss": 0.3652, "step": 3814 }, { "epoch": 3.213082537900056, "grad_norm": 0.348244845867157, "learning_rate": 3.409115649807927e-06, "loss": 0.3589, "step": 3815 }, { "epoch": 3.213924761370017, "grad_norm": 0.35178840160369873, "learning_rate": 3.4063277503149196e-06, "loss": 0.3528, "step": 3816 }, { "epoch": 3.2147669848399776, "grad_norm": 0.3731711804866791, "learning_rate": 3.4035404022051856e-06, "loss": 0.3563, "step": 3817 }, { "epoch": 3.2156092083099383, "grad_norm": 0.3437410295009613, "learning_rate": 3.4007536064431013e-06, "loss": 0.3334, "step": 3818 }, { "epoch": 3.216451431779899, "grad_norm": 0.3453464210033417, "learning_rate": 3.3979673639928478e-06, "loss": 0.3609, "step": 3819 }, { "epoch": 3.2172936552498594, "grad_norm": 0.35908544063568115, "learning_rate": 3.3951816758184166e-06, "loss": 0.3385, "step": 3820 }, { "epoch": 3.21813587871982, "grad_norm": 0.3798445165157318, "learning_rate": 3.3923965428836102e-06, "loss": 0.3548, "step": 3821 }, { "epoch": 3.218978102189781, "grad_norm": 0.3302757441997528, "learning_rate": 3.389611966152038e-06, "loss": 0.3441, "step": 3822 }, { "epoch": 3.2198203256597417, "grad_norm": 0.3412247598171234, "learning_rate": 3.3868279465871123e-06, "loss": 0.3452, "step": 3823 }, { "epoch": 3.2206625491297025, "grad_norm": 0.370002418756485, "learning_rate": 3.3840444851520573e-06, "loss": 0.3494, "step": 3824 }, { "epoch": 3.2215047725996633, "grad_norm": 0.35597074031829834, "learning_rate": 3.381261582809904e-06, "loss": 0.3655, "step": 3825 }, { "epoch": 3.2223469960696236, "grad_norm": 0.34430015087127686, "learning_rate": 3.378479240523488e-06, "loss": 0.3551, "step": 3826 }, { "epoch": 3.2231892195395844, "grad_norm": 0.31613123416900635, "learning_rate": 3.3756974592554536e-06, "loss": 0.3604, "step": 3827 }, { "epoch": 3.224031443009545, "grad_norm": 0.38796523213386536, "learning_rate": 3.372916239968246e-06, "loss": 0.3738, "step": 3828 }, { "epoch": 3.224873666479506, "grad_norm": 1.3480110168457031, "learning_rate": 3.3701355836241224e-06, "loss": 0.3529, "step": 3829 }, { "epoch": 3.2257158899494667, "grad_norm": 0.3520650267601013, "learning_rate": 3.367355491185142e-06, "loss": 0.3609, "step": 3830 }, { "epoch": 3.2265581134194274, "grad_norm": 0.3054385483264923, "learning_rate": 3.3645759636131694e-06, "loss": 0.3769, "step": 3831 }, { "epoch": 3.2274003368893878, "grad_norm": 0.3450542986392975, "learning_rate": 3.3617970018698687e-06, "loss": 0.3644, "step": 3832 }, { "epoch": 3.2282425603593485, "grad_norm": 0.3805626630783081, "learning_rate": 3.3590186069167203e-06, "loss": 0.3736, "step": 3833 }, { "epoch": 3.2290847838293093, "grad_norm": 0.3959503471851349, "learning_rate": 3.3562407797149966e-06, "loss": 0.3675, "step": 3834 }, { "epoch": 3.22992700729927, "grad_norm": 0.3608335852622986, "learning_rate": 3.3534635212257803e-06, "loss": 0.365, "step": 3835 }, { "epoch": 3.230769230769231, "grad_norm": 0.3341190218925476, "learning_rate": 3.3506868324099517e-06, "loss": 0.3532, "step": 3836 }, { "epoch": 3.2316114542391916, "grad_norm": 0.3788456618785858, "learning_rate": 3.347910714228203e-06, "loss": 0.3695, "step": 3837 }, { "epoch": 3.2324536777091524, "grad_norm": 0.3860209882259369, "learning_rate": 3.3451351676410194e-06, "loss": 0.3631, "step": 3838 }, { "epoch": 3.2332959011791127, "grad_norm": 0.36175692081451416, "learning_rate": 3.3423601936086926e-06, "loss": 0.3414, "step": 3839 }, { "epoch": 3.2341381246490735, "grad_norm": 0.33441081643104553, "learning_rate": 3.339585793091318e-06, "loss": 0.3673, "step": 3840 }, { "epoch": 3.2349803481190342, "grad_norm": 0.3321206569671631, "learning_rate": 3.3368119670487907e-06, "loss": 0.3653, "step": 3841 }, { "epoch": 3.235822571588995, "grad_norm": 0.3607935607433319, "learning_rate": 3.334038716440805e-06, "loss": 0.369, "step": 3842 }, { "epoch": 3.2366647950589558, "grad_norm": 0.3625064194202423, "learning_rate": 3.331266042226858e-06, "loss": 0.388, "step": 3843 }, { "epoch": 3.2375070185289165, "grad_norm": 0.36212897300720215, "learning_rate": 3.328493945366249e-06, "loss": 0.3722, "step": 3844 }, { "epoch": 3.238349241998877, "grad_norm": 0.35092535614967346, "learning_rate": 3.325722426818076e-06, "loss": 0.3547, "step": 3845 }, { "epoch": 3.2391914654688376, "grad_norm": 0.6148232221603394, "learning_rate": 3.322951487541237e-06, "loss": 0.3678, "step": 3846 }, { "epoch": 3.2400336889387984, "grad_norm": 0.3746045231819153, "learning_rate": 3.3201811284944264e-06, "loss": 0.353, "step": 3847 }, { "epoch": 3.240875912408759, "grad_norm": 0.3138555586338043, "learning_rate": 3.317411350636145e-06, "loss": 0.3486, "step": 3848 }, { "epoch": 3.24171813587872, "grad_norm": 0.361093670129776, "learning_rate": 3.3146421549246866e-06, "loss": 0.3944, "step": 3849 }, { "epoch": 3.2425603593486807, "grad_norm": 0.31993037462234497, "learning_rate": 3.311873542318146e-06, "loss": 0.3609, "step": 3850 }, { "epoch": 3.243402582818641, "grad_norm": 0.3395708203315735, "learning_rate": 3.309105513774413e-06, "loss": 0.3827, "step": 3851 }, { "epoch": 3.244244806288602, "grad_norm": 0.3565386235713959, "learning_rate": 3.306338070251183e-06, "loss": 0.3561, "step": 3852 }, { "epoch": 3.2450870297585626, "grad_norm": 0.35056766867637634, "learning_rate": 3.3035712127059395e-06, "loss": 0.3749, "step": 3853 }, { "epoch": 3.2459292532285233, "grad_norm": 0.3629697561264038, "learning_rate": 3.3008049420959715e-06, "loss": 0.3618, "step": 3854 }, { "epoch": 3.246771476698484, "grad_norm": 0.3297300338745117, "learning_rate": 3.2980392593783563e-06, "loss": 0.3495, "step": 3855 }, { "epoch": 3.247613700168445, "grad_norm": 0.33613321185112, "learning_rate": 3.2952741655099784e-06, "loss": 0.3507, "step": 3856 }, { "epoch": 3.248455923638405, "grad_norm": 0.335597962141037, "learning_rate": 3.2925096614475087e-06, "loss": 0.3496, "step": 3857 }, { "epoch": 3.249298147108366, "grad_norm": 0.3591141104698181, "learning_rate": 3.2897457481474205e-06, "loss": 0.3529, "step": 3858 }, { "epoch": 3.2501403705783267, "grad_norm": 0.3363160490989685, "learning_rate": 3.2869824265659777e-06, "loss": 0.3448, "step": 3859 }, { "epoch": 3.2509825940482875, "grad_norm": 0.33144447207450867, "learning_rate": 3.2842196976592466e-06, "loss": 0.365, "step": 3860 }, { "epoch": 3.2518248175182483, "grad_norm": 0.3518591821193695, "learning_rate": 3.2814575623830802e-06, "loss": 0.3395, "step": 3861 }, { "epoch": 3.252667040988209, "grad_norm": 0.364288866519928, "learning_rate": 3.278696021693131e-06, "loss": 0.3495, "step": 3862 }, { "epoch": 3.2535092644581693, "grad_norm": 0.33157840371131897, "learning_rate": 3.275935076544845e-06, "loss": 0.3705, "step": 3863 }, { "epoch": 3.25435148792813, "grad_norm": 0.36609193682670593, "learning_rate": 3.273174727893463e-06, "loss": 0.331, "step": 3864 }, { "epoch": 3.255193711398091, "grad_norm": 0.3362012803554535, "learning_rate": 3.270414976694016e-06, "loss": 0.36, "step": 3865 }, { "epoch": 3.2560359348680517, "grad_norm": 0.37041476368904114, "learning_rate": 3.2676558239013324e-06, "loss": 0.3295, "step": 3866 }, { "epoch": 3.2568781583380124, "grad_norm": 0.3387551009654999, "learning_rate": 3.2648972704700287e-06, "loss": 0.3557, "step": 3867 }, { "epoch": 3.257720381807973, "grad_norm": 0.32523947954177856, "learning_rate": 3.2621393173545217e-06, "loss": 0.3505, "step": 3868 }, { "epoch": 3.258562605277934, "grad_norm": 0.3354189693927765, "learning_rate": 3.259381965509015e-06, "loss": 0.3536, "step": 3869 }, { "epoch": 3.2594048287478943, "grad_norm": 0.32876530289649963, "learning_rate": 3.2566252158875016e-06, "loss": 0.3561, "step": 3870 }, { "epoch": 3.260247052217855, "grad_norm": 0.36183592677116394, "learning_rate": 3.2538690694437714e-06, "loss": 0.3399, "step": 3871 }, { "epoch": 3.261089275687816, "grad_norm": 0.3199988603591919, "learning_rate": 3.2511135271314054e-06, "loss": 0.3523, "step": 3872 }, { "epoch": 3.2619314991577766, "grad_norm": 0.31709322333335876, "learning_rate": 3.248358589903773e-06, "loss": 0.3401, "step": 3873 }, { "epoch": 3.2627737226277373, "grad_norm": 0.34390029311180115, "learning_rate": 3.2456042587140347e-06, "loss": 0.3838, "step": 3874 }, { "epoch": 3.2636159460976977, "grad_norm": 0.3597283363342285, "learning_rate": 3.2428505345151407e-06, "loss": 0.351, "step": 3875 }, { "epoch": 3.2644581695676584, "grad_norm": 0.3593129515647888, "learning_rate": 3.2400974182598344e-06, "loss": 0.3706, "step": 3876 }, { "epoch": 3.265300393037619, "grad_norm": 0.3386133909225464, "learning_rate": 3.2373449109006476e-06, "loss": 0.3634, "step": 3877 }, { "epoch": 3.26614261650758, "grad_norm": 0.37388014793395996, "learning_rate": 3.2345930133898985e-06, "loss": 0.3815, "step": 3878 }, { "epoch": 3.2669848399775407, "grad_norm": 0.36309894919395447, "learning_rate": 3.2318417266796966e-06, "loss": 0.3412, "step": 3879 }, { "epoch": 3.2678270634475015, "grad_norm": 0.3403424620628357, "learning_rate": 3.229091051721941e-06, "loss": 0.3355, "step": 3880 }, { "epoch": 3.2686692869174623, "grad_norm": 0.3320697844028473, "learning_rate": 3.226340989468317e-06, "loss": 0.3615, "step": 3881 }, { "epoch": 3.2695115103874226, "grad_norm": 0.3660401403903961, "learning_rate": 3.2235915408703015e-06, "loss": 0.3762, "step": 3882 }, { "epoch": 3.2703537338573834, "grad_norm": 0.33671319484710693, "learning_rate": 3.2208427068791527e-06, "loss": 0.3712, "step": 3883 }, { "epoch": 3.271195957327344, "grad_norm": 0.3428006172180176, "learning_rate": 3.218094488445923e-06, "loss": 0.3707, "step": 3884 }, { "epoch": 3.272038180797305, "grad_norm": 0.33781513571739197, "learning_rate": 3.215346886521448e-06, "loss": 0.3716, "step": 3885 }, { "epoch": 3.2728804042672657, "grad_norm": 0.32511621713638306, "learning_rate": 3.212599902056351e-06, "loss": 0.3507, "step": 3886 }, { "epoch": 3.2737226277372264, "grad_norm": 0.4083346724510193, "learning_rate": 3.2098535360010384e-06, "loss": 0.3754, "step": 3887 }, { "epoch": 3.274564851207187, "grad_norm": 0.3658863604068756, "learning_rate": 3.2071077893057124e-06, "loss": 0.3724, "step": 3888 }, { "epoch": 3.2754070746771475, "grad_norm": 0.34039202332496643, "learning_rate": 3.2043626629203488e-06, "loss": 0.358, "step": 3889 }, { "epoch": 3.2762492981471083, "grad_norm": 0.34395745396614075, "learning_rate": 3.201618157794715e-06, "loss": 0.3567, "step": 3890 }, { "epoch": 3.277091521617069, "grad_norm": 0.41075563430786133, "learning_rate": 3.198874274878365e-06, "loss": 0.344, "step": 3891 }, { "epoch": 3.27793374508703, "grad_norm": 0.3465714454650879, "learning_rate": 3.196131015120635e-06, "loss": 0.3509, "step": 3892 }, { "epoch": 3.2787759685569906, "grad_norm": 0.33784720301628113, "learning_rate": 3.1933883794706434e-06, "loss": 0.3625, "step": 3893 }, { "epoch": 3.279618192026951, "grad_norm": 0.33036261796951294, "learning_rate": 3.190646368877296e-06, "loss": 0.3731, "step": 3894 }, { "epoch": 3.2804604154969117, "grad_norm": 0.37216717004776, "learning_rate": 3.1879049842892824e-06, "loss": 0.3601, "step": 3895 }, { "epoch": 3.2813026389668725, "grad_norm": 0.41309523582458496, "learning_rate": 3.1851642266550754e-06, "loss": 0.355, "step": 3896 }, { "epoch": 3.2821448624368332, "grad_norm": 0.33816322684288025, "learning_rate": 3.182424096922928e-06, "loss": 0.3524, "step": 3897 }, { "epoch": 3.282987085906794, "grad_norm": 0.3470172882080078, "learning_rate": 3.179684596040878e-06, "loss": 0.3408, "step": 3898 }, { "epoch": 3.2838293093767548, "grad_norm": 0.32205888628959656, "learning_rate": 3.1769457249567477e-06, "loss": 0.3809, "step": 3899 }, { "epoch": 3.2846715328467155, "grad_norm": 0.4093814194202423, "learning_rate": 3.174207484618138e-06, "loss": 0.3594, "step": 3900 }, { "epoch": 3.285513756316676, "grad_norm": 0.3864290118217468, "learning_rate": 3.171469875972436e-06, "loss": 0.3374, "step": 3901 }, { "epoch": 3.2863559797866366, "grad_norm": 0.3624088764190674, "learning_rate": 3.168732899966802e-06, "loss": 0.3722, "step": 3902 }, { "epoch": 3.2871982032565974, "grad_norm": 0.35173699259757996, "learning_rate": 3.165996557548188e-06, "loss": 0.373, "step": 3903 }, { "epoch": 3.288040426726558, "grad_norm": 0.4438587427139282, "learning_rate": 3.1632608496633183e-06, "loss": 0.3683, "step": 3904 }, { "epoch": 3.288882650196519, "grad_norm": 0.3858090043067932, "learning_rate": 3.160525777258703e-06, "loss": 0.3471, "step": 3905 }, { "epoch": 3.2897248736664797, "grad_norm": 0.34025838971138, "learning_rate": 3.157791341280627e-06, "loss": 0.3325, "step": 3906 }, { "epoch": 3.29056709713644, "grad_norm": 0.4680365324020386, "learning_rate": 3.155057542675163e-06, "loss": 0.3557, "step": 3907 }, { "epoch": 3.291409320606401, "grad_norm": 0.3655892014503479, "learning_rate": 3.1523243823881554e-06, "loss": 0.344, "step": 3908 }, { "epoch": 3.2922515440763616, "grad_norm": 0.3896203339099884, "learning_rate": 3.149591861365232e-06, "loss": 0.3638, "step": 3909 }, { "epoch": 3.2930937675463223, "grad_norm": 0.3546338677406311, "learning_rate": 3.1468599805517954e-06, "loss": 0.3796, "step": 3910 }, { "epoch": 3.293935991016283, "grad_norm": 0.40060073137283325, "learning_rate": 3.144128740893034e-06, "loss": 0.3389, "step": 3911 }, { "epoch": 3.294778214486244, "grad_norm": 0.372593492269516, "learning_rate": 3.141398143333907e-06, "loss": 0.3453, "step": 3912 }, { "epoch": 3.295620437956204, "grad_norm": 0.3653518855571747, "learning_rate": 3.1386681888191545e-06, "loss": 0.3745, "step": 3913 }, { "epoch": 3.296462661426165, "grad_norm": 0.3641628324985504, "learning_rate": 3.1359388782932937e-06, "loss": 0.3649, "step": 3914 }, { "epoch": 3.2973048848961257, "grad_norm": 0.406724750995636, "learning_rate": 3.133210212700622e-06, "loss": 0.3802, "step": 3915 }, { "epoch": 3.2981471083660865, "grad_norm": 0.3820309638977051, "learning_rate": 3.1304821929852074e-06, "loss": 0.3376, "step": 3916 }, { "epoch": 3.2989893318360473, "grad_norm": 0.3617289960384369, "learning_rate": 3.1277548200909e-06, "loss": 0.3652, "step": 3917 }, { "epoch": 3.299831555306008, "grad_norm": 0.35112613439559937, "learning_rate": 3.125028094961322e-06, "loss": 0.3329, "step": 3918 }, { "epoch": 3.300673778775969, "grad_norm": 0.3764786720275879, "learning_rate": 3.122302018539877e-06, "loss": 0.38, "step": 3919 }, { "epoch": 3.301516002245929, "grad_norm": 0.37302160263061523, "learning_rate": 3.1195765917697383e-06, "loss": 0.3458, "step": 3920 }, { "epoch": 3.30235822571589, "grad_norm": 0.3473343849182129, "learning_rate": 3.1168518155938577e-06, "loss": 0.3457, "step": 3921 }, { "epoch": 3.3032004491858507, "grad_norm": 0.38351672887802124, "learning_rate": 3.1141276909549583e-06, "loss": 0.3404, "step": 3922 }, { "epoch": 3.3040426726558114, "grad_norm": 0.3861021101474762, "learning_rate": 3.111404218795544e-06, "loss": 0.3784, "step": 3923 }, { "epoch": 3.304884896125772, "grad_norm": 0.321043998003006, "learning_rate": 3.1086814000578903e-06, "loss": 0.3679, "step": 3924 }, { "epoch": 3.3057271195957325, "grad_norm": 0.3557541072368622, "learning_rate": 3.105959235684042e-06, "loss": 0.3725, "step": 3925 }, { "epoch": 3.3065693430656933, "grad_norm": 0.3476422131061554, "learning_rate": 3.103237726615822e-06, "loss": 0.3661, "step": 3926 }, { "epoch": 3.307411566535654, "grad_norm": 0.3327978551387787, "learning_rate": 3.1005168737948277e-06, "loss": 0.3508, "step": 3927 }, { "epoch": 3.308253790005615, "grad_norm": 0.34753647446632385, "learning_rate": 3.097796678162428e-06, "loss": 0.3388, "step": 3928 }, { "epoch": 3.3090960134755756, "grad_norm": 0.3396444320678711, "learning_rate": 3.095077140659762e-06, "loss": 0.3639, "step": 3929 }, { "epoch": 3.3099382369455363, "grad_norm": 0.34610792994499207, "learning_rate": 3.092358262227742e-06, "loss": 0.3754, "step": 3930 }, { "epoch": 3.310780460415497, "grad_norm": 0.3500807583332062, "learning_rate": 3.089640043807056e-06, "loss": 0.3812, "step": 3931 }, { "epoch": 3.3116226838854574, "grad_norm": 0.33787235617637634, "learning_rate": 3.0869224863381606e-06, "loss": 0.3474, "step": 3932 }, { "epoch": 3.312464907355418, "grad_norm": 0.36964499950408936, "learning_rate": 3.0842055907612846e-06, "loss": 0.3644, "step": 3933 }, { "epoch": 3.313307130825379, "grad_norm": 0.37336137890815735, "learning_rate": 3.081489358016423e-06, "loss": 0.3588, "step": 3934 }, { "epoch": 3.3141493542953397, "grad_norm": 0.33108067512512207, "learning_rate": 3.0787737890433505e-06, "loss": 0.361, "step": 3935 }, { "epoch": 3.3149915777653005, "grad_norm": 0.3462041914463043, "learning_rate": 3.076058884781606e-06, "loss": 0.3597, "step": 3936 }, { "epoch": 3.3158338012352613, "grad_norm": 0.39362645149230957, "learning_rate": 3.0733446461705004e-06, "loss": 0.3572, "step": 3937 }, { "epoch": 3.3166760247052216, "grad_norm": 0.3428730368614197, "learning_rate": 3.0706310741491106e-06, "loss": 0.356, "step": 3938 }, { "epoch": 3.3175182481751824, "grad_norm": 0.3211665451526642, "learning_rate": 3.06791816965629e-06, "loss": 0.3685, "step": 3939 }, { "epoch": 3.318360471645143, "grad_norm": 0.3578265607357025, "learning_rate": 3.065205933630655e-06, "loss": 0.3887, "step": 3940 }, { "epoch": 3.319202695115104, "grad_norm": 0.3659106492996216, "learning_rate": 3.062494367010592e-06, "loss": 0.3854, "step": 3941 }, { "epoch": 3.3200449185850647, "grad_norm": 0.3483541011810303, "learning_rate": 3.059783470734259e-06, "loss": 0.3648, "step": 3942 }, { "epoch": 3.3208871420550254, "grad_norm": 0.35765302181243896, "learning_rate": 3.057073245739579e-06, "loss": 0.3736, "step": 3943 }, { "epoch": 3.3217293655249858, "grad_norm": 0.3412524163722992, "learning_rate": 3.054363692964242e-06, "loss": 0.3685, "step": 3944 }, { "epoch": 3.3225715889949465, "grad_norm": 0.342630535364151, "learning_rate": 3.0516548133457057e-06, "loss": 0.3579, "step": 3945 }, { "epoch": 3.3234138124649073, "grad_norm": 0.32356715202331543, "learning_rate": 3.0489466078212e-06, "loss": 0.3616, "step": 3946 }, { "epoch": 3.324256035934868, "grad_norm": 0.3270165026187897, "learning_rate": 3.0462390773277157e-06, "loss": 0.359, "step": 3947 }, { "epoch": 3.325098259404829, "grad_norm": 0.3534395694732666, "learning_rate": 3.04353222280201e-06, "loss": 0.3479, "step": 3948 }, { "epoch": 3.3259404828747896, "grad_norm": 0.3093346357345581, "learning_rate": 3.0408260451806104e-06, "loss": 0.3762, "step": 3949 }, { "epoch": 3.3267827063447504, "grad_norm": 0.38127076625823975, "learning_rate": 3.0381205453998077e-06, "loss": 0.3631, "step": 3950 }, { "epoch": 3.3276249298147107, "grad_norm": 0.34506914019584656, "learning_rate": 3.0354157243956585e-06, "loss": 0.3421, "step": 3951 }, { "epoch": 3.3284671532846715, "grad_norm": 0.32608020305633545, "learning_rate": 3.0327115831039834e-06, "loss": 0.3614, "step": 3952 }, { "epoch": 3.3293093767546322, "grad_norm": 0.32875287532806396, "learning_rate": 3.030008122460368e-06, "loss": 0.3152, "step": 3953 }, { "epoch": 3.330151600224593, "grad_norm": 0.3296133875846863, "learning_rate": 3.0273053434001665e-06, "loss": 0.3446, "step": 3954 }, { "epoch": 3.3309938236945538, "grad_norm": 0.3392683267593384, "learning_rate": 3.0246032468584924e-06, "loss": 0.3683, "step": 3955 }, { "epoch": 3.331836047164514, "grad_norm": 0.31416991353034973, "learning_rate": 3.021901833770226e-06, "loss": 0.3477, "step": 3956 }, { "epoch": 3.332678270634475, "grad_norm": 0.3733919560909271, "learning_rate": 3.0192011050700063e-06, "loss": 0.366, "step": 3957 }, { "epoch": 3.3335204941044356, "grad_norm": 0.32420992851257324, "learning_rate": 3.016501061692243e-06, "loss": 0.3711, "step": 3958 }, { "epoch": 3.3343627175743964, "grad_norm": 0.34191447496414185, "learning_rate": 3.0138017045711036e-06, "loss": 0.3424, "step": 3959 }, { "epoch": 3.335204941044357, "grad_norm": 0.3429345190525055, "learning_rate": 3.01110303464052e-06, "loss": 0.3545, "step": 3960 }, { "epoch": 3.336047164514318, "grad_norm": 0.3393116891384125, "learning_rate": 3.0084050528341826e-06, "loss": 0.3532, "step": 3961 }, { "epoch": 3.3368893879842787, "grad_norm": 0.33168676495552063, "learning_rate": 3.0057077600855512e-06, "loss": 0.3691, "step": 3962 }, { "epoch": 3.337731611454239, "grad_norm": 0.33751994371414185, "learning_rate": 3.00301115732784e-06, "loss": 0.3638, "step": 3963 }, { "epoch": 3.3385738349242, "grad_norm": 0.33293965458869934, "learning_rate": 3.0003152454940295e-06, "loss": 0.352, "step": 3964 }, { "epoch": 3.3394160583941606, "grad_norm": 0.32699623703956604, "learning_rate": 2.997620025516854e-06, "loss": 0.3611, "step": 3965 }, { "epoch": 3.3402582818641213, "grad_norm": 0.37057948112487793, "learning_rate": 2.9949254983288206e-06, "loss": 0.3462, "step": 3966 }, { "epoch": 3.341100505334082, "grad_norm": 0.3751826882362366, "learning_rate": 2.992231664862185e-06, "loss": 0.3404, "step": 3967 }, { "epoch": 3.341942728804043, "grad_norm": 0.3172108829021454, "learning_rate": 2.989538526048968e-06, "loss": 0.3665, "step": 3968 }, { "epoch": 3.342784952274003, "grad_norm": 0.33676087856292725, "learning_rate": 2.986846082820949e-06, "loss": 0.3674, "step": 3969 }, { "epoch": 3.343627175743964, "grad_norm": 0.33155152201652527, "learning_rate": 2.9841543361096693e-06, "loss": 0.3583, "step": 3970 }, { "epoch": 3.3444693992139247, "grad_norm": 0.33290767669677734, "learning_rate": 2.9814632868464254e-06, "loss": 0.3825, "step": 3971 }, { "epoch": 3.3453116226838855, "grad_norm": 0.3355264365673065, "learning_rate": 2.9787729359622747e-06, "loss": 0.3516, "step": 3972 }, { "epoch": 3.3461538461538463, "grad_norm": 0.3262758255004883, "learning_rate": 2.976083284388031e-06, "loss": 0.3578, "step": 3973 }, { "epoch": 3.346996069623807, "grad_norm": 0.3335486054420471, "learning_rate": 2.97339433305427e-06, "loss": 0.3498, "step": 3974 }, { "epoch": 3.3478382930937673, "grad_norm": 0.3504367470741272, "learning_rate": 2.9707060828913226e-06, "loss": 0.3617, "step": 3975 }, { "epoch": 3.348680516563728, "grad_norm": 0.3341201841831207, "learning_rate": 2.9680185348292756e-06, "loss": 0.3686, "step": 3976 }, { "epoch": 3.349522740033689, "grad_norm": 0.31374886631965637, "learning_rate": 2.9653316897979744e-06, "loss": 0.3554, "step": 3977 }, { "epoch": 3.3503649635036497, "grad_norm": 0.33699849247932434, "learning_rate": 2.9626455487270235e-06, "loss": 0.3693, "step": 3978 }, { "epoch": 3.3512071869736104, "grad_norm": 0.3691845238208771, "learning_rate": 2.959960112545781e-06, "loss": 0.3474, "step": 3979 }, { "epoch": 3.352049410443571, "grad_norm": 0.37192556262016296, "learning_rate": 2.95727538218336e-06, "loss": 0.3624, "step": 3980 }, { "epoch": 3.352891633913532, "grad_norm": 0.338464617729187, "learning_rate": 2.954591358568632e-06, "loss": 0.3715, "step": 3981 }, { "epoch": 3.3537338573834923, "grad_norm": 0.3359435200691223, "learning_rate": 2.9519080426302237e-06, "loss": 0.3614, "step": 3982 }, { "epoch": 3.354576080853453, "grad_norm": 0.37928587198257446, "learning_rate": 2.9492254352965184e-06, "loss": 0.3436, "step": 3983 }, { "epoch": 3.355418304323414, "grad_norm": 0.3882048726081848, "learning_rate": 2.946543537495648e-06, "loss": 0.3732, "step": 3984 }, { "epoch": 3.3562605277933746, "grad_norm": 0.34150663018226624, "learning_rate": 2.9438623501555046e-06, "loss": 0.3645, "step": 3985 }, { "epoch": 3.3571027512633353, "grad_norm": 0.32875126600265503, "learning_rate": 2.9411818742037347e-06, "loss": 0.3575, "step": 3986 }, { "epoch": 3.3579449747332957, "grad_norm": 0.3684440851211548, "learning_rate": 2.938502110567736e-06, "loss": 0.3423, "step": 3987 }, { "epoch": 3.3587871982032564, "grad_norm": 0.3557639420032501, "learning_rate": 2.9358230601746617e-06, "loss": 0.3805, "step": 3988 }, { "epoch": 3.359629421673217, "grad_norm": 0.3488691747188568, "learning_rate": 2.9331447239514146e-06, "loss": 0.3749, "step": 3989 }, { "epoch": 3.360471645143178, "grad_norm": 0.3367971181869507, "learning_rate": 2.9304671028246556e-06, "loss": 0.3665, "step": 3990 }, { "epoch": 3.3613138686131387, "grad_norm": 0.33799633383750916, "learning_rate": 2.9277901977207946e-06, "loss": 0.3423, "step": 3991 }, { "epoch": 3.3621560920830995, "grad_norm": 0.3693762421607971, "learning_rate": 2.9251140095659947e-06, "loss": 0.3269, "step": 3992 }, { "epoch": 3.3629983155530603, "grad_norm": 0.35711947083473206, "learning_rate": 2.9224385392861725e-06, "loss": 0.3449, "step": 3993 }, { "epoch": 3.3638405390230206, "grad_norm": 0.352403461933136, "learning_rate": 2.919763787806995e-06, "loss": 0.3619, "step": 3994 }, { "epoch": 3.3646827624929814, "grad_norm": 0.3579275906085968, "learning_rate": 2.9170897560538786e-06, "loss": 0.3681, "step": 3995 }, { "epoch": 3.365524985962942, "grad_norm": 0.32123368978500366, "learning_rate": 2.9144164449519917e-06, "loss": 0.3485, "step": 3996 }, { "epoch": 3.366367209432903, "grad_norm": 0.35754892230033875, "learning_rate": 2.911743855426258e-06, "loss": 0.34, "step": 3997 }, { "epoch": 3.3672094329028637, "grad_norm": 0.3561857044696808, "learning_rate": 2.909071988401343e-06, "loss": 0.3636, "step": 3998 }, { "epoch": 3.3680516563728244, "grad_norm": 0.3341846764087677, "learning_rate": 2.9064008448016713e-06, "loss": 0.3677, "step": 3999 }, { "epoch": 3.3688938798427848, "grad_norm": 0.33063259720802307, "learning_rate": 2.9037304255514087e-06, "loss": 0.3345, "step": 4000 }, { "epoch": 3.3697361033127455, "grad_norm": 0.3148464560508728, "learning_rate": 2.901060731574478e-06, "loss": 0.3768, "step": 4001 }, { "epoch": 3.3705783267827063, "grad_norm": 0.38679176568984985, "learning_rate": 2.898391763794545e-06, "loss": 0.3783, "step": 4002 }, { "epoch": 3.371420550252667, "grad_norm": 0.33660614490509033, "learning_rate": 2.8957235231350276e-06, "loss": 0.3492, "step": 4003 }, { "epoch": 3.372262773722628, "grad_norm": 0.3140118718147278, "learning_rate": 2.893056010519091e-06, "loss": 0.3493, "step": 4004 }, { "epoch": 3.3731049971925886, "grad_norm": 0.3523498773574829, "learning_rate": 2.890389226869651e-06, "loss": 0.3572, "step": 4005 }, { "epoch": 3.373947220662549, "grad_norm": 0.34020718932151794, "learning_rate": 2.887723173109366e-06, "loss": 0.3555, "step": 4006 }, { "epoch": 3.3747894441325097, "grad_norm": 0.3540451228618622, "learning_rate": 2.885057850160647e-06, "loss": 0.3796, "step": 4007 }, { "epoch": 3.3756316676024705, "grad_norm": 0.3257458806037903, "learning_rate": 2.882393258945647e-06, "loss": 0.3376, "step": 4008 }, { "epoch": 3.3764738910724312, "grad_norm": 0.344370037317276, "learning_rate": 2.879729400386272e-06, "loss": 0.3534, "step": 4009 }, { "epoch": 3.377316114542392, "grad_norm": 0.33173030614852905, "learning_rate": 2.8770662754041725e-06, "loss": 0.3619, "step": 4010 }, { "epoch": 3.3781583380123528, "grad_norm": 0.331860214471817, "learning_rate": 2.874403884920741e-06, "loss": 0.3561, "step": 4011 }, { "epoch": 3.3790005614823135, "grad_norm": 0.36565935611724854, "learning_rate": 2.871742229857119e-06, "loss": 0.3419, "step": 4012 }, { "epoch": 3.379842784952274, "grad_norm": 0.35981839895248413, "learning_rate": 2.869081311134194e-06, "loss": 0.3611, "step": 4013 }, { "epoch": 3.3806850084222346, "grad_norm": 0.32831263542175293, "learning_rate": 2.8664211296726006e-06, "loss": 0.3644, "step": 4014 }, { "epoch": 3.3815272318921954, "grad_norm": 0.3491017818450928, "learning_rate": 2.8637616863927155e-06, "loss": 0.3587, "step": 4015 }, { "epoch": 3.382369455362156, "grad_norm": 0.3530135750770569, "learning_rate": 2.861102982214656e-06, "loss": 0.3542, "step": 4016 }, { "epoch": 3.383211678832117, "grad_norm": 0.34738925099372864, "learning_rate": 2.858445018058291e-06, "loss": 0.355, "step": 4017 }, { "epoch": 3.3840539023020773, "grad_norm": 0.36480939388275146, "learning_rate": 2.8557877948432334e-06, "loss": 0.3614, "step": 4018 }, { "epoch": 3.384896125772038, "grad_norm": 0.30635493993759155, "learning_rate": 2.853131313488834e-06, "loss": 0.3404, "step": 4019 }, { "epoch": 3.385738349241999, "grad_norm": 0.3506971001625061, "learning_rate": 2.8504755749141884e-06, "loss": 0.3856, "step": 4020 }, { "epoch": 3.3865805727119596, "grad_norm": 0.35609039664268494, "learning_rate": 2.8478205800381383e-06, "loss": 0.3836, "step": 4021 }, { "epoch": 3.3874227961819203, "grad_norm": 0.33968210220336914, "learning_rate": 2.8451663297792682e-06, "loss": 0.355, "step": 4022 }, { "epoch": 3.388265019651881, "grad_norm": 0.35853901505470276, "learning_rate": 2.842512825055901e-06, "loss": 0.3582, "step": 4023 }, { "epoch": 3.389107243121842, "grad_norm": 0.3334835171699524, "learning_rate": 2.8398600667861032e-06, "loss": 0.3302, "step": 4024 }, { "epoch": 3.389949466591802, "grad_norm": 0.34102413058280945, "learning_rate": 2.8372080558876836e-06, "loss": 0.347, "step": 4025 }, { "epoch": 3.390791690061763, "grad_norm": 0.32541221380233765, "learning_rate": 2.834556793278196e-06, "loss": 0.3818, "step": 4026 }, { "epoch": 3.3916339135317237, "grad_norm": 0.3621748685836792, "learning_rate": 2.83190627987493e-06, "loss": 0.3746, "step": 4027 }, { "epoch": 3.3924761370016845, "grad_norm": 0.31530892848968506, "learning_rate": 2.8292565165949144e-06, "loss": 0.3499, "step": 4028 }, { "epoch": 3.3933183604716453, "grad_norm": 0.36171942949295044, "learning_rate": 2.8266075043549245e-06, "loss": 0.3872, "step": 4029 }, { "epoch": 3.394160583941606, "grad_norm": 0.3305433690547943, "learning_rate": 2.823959244071476e-06, "loss": 0.3815, "step": 4030 }, { "epoch": 3.3950028074115663, "grad_norm": 0.353001207113266, "learning_rate": 2.821311736660819e-06, "loss": 0.3406, "step": 4031 }, { "epoch": 3.395845030881527, "grad_norm": 0.3424752652645111, "learning_rate": 2.8186649830389406e-06, "loss": 0.3529, "step": 4032 }, { "epoch": 3.396687254351488, "grad_norm": 0.3342741131782532, "learning_rate": 2.816018984121581e-06, "loss": 0.3493, "step": 4033 }, { "epoch": 3.3975294778214487, "grad_norm": 0.34895989298820496, "learning_rate": 2.813373740824208e-06, "loss": 0.3625, "step": 4034 }, { "epoch": 3.3983717012914094, "grad_norm": 0.31386202573776245, "learning_rate": 2.8107292540620258e-06, "loss": 0.328, "step": 4035 }, { "epoch": 3.39921392476137, "grad_norm": 0.3151000440120697, "learning_rate": 2.8080855247499844e-06, "loss": 0.3453, "step": 4036 }, { "epoch": 3.4000561482313305, "grad_norm": 0.3565564751625061, "learning_rate": 2.805442553802772e-06, "loss": 0.347, "step": 4037 }, { "epoch": 3.4008983717012913, "grad_norm": 0.34958919882774353, "learning_rate": 2.8028003421348073e-06, "loss": 0.3826, "step": 4038 }, { "epoch": 3.401740595171252, "grad_norm": 0.35022613406181335, "learning_rate": 2.8001588906602495e-06, "loss": 0.3744, "step": 4039 }, { "epoch": 3.402582818641213, "grad_norm": 0.32433995604515076, "learning_rate": 2.797518200292997e-06, "loss": 0.3499, "step": 4040 }, { "epoch": 3.4034250421111736, "grad_norm": 0.3404334485530853, "learning_rate": 2.794878271946686e-06, "loss": 0.3653, "step": 4041 }, { "epoch": 3.4042672655811343, "grad_norm": 0.36992916464805603, "learning_rate": 2.7922391065346844e-06, "loss": 0.349, "step": 4042 }, { "epoch": 3.405109489051095, "grad_norm": 0.3656443655490875, "learning_rate": 2.7896007049700954e-06, "loss": 0.3776, "step": 4043 }, { "epoch": 3.4059517125210554, "grad_norm": 0.3335869610309601, "learning_rate": 2.7869630681657627e-06, "loss": 0.3316, "step": 4044 }, { "epoch": 3.406793935991016, "grad_norm": 0.3384896218776703, "learning_rate": 2.784326197034266e-06, "loss": 0.3519, "step": 4045 }, { "epoch": 3.407636159460977, "grad_norm": 0.353902667760849, "learning_rate": 2.7816900924879164e-06, "loss": 0.3437, "step": 4046 }, { "epoch": 3.4084783829309377, "grad_norm": 0.3571361303329468, "learning_rate": 2.7790547554387572e-06, "loss": 0.3844, "step": 4047 }, { "epoch": 3.4093206064008985, "grad_norm": 0.35536667704582214, "learning_rate": 2.776420186798572e-06, "loss": 0.3464, "step": 4048 }, { "epoch": 3.410162829870859, "grad_norm": 0.324716717004776, "learning_rate": 2.773786387478879e-06, "loss": 0.3365, "step": 4049 }, { "epoch": 3.4110050533408196, "grad_norm": 0.3453685939311981, "learning_rate": 2.7711533583909254e-06, "loss": 0.3584, "step": 4050 }, { "epoch": 3.4118472768107804, "grad_norm": 0.3238871395587921, "learning_rate": 2.7685211004456902e-06, "loss": 0.3746, "step": 4051 }, { "epoch": 3.412689500280741, "grad_norm": 0.3406463861465454, "learning_rate": 2.7658896145538984e-06, "loss": 0.3606, "step": 4052 }, { "epoch": 3.413531723750702, "grad_norm": 0.3186098635196686, "learning_rate": 2.7632589016259927e-06, "loss": 0.344, "step": 4053 }, { "epoch": 3.4143739472206627, "grad_norm": 0.32750093936920166, "learning_rate": 2.7606289625721573e-06, "loss": 0.3368, "step": 4054 }, { "epoch": 3.4152161706906234, "grad_norm": 0.3387409448623657, "learning_rate": 2.7579997983022997e-06, "loss": 0.3772, "step": 4055 }, { "epoch": 3.4160583941605838, "grad_norm": 0.32847222685813904, "learning_rate": 2.7553714097260752e-06, "loss": 0.3414, "step": 4056 }, { "epoch": 3.4169006176305445, "grad_norm": 0.34716543555259705, "learning_rate": 2.7527437977528563e-06, "loss": 0.3745, "step": 4057 }, { "epoch": 3.4177428411005053, "grad_norm": 0.3413195013999939, "learning_rate": 2.7501169632917507e-06, "loss": 0.3571, "step": 4058 }, { "epoch": 3.418585064570466, "grad_norm": 0.319532573223114, "learning_rate": 2.7474909072515994e-06, "loss": 0.3568, "step": 4059 }, { "epoch": 3.419427288040427, "grad_norm": 0.3598223924636841, "learning_rate": 2.7448656305409743e-06, "loss": 0.3729, "step": 4060 }, { "epoch": 3.4202695115103876, "grad_norm": 0.3487666845321655, "learning_rate": 2.7422411340681753e-06, "loss": 0.3756, "step": 4061 }, { "epoch": 3.421111734980348, "grad_norm": 0.3341454565525055, "learning_rate": 2.73961741874123e-06, "loss": 0.3432, "step": 4062 }, { "epoch": 3.4219539584503087, "grad_norm": 0.3133549690246582, "learning_rate": 2.736994485467902e-06, "loss": 0.3289, "step": 4063 }, { "epoch": 3.4227961819202695, "grad_norm": 0.33699503540992737, "learning_rate": 2.7343723351556823e-06, "loss": 0.3667, "step": 4064 }, { "epoch": 3.4236384053902302, "grad_norm": 0.3420717120170593, "learning_rate": 2.7317509687117883e-06, "loss": 0.3662, "step": 4065 }, { "epoch": 3.424480628860191, "grad_norm": 0.334624707698822, "learning_rate": 2.729130387043166e-06, "loss": 0.3612, "step": 4066 }, { "epoch": 3.4253228523301518, "grad_norm": 0.3380492925643921, "learning_rate": 2.7265105910564938e-06, "loss": 0.3664, "step": 4067 }, { "epoch": 3.426165075800112, "grad_norm": 0.3341931402683258, "learning_rate": 2.7238915816581778e-06, "loss": 0.3606, "step": 4068 }, { "epoch": 3.427007299270073, "grad_norm": 0.35635900497436523, "learning_rate": 2.721273359754349e-06, "loss": 0.365, "step": 4069 }, { "epoch": 3.4278495227400336, "grad_norm": 0.36625009775161743, "learning_rate": 2.7186559262508644e-06, "loss": 0.363, "step": 4070 }, { "epoch": 3.4286917462099944, "grad_norm": 0.34740814566612244, "learning_rate": 2.716039282053314e-06, "loss": 0.3582, "step": 4071 }, { "epoch": 3.429533969679955, "grad_norm": 0.32953277230262756, "learning_rate": 2.7134234280670126e-06, "loss": 0.3415, "step": 4072 }, { "epoch": 3.430376193149916, "grad_norm": 0.338922381401062, "learning_rate": 2.710808365197e-06, "loss": 0.3428, "step": 4073 }, { "epoch": 3.4312184166198767, "grad_norm": 0.36806726455688477, "learning_rate": 2.7081940943480413e-06, "loss": 0.3642, "step": 4074 }, { "epoch": 3.432060640089837, "grad_norm": 0.38384148478507996, "learning_rate": 2.7055806164246303e-06, "loss": 0.3493, "step": 4075 }, { "epoch": 3.432902863559798, "grad_norm": 0.3412560522556305, "learning_rate": 2.7029679323309882e-06, "loss": 0.3868, "step": 4076 }, { "epoch": 3.4337450870297586, "grad_norm": 0.3758077323436737, "learning_rate": 2.7003560429710547e-06, "loss": 0.3543, "step": 4077 }, { "epoch": 3.4345873104997193, "grad_norm": 0.31698790192604065, "learning_rate": 2.697744949248503e-06, "loss": 0.3651, "step": 4078 }, { "epoch": 3.43542953396968, "grad_norm": 0.3381580114364624, "learning_rate": 2.6951346520667222e-06, "loss": 0.3482, "step": 4079 }, { "epoch": 3.4362717574396404, "grad_norm": 0.3589038848876953, "learning_rate": 2.692525152328835e-06, "loss": 0.3605, "step": 4080 }, { "epoch": 3.437113980909601, "grad_norm": 0.3343205451965332, "learning_rate": 2.689916450937679e-06, "loss": 0.3618, "step": 4081 }, { "epoch": 3.437956204379562, "grad_norm": 0.3358353078365326, "learning_rate": 2.687308548795825e-06, "loss": 0.3645, "step": 4082 }, { "epoch": 3.4387984278495227, "grad_norm": 0.3352152407169342, "learning_rate": 2.684701446805558e-06, "loss": 0.343, "step": 4083 }, { "epoch": 3.4396406513194835, "grad_norm": 0.34096112847328186, "learning_rate": 2.682095145868894e-06, "loss": 0.3589, "step": 4084 }, { "epoch": 3.4404828747894443, "grad_norm": 0.356208473443985, "learning_rate": 2.679489646887564e-06, "loss": 0.3492, "step": 4085 }, { "epoch": 3.441325098259405, "grad_norm": 0.35380709171295166, "learning_rate": 2.6768849507630322e-06, "loss": 0.3455, "step": 4086 }, { "epoch": 3.4421673217293653, "grad_norm": 0.327147901058197, "learning_rate": 2.674281058396473e-06, "loss": 0.3682, "step": 4087 }, { "epoch": 3.443009545199326, "grad_norm": 0.3516271412372589, "learning_rate": 2.671677970688793e-06, "loss": 0.3495, "step": 4088 }, { "epoch": 3.443851768669287, "grad_norm": 0.34236788749694824, "learning_rate": 2.6690756885406123e-06, "loss": 0.3679, "step": 4089 }, { "epoch": 3.4446939921392477, "grad_norm": 0.33882007002830505, "learning_rate": 2.6664742128522768e-06, "loss": 0.3509, "step": 4090 }, { "epoch": 3.4455362156092084, "grad_norm": 0.32661867141723633, "learning_rate": 2.663873544523856e-06, "loss": 0.375, "step": 4091 }, { "epoch": 3.446378439079169, "grad_norm": 0.336135596036911, "learning_rate": 2.6612736844551333e-06, "loss": 0.3486, "step": 4092 }, { "epoch": 3.4472206625491295, "grad_norm": 0.355324387550354, "learning_rate": 2.6586746335456144e-06, "loss": 0.3584, "step": 4093 }, { "epoch": 3.4480628860190903, "grad_norm": 0.3183237910270691, "learning_rate": 2.656076392694528e-06, "loss": 0.3447, "step": 4094 }, { "epoch": 3.448905109489051, "grad_norm": 0.33511775732040405, "learning_rate": 2.653478962800823e-06, "loss": 0.3833, "step": 4095 }, { "epoch": 3.449747332959012, "grad_norm": 0.3309847414493561, "learning_rate": 2.6508823447631617e-06, "loss": 0.3433, "step": 4096 }, { "epoch": 3.4505895564289726, "grad_norm": 0.35080087184906006, "learning_rate": 2.6482865394799328e-06, "loss": 0.3445, "step": 4097 }, { "epoch": 3.4514317798989333, "grad_norm": 0.3411690890789032, "learning_rate": 2.6456915478492372e-06, "loss": 0.3459, "step": 4098 }, { "epoch": 3.4522740033688937, "grad_norm": 0.3299559950828552, "learning_rate": 2.643097370768901e-06, "loss": 0.3586, "step": 4099 }, { "epoch": 3.4531162268388544, "grad_norm": 0.3447459936141968, "learning_rate": 2.640504009136462e-06, "loss": 0.3675, "step": 4100 }, { "epoch": 3.453958450308815, "grad_norm": 0.32061296701431274, "learning_rate": 2.637911463849181e-06, "loss": 0.364, "step": 4101 }, { "epoch": 3.454800673778776, "grad_norm": 0.3258907198905945, "learning_rate": 2.635319735804032e-06, "loss": 0.3598, "step": 4102 }, { "epoch": 3.4556428972487367, "grad_norm": 0.32895973324775696, "learning_rate": 2.6327288258977123e-06, "loss": 0.3645, "step": 4103 }, { "epoch": 3.4564851207186975, "grad_norm": 0.33157727122306824, "learning_rate": 2.6301387350266274e-06, "loss": 0.3408, "step": 4104 }, { "epoch": 3.4573273441886583, "grad_norm": 0.3591616451740265, "learning_rate": 2.6275494640869082e-06, "loss": 0.3684, "step": 4105 }, { "epoch": 3.4581695676586186, "grad_norm": 0.31011444330215454, "learning_rate": 2.6249610139743947e-06, "loss": 0.3523, "step": 4106 }, { "epoch": 3.4590117911285794, "grad_norm": 0.35885125398635864, "learning_rate": 2.62237338558465e-06, "loss": 0.3823, "step": 4107 }, { "epoch": 3.45985401459854, "grad_norm": 0.3640524446964264, "learning_rate": 2.6197865798129462e-06, "loss": 0.3589, "step": 4108 }, { "epoch": 3.460696238068501, "grad_norm": 0.3379703164100647, "learning_rate": 2.617200597554276e-06, "loss": 0.3756, "step": 4109 }, { "epoch": 3.4615384615384617, "grad_norm": 0.31598788499832153, "learning_rate": 2.614615439703342e-06, "loss": 0.3677, "step": 4110 }, { "epoch": 3.462380685008422, "grad_norm": 0.4074869155883789, "learning_rate": 2.6120311071545683e-06, "loss": 0.3701, "step": 4111 }, { "epoch": 3.4632229084783828, "grad_norm": 0.3770851194858551, "learning_rate": 2.6094476008020854e-06, "loss": 0.329, "step": 4112 }, { "epoch": 3.4640651319483435, "grad_norm": 0.379361093044281, "learning_rate": 2.606864921539747e-06, "loss": 0.3855, "step": 4113 }, { "epoch": 3.4649073554183043, "grad_norm": 0.3306031823158264, "learning_rate": 2.6042830702611106e-06, "loss": 0.3523, "step": 4114 }, { "epoch": 3.465749578888265, "grad_norm": 0.3267592191696167, "learning_rate": 2.601702047859455e-06, "loss": 0.3458, "step": 4115 }, { "epoch": 3.466591802358226, "grad_norm": 0.33639127016067505, "learning_rate": 2.599121855227772e-06, "loss": 0.3586, "step": 4116 }, { "epoch": 3.4674340258281866, "grad_norm": 0.3386143743991852, "learning_rate": 2.5965424932587617e-06, "loss": 0.3498, "step": 4117 }, { "epoch": 3.468276249298147, "grad_norm": 0.3500303626060486, "learning_rate": 2.593963962844838e-06, "loss": 0.3507, "step": 4118 }, { "epoch": 3.4691184727681077, "grad_norm": 0.3567136526107788, "learning_rate": 2.5913862648781284e-06, "loss": 0.3372, "step": 4119 }, { "epoch": 3.4699606962380685, "grad_norm": 0.36097192764282227, "learning_rate": 2.588809400250476e-06, "loss": 0.3428, "step": 4120 }, { "epoch": 3.4708029197080292, "grad_norm": 0.324224591255188, "learning_rate": 2.586233369853428e-06, "loss": 0.3619, "step": 4121 }, { "epoch": 3.47164514317799, "grad_norm": 0.31880635023117065, "learning_rate": 2.5836581745782474e-06, "loss": 0.3717, "step": 4122 }, { "epoch": 3.4724873666479508, "grad_norm": 0.33335456252098083, "learning_rate": 2.581083815315907e-06, "loss": 0.3627, "step": 4123 }, { "epoch": 3.473329590117911, "grad_norm": 0.31850194931030273, "learning_rate": 2.578510292957094e-06, "loss": 0.358, "step": 4124 }, { "epoch": 3.474171813587872, "grad_norm": 0.32885199785232544, "learning_rate": 2.5759376083922006e-06, "loss": 0.3675, "step": 4125 }, { "epoch": 3.4750140370578326, "grad_norm": 0.3164002001285553, "learning_rate": 2.573365762511331e-06, "loss": 0.3564, "step": 4126 }, { "epoch": 3.4758562605277934, "grad_norm": 0.28986039757728577, "learning_rate": 2.570794756204299e-06, "loss": 0.3508, "step": 4127 }, { "epoch": 3.476698483997754, "grad_norm": 0.31401386857032776, "learning_rate": 2.5682245903606335e-06, "loss": 0.3656, "step": 4128 }, { "epoch": 3.477540707467715, "grad_norm": 0.3159821331501007, "learning_rate": 2.5656552658695645e-06, "loss": 0.3643, "step": 4129 }, { "epoch": 3.4783829309376753, "grad_norm": 0.33023855090141296, "learning_rate": 2.5630867836200324e-06, "loss": 0.3687, "step": 4130 }, { "epoch": 3.479225154407636, "grad_norm": 0.3189108669757843, "learning_rate": 2.5605191445006904e-06, "loss": 0.3644, "step": 4131 }, { "epoch": 3.480067377877597, "grad_norm": 0.3391682803630829, "learning_rate": 2.557952349399899e-06, "loss": 0.3509, "step": 4132 }, { "epoch": 3.4809096013475576, "grad_norm": 0.35380446910858154, "learning_rate": 2.555386399205723e-06, "loss": 0.3276, "step": 4133 }, { "epoch": 3.4817518248175183, "grad_norm": 0.30870428681373596, "learning_rate": 2.552821294805936e-06, "loss": 0.3541, "step": 4134 }, { "epoch": 3.482594048287479, "grad_norm": 0.33891648054122925, "learning_rate": 2.5502570370880227e-06, "loss": 0.3848, "step": 4135 }, { "epoch": 3.48343627175744, "grad_norm": 0.33283543586730957, "learning_rate": 2.547693626939173e-06, "loss": 0.3651, "step": 4136 }, { "epoch": 3.4842784952274, "grad_norm": 0.32089632749557495, "learning_rate": 2.5451310652462803e-06, "loss": 0.3692, "step": 4137 }, { "epoch": 3.485120718697361, "grad_norm": 0.32973116636276245, "learning_rate": 2.542569352895945e-06, "loss": 0.3634, "step": 4138 }, { "epoch": 3.4859629421673217, "grad_norm": 0.3439074456691742, "learning_rate": 2.540008490774483e-06, "loss": 0.3681, "step": 4139 }, { "epoch": 3.4868051656372825, "grad_norm": 0.31594017148017883, "learning_rate": 2.5374484797679034e-06, "loss": 0.3572, "step": 4140 }, { "epoch": 3.4876473891072433, "grad_norm": 0.33344870805740356, "learning_rate": 2.534889320761926e-06, "loss": 0.3714, "step": 4141 }, { "epoch": 3.4884896125772036, "grad_norm": 0.3502652049064636, "learning_rate": 2.532331014641977e-06, "loss": 0.3428, "step": 4142 }, { "epoch": 3.4893318360471643, "grad_norm": 0.3211594521999359, "learning_rate": 2.5297735622931873e-06, "loss": 0.3597, "step": 4143 }, { "epoch": 3.490174059517125, "grad_norm": 0.3066428005695343, "learning_rate": 2.527216964600391e-06, "loss": 0.3572, "step": 4144 }, { "epoch": 3.491016282987086, "grad_norm": 0.3135976493358612, "learning_rate": 2.524661222448126e-06, "loss": 0.375, "step": 4145 }, { "epoch": 3.4918585064570467, "grad_norm": 0.3422052562236786, "learning_rate": 2.522106336720635e-06, "loss": 0.3477, "step": 4146 }, { "epoch": 3.4927007299270074, "grad_norm": 0.3152230978012085, "learning_rate": 2.519552308301868e-06, "loss": 0.3593, "step": 4147 }, { "epoch": 3.493542953396968, "grad_norm": 0.3488667607307434, "learning_rate": 2.5169991380754734e-06, "loss": 0.3777, "step": 4148 }, { "epoch": 3.4943851768669285, "grad_norm": 0.31823256611824036, "learning_rate": 2.5144468269248006e-06, "loss": 0.3441, "step": 4149 }, { "epoch": 3.4952274003368893, "grad_norm": 0.3213183581829071, "learning_rate": 2.511895375732909e-06, "loss": 0.3579, "step": 4150 }, { "epoch": 3.49606962380685, "grad_norm": 0.3403041958808899, "learning_rate": 2.5093447853825586e-06, "loss": 0.36, "step": 4151 }, { "epoch": 3.496911847276811, "grad_norm": 0.3767974376678467, "learning_rate": 2.506795056756208e-06, "loss": 0.3688, "step": 4152 }, { "epoch": 3.4977540707467716, "grad_norm": 0.3247455656528473, "learning_rate": 2.504246190736017e-06, "loss": 0.3598, "step": 4153 }, { "epoch": 3.4985962942167323, "grad_norm": 0.3373487889766693, "learning_rate": 2.5016981882038528e-06, "loss": 0.354, "step": 4154 }, { "epoch": 3.499438517686693, "grad_norm": 0.34572064876556396, "learning_rate": 2.499151050041281e-06, "loss": 0.35, "step": 4155 }, { "epoch": 3.5002807411566534, "grad_norm": 0.383696049451828, "learning_rate": 2.496604777129567e-06, "loss": 0.3532, "step": 4156 }, { "epoch": 3.501122964626614, "grad_norm": 0.41186851263046265, "learning_rate": 2.4940593703496727e-06, "loss": 0.3685, "step": 4157 }, { "epoch": 3.501965188096575, "grad_norm": 0.31322145462036133, "learning_rate": 2.4915148305822737e-06, "loss": 0.3546, "step": 4158 }, { "epoch": 3.5028074115665357, "grad_norm": 0.3546171486377716, "learning_rate": 2.4889711587077333e-06, "loss": 0.3701, "step": 4159 }, { "epoch": 3.5036496350364965, "grad_norm": 0.38450533151626587, "learning_rate": 2.4864283556061182e-06, "loss": 0.3538, "step": 4160 }, { "epoch": 3.504491858506457, "grad_norm": 0.3794398903846741, "learning_rate": 2.48388642215719e-06, "loss": 0.3436, "step": 4161 }, { "epoch": 3.5053340819764176, "grad_norm": 0.3441806733608246, "learning_rate": 2.481345359240423e-06, "loss": 0.3341, "step": 4162 }, { "epoch": 3.5061763054463784, "grad_norm": 0.3659519851207733, "learning_rate": 2.4788051677349765e-06, "loss": 0.3542, "step": 4163 }, { "epoch": 3.507018528916339, "grad_norm": 0.3196214437484741, "learning_rate": 2.476265848519712e-06, "loss": 0.3809, "step": 4164 }, { "epoch": 3.5078607523863, "grad_norm": 0.35345837473869324, "learning_rate": 2.4737274024731926e-06, "loss": 0.3388, "step": 4165 }, { "epoch": 3.5087029758562607, "grad_norm": 0.3783683776855469, "learning_rate": 2.4711898304736785e-06, "loss": 0.3813, "step": 4166 }, { "epoch": 3.5095451993262214, "grad_norm": 0.376730352640152, "learning_rate": 2.4686531333991244e-06, "loss": 0.3429, "step": 4167 }, { "epoch": 3.5103874227961818, "grad_norm": 0.35761433839797974, "learning_rate": 2.4661173121271813e-06, "loss": 0.3773, "step": 4168 }, { "epoch": 3.5112296462661425, "grad_norm": 0.3899518549442291, "learning_rate": 2.463582367535203e-06, "loss": 0.3679, "step": 4169 }, { "epoch": 3.5120718697361033, "grad_norm": 0.3535291850566864, "learning_rate": 2.461048300500238e-06, "loss": 0.3523, "step": 4170 }, { "epoch": 3.512914093206064, "grad_norm": 0.3955758213996887, "learning_rate": 2.4585151118990286e-06, "loss": 0.3944, "step": 4171 }, { "epoch": 3.513756316676025, "grad_norm": 0.368109792470932, "learning_rate": 2.4559828026080116e-06, "loss": 0.3435, "step": 4172 }, { "epoch": 3.514598540145985, "grad_norm": 0.32325494289398193, "learning_rate": 2.4534513735033265e-06, "loss": 0.3711, "step": 4173 }, { "epoch": 3.5154407636159464, "grad_norm": 0.30904823541641235, "learning_rate": 2.4509208254608047e-06, "loss": 0.3629, "step": 4174 }, { "epoch": 3.5162829870859067, "grad_norm": 0.33851927518844604, "learning_rate": 2.4483911593559706e-06, "loss": 0.3576, "step": 4175 }, { "epoch": 3.5171252105558675, "grad_norm": 0.3652513325214386, "learning_rate": 2.4458623760640442e-06, "loss": 0.3412, "step": 4176 }, { "epoch": 3.5179674340258282, "grad_norm": 0.3588097393512726, "learning_rate": 2.443334476459943e-06, "loss": 0.3423, "step": 4177 }, { "epoch": 3.518809657495789, "grad_norm": 0.3631252348423004, "learning_rate": 2.4408074614182774e-06, "loss": 0.3568, "step": 4178 }, { "epoch": 3.5196518809657498, "grad_norm": 0.3365021347999573, "learning_rate": 2.4382813318133513e-06, "loss": 0.3709, "step": 4179 }, { "epoch": 3.52049410443571, "grad_norm": 0.33208146691322327, "learning_rate": 2.4357560885191588e-06, "loss": 0.3688, "step": 4180 }, { "epoch": 3.521336327905671, "grad_norm": 0.3363385796546936, "learning_rate": 2.433231732409394e-06, "loss": 0.3855, "step": 4181 }, { "epoch": 3.5221785513756316, "grad_norm": 0.3584827780723572, "learning_rate": 2.4307082643574413e-06, "loss": 0.384, "step": 4182 }, { "epoch": 3.5230207748455924, "grad_norm": 0.35280972719192505, "learning_rate": 2.428185685236375e-06, "loss": 0.3296, "step": 4183 }, { "epoch": 3.523862998315553, "grad_norm": 0.36235520243644714, "learning_rate": 2.4256639959189675e-06, "loss": 0.3378, "step": 4184 }, { "epoch": 3.5247052217855135, "grad_norm": 0.3435809910297394, "learning_rate": 2.423143197277676e-06, "loss": 0.3551, "step": 4185 }, { "epoch": 3.5255474452554747, "grad_norm": 0.3685091733932495, "learning_rate": 2.420623290184657e-06, "loss": 0.3443, "step": 4186 }, { "epoch": 3.526389668725435, "grad_norm": 0.3846760392189026, "learning_rate": 2.4181042755117525e-06, "loss": 0.3563, "step": 4187 }, { "epoch": 3.527231892195396, "grad_norm": 0.3705516755580902, "learning_rate": 2.415586154130499e-06, "loss": 0.3667, "step": 4188 }, { "epoch": 3.5280741156653566, "grad_norm": 0.3318955898284912, "learning_rate": 2.4130689269121265e-06, "loss": 0.3564, "step": 4189 }, { "epoch": 3.5289163391353173, "grad_norm": 0.35930299758911133, "learning_rate": 2.4105525947275497e-06, "loss": 0.3512, "step": 4190 }, { "epoch": 3.529758562605278, "grad_norm": 0.3631415367126465, "learning_rate": 2.408037158447375e-06, "loss": 0.3499, "step": 4191 }, { "epoch": 3.5306007860752384, "grad_norm": 0.33755242824554443, "learning_rate": 2.4055226189419017e-06, "loss": 0.3522, "step": 4192 }, { "epoch": 3.531443009545199, "grad_norm": 0.3153926730155945, "learning_rate": 2.40300897708112e-06, "loss": 0.3426, "step": 4193 }, { "epoch": 3.53228523301516, "grad_norm": 0.34617164731025696, "learning_rate": 2.4004962337347036e-06, "loss": 0.3662, "step": 4194 }, { "epoch": 3.5331274564851207, "grad_norm": 0.345333069562912, "learning_rate": 2.3979843897720178e-06, "loss": 0.3513, "step": 4195 }, { "epoch": 3.5339696799550815, "grad_norm": 0.3547077775001526, "learning_rate": 2.3954734460621195e-06, "loss": 0.3722, "step": 4196 }, { "epoch": 3.5348119034250423, "grad_norm": 0.3371422290802002, "learning_rate": 2.3929634034737533e-06, "loss": 0.3461, "step": 4197 }, { "epoch": 3.535654126895003, "grad_norm": 0.3565349876880646, "learning_rate": 2.39045426287535e-06, "loss": 0.3824, "step": 4198 }, { "epoch": 3.5364963503649633, "grad_norm": 0.316873162984848, "learning_rate": 2.3879460251350255e-06, "loss": 0.357, "step": 4199 }, { "epoch": 3.537338573834924, "grad_norm": 0.3213164210319519, "learning_rate": 2.385438691120591e-06, "loss": 0.3691, "step": 4200 }, { "epoch": 3.538180797304885, "grad_norm": 0.33925601840019226, "learning_rate": 2.3829322616995426e-06, "loss": 0.3455, "step": 4201 }, { "epoch": 3.5390230207748457, "grad_norm": 0.3280434012413025, "learning_rate": 2.380426737739058e-06, "loss": 0.3469, "step": 4202 }, { "epoch": 3.5398652442448064, "grad_norm": 0.30479368567466736, "learning_rate": 2.377922120106008e-06, "loss": 0.3573, "step": 4203 }, { "epoch": 3.5407074677147667, "grad_norm": 0.3169383108615875, "learning_rate": 2.3754184096669448e-06, "loss": 0.3597, "step": 4204 }, { "epoch": 3.541549691184728, "grad_norm": 0.3660662770271301, "learning_rate": 2.3729156072881133e-06, "loss": 0.347, "step": 4205 }, { "epoch": 3.5423919146546883, "grad_norm": 0.34468621015548706, "learning_rate": 2.3704137138354357e-06, "loss": 0.336, "step": 4206 }, { "epoch": 3.543234138124649, "grad_norm": 0.3239629864692688, "learning_rate": 2.3679127301745286e-06, "loss": 0.3404, "step": 4207 }, { "epoch": 3.54407636159461, "grad_norm": 0.3347218632698059, "learning_rate": 2.3654126571706847e-06, "loss": 0.351, "step": 4208 }, { "epoch": 3.5449185850645706, "grad_norm": 0.3538128733634949, "learning_rate": 2.3629134956888917e-06, "loss": 0.3602, "step": 4209 }, { "epoch": 3.5457608085345313, "grad_norm": 0.3414236605167389, "learning_rate": 2.360415246593812e-06, "loss": 0.3554, "step": 4210 }, { "epoch": 3.5466030320044917, "grad_norm": 0.34543079137802124, "learning_rate": 2.3579179107498e-06, "loss": 0.3762, "step": 4211 }, { "epoch": 3.5474452554744524, "grad_norm": 0.34067174792289734, "learning_rate": 2.355421489020889e-06, "loss": 0.3644, "step": 4212 }, { "epoch": 3.548287478944413, "grad_norm": 0.3406452536582947, "learning_rate": 2.3529259822708002e-06, "loss": 0.3636, "step": 4213 }, { "epoch": 3.549129702414374, "grad_norm": 0.32081446051597595, "learning_rate": 2.3504313913629333e-06, "loss": 0.3666, "step": 4214 }, { "epoch": 3.5499719258843347, "grad_norm": 0.32635653018951416, "learning_rate": 2.347937717160377e-06, "loss": 0.3472, "step": 4215 }, { "epoch": 3.550814149354295, "grad_norm": 0.3293617367744446, "learning_rate": 2.345444960525898e-06, "loss": 0.3471, "step": 4216 }, { "epoch": 3.5516563728242563, "grad_norm": 0.3080567717552185, "learning_rate": 2.3429531223219486e-06, "loss": 0.3672, "step": 4217 }, { "epoch": 3.5524985962942166, "grad_norm": 0.33831316232681274, "learning_rate": 2.34046220341066e-06, "loss": 0.3771, "step": 4218 }, { "epoch": 3.5533408197641774, "grad_norm": 0.3346281945705414, "learning_rate": 2.3379722046538504e-06, "loss": 0.3682, "step": 4219 }, { "epoch": 3.554183043234138, "grad_norm": 0.32037025690078735, "learning_rate": 2.3354831269130133e-06, "loss": 0.3878, "step": 4220 }, { "epoch": 3.555025266704099, "grad_norm": 0.3295297622680664, "learning_rate": 2.3329949710493304e-06, "loss": 0.3677, "step": 4221 }, { "epoch": 3.5558674901740597, "grad_norm": 0.33413538336753845, "learning_rate": 2.330507737923658e-06, "loss": 0.3752, "step": 4222 }, { "epoch": 3.55670971364402, "grad_norm": 0.3400398790836334, "learning_rate": 2.3280214283965387e-06, "loss": 0.3402, "step": 4223 }, { "epoch": 3.5575519371139808, "grad_norm": 0.3216843903064728, "learning_rate": 2.3255360433281897e-06, "loss": 0.3753, "step": 4224 }, { "epoch": 3.5583941605839415, "grad_norm": 0.32190361618995667, "learning_rate": 2.323051583578514e-06, "loss": 0.3557, "step": 4225 }, { "epoch": 3.5592363840539023, "grad_norm": 0.3833969533443451, "learning_rate": 2.3205680500070924e-06, "loss": 0.3932, "step": 4226 }, { "epoch": 3.560078607523863, "grad_norm": 0.337167352437973, "learning_rate": 2.3180854434731853e-06, "loss": 0.3712, "step": 4227 }, { "epoch": 3.560920830993824, "grad_norm": 0.3195006251335144, "learning_rate": 2.315603764835729e-06, "loss": 0.3606, "step": 4228 }, { "epoch": 3.5617630544637846, "grad_norm": 0.3586292862892151, "learning_rate": 2.313123014953343e-06, "loss": 0.3649, "step": 4229 }, { "epoch": 3.562605277933745, "grad_norm": 0.35695773363113403, "learning_rate": 2.3106431946843265e-06, "loss": 0.3567, "step": 4230 }, { "epoch": 3.5634475014037057, "grad_norm": 0.3264645040035248, "learning_rate": 2.3081643048866536e-06, "loss": 0.3298, "step": 4231 }, { "epoch": 3.5642897248736665, "grad_norm": 0.34983348846435547, "learning_rate": 2.3056863464179756e-06, "loss": 0.3799, "step": 4232 }, { "epoch": 3.5651319483436272, "grad_norm": 0.34579721093177795, "learning_rate": 2.303209320135625e-06, "loss": 0.3677, "step": 4233 }, { "epoch": 3.565974171813588, "grad_norm": 0.33737221360206604, "learning_rate": 2.300733226896612e-06, "loss": 0.3696, "step": 4234 }, { "epoch": 3.5668163952835483, "grad_norm": 0.32288244366645813, "learning_rate": 2.2982580675576214e-06, "loss": 0.3618, "step": 4235 }, { "epoch": 3.5676586187535095, "grad_norm": 0.3444443941116333, "learning_rate": 2.295783842975014e-06, "loss": 0.3509, "step": 4236 }, { "epoch": 3.56850084222347, "grad_norm": 0.3193950355052948, "learning_rate": 2.2933105540048307e-06, "loss": 0.3666, "step": 4237 }, { "epoch": 3.5693430656934306, "grad_norm": 0.3169402778148651, "learning_rate": 2.2908382015027887e-06, "loss": 0.3717, "step": 4238 }, { "epoch": 3.5701852891633914, "grad_norm": 0.36697083711624146, "learning_rate": 2.288366786324276e-06, "loss": 0.3509, "step": 4239 }, { "epoch": 3.571027512633352, "grad_norm": 0.3353302478790283, "learning_rate": 2.2858963093243637e-06, "loss": 0.3688, "step": 4240 }, { "epoch": 3.571869736103313, "grad_norm": 0.33169856667518616, "learning_rate": 2.2834267713577908e-06, "loss": 0.3775, "step": 4241 }, { "epoch": 3.5727119595732733, "grad_norm": 0.31672993302345276, "learning_rate": 2.280958173278978e-06, "loss": 0.3675, "step": 4242 }, { "epoch": 3.573554183043234, "grad_norm": 0.35060515999794006, "learning_rate": 2.2784905159420157e-06, "loss": 0.3509, "step": 4243 }, { "epoch": 3.574396406513195, "grad_norm": 0.354946494102478, "learning_rate": 2.2760238002006717e-06, "loss": 0.3402, "step": 4244 }, { "epoch": 3.5752386299831556, "grad_norm": 0.3521469831466675, "learning_rate": 2.2735580269083896e-06, "loss": 0.3564, "step": 4245 }, { "epoch": 3.5760808534531163, "grad_norm": 0.34278154373168945, "learning_rate": 2.2710931969182833e-06, "loss": 0.3748, "step": 4246 }, { "epoch": 3.5769230769230766, "grad_norm": 0.3388248085975647, "learning_rate": 2.2686293110831387e-06, "loss": 0.3722, "step": 4247 }, { "epoch": 3.577765300393038, "grad_norm": 0.3300139904022217, "learning_rate": 2.266166370255421e-06, "loss": 0.3501, "step": 4248 }, { "epoch": 3.578607523862998, "grad_norm": 0.330220103263855, "learning_rate": 2.263704375287267e-06, "loss": 0.3424, "step": 4249 }, { "epoch": 3.579449747332959, "grad_norm": 0.35566189885139465, "learning_rate": 2.2612433270304824e-06, "loss": 0.3806, "step": 4250 }, { "epoch": 3.5802919708029197, "grad_norm": 0.34553954005241394, "learning_rate": 2.2587832263365467e-06, "loss": 0.3639, "step": 4251 }, { "epoch": 3.5811341942728805, "grad_norm": 0.33233484625816345, "learning_rate": 2.2563240740566138e-06, "loss": 0.3443, "step": 4252 }, { "epoch": 3.5819764177428413, "grad_norm": 0.31534507870674133, "learning_rate": 2.2538658710415102e-06, "loss": 0.3524, "step": 4253 }, { "epoch": 3.5828186412128016, "grad_norm": 0.37039825320243835, "learning_rate": 2.2514086181417305e-06, "loss": 0.3534, "step": 4254 }, { "epoch": 3.5836608646827623, "grad_norm": 0.32621845602989197, "learning_rate": 2.2489523162074394e-06, "loss": 0.3595, "step": 4255 }, { "epoch": 3.584503088152723, "grad_norm": 0.3215562403202057, "learning_rate": 2.2464969660884783e-06, "loss": 0.3416, "step": 4256 }, { "epoch": 3.585345311622684, "grad_norm": 0.3456661105155945, "learning_rate": 2.244042568634357e-06, "loss": 0.3565, "step": 4257 }, { "epoch": 3.5861875350926447, "grad_norm": 0.31173479557037354, "learning_rate": 2.2415891246942534e-06, "loss": 0.3524, "step": 4258 }, { "epoch": 3.5870297585626054, "grad_norm": 0.35013508796691895, "learning_rate": 2.239136635117016e-06, "loss": 0.366, "step": 4259 }, { "epoch": 3.587871982032566, "grad_norm": 0.3399290144443512, "learning_rate": 2.2366851007511643e-06, "loss": 0.3455, "step": 4260 }, { "epoch": 3.5887142055025265, "grad_norm": 0.3176720142364502, "learning_rate": 2.2342345224448907e-06, "loss": 0.3586, "step": 4261 }, { "epoch": 3.5895564289724873, "grad_norm": 0.3059355914592743, "learning_rate": 2.2317849010460508e-06, "loss": 0.3486, "step": 4262 }, { "epoch": 3.590398652442448, "grad_norm": 0.3350572884082794, "learning_rate": 2.229336237402167e-06, "loss": 0.358, "step": 4263 }, { "epoch": 3.591240875912409, "grad_norm": 0.361370325088501, "learning_rate": 2.226888532360444e-06, "loss": 0.3682, "step": 4264 }, { "epoch": 3.5920830993823696, "grad_norm": 0.36793506145477295, "learning_rate": 2.224441786767741e-06, "loss": 0.3488, "step": 4265 }, { "epoch": 3.59292532285233, "grad_norm": 0.31614458560943604, "learning_rate": 2.2219960014705898e-06, "loss": 0.3874, "step": 4266 }, { "epoch": 3.593767546322291, "grad_norm": 0.33482491970062256, "learning_rate": 2.219551177315188e-06, "loss": 0.3765, "step": 4267 }, { "epoch": 3.5946097697922514, "grad_norm": 0.3302411735057831, "learning_rate": 2.217107315147408e-06, "loss": 0.3714, "step": 4268 }, { "epoch": 3.595451993262212, "grad_norm": 0.4057002365589142, "learning_rate": 2.2146644158127827e-06, "loss": 0.3597, "step": 4269 }, { "epoch": 3.596294216732173, "grad_norm": 0.3448302447795868, "learning_rate": 2.21222248015651e-06, "loss": 0.3604, "step": 4270 }, { "epoch": 3.5971364402021337, "grad_norm": 0.33568713068962097, "learning_rate": 2.2097815090234604e-06, "loss": 0.3475, "step": 4271 }, { "epoch": 3.5979786636720945, "grad_norm": 0.3319399356842041, "learning_rate": 2.207341503258169e-06, "loss": 0.347, "step": 4272 }, { "epoch": 3.598820887142055, "grad_norm": 0.3903086185455322, "learning_rate": 2.2049024637048356e-06, "loss": 0.3462, "step": 4273 }, { "epoch": 3.5996631106120156, "grad_norm": 0.3182038366794586, "learning_rate": 2.202464391207323e-06, "loss": 0.3691, "step": 4274 }, { "epoch": 3.6005053340819764, "grad_norm": 0.38996756076812744, "learning_rate": 2.2000272866091647e-06, "loss": 0.3526, "step": 4275 }, { "epoch": 3.601347557551937, "grad_norm": 0.3317779004573822, "learning_rate": 2.1975911507535593e-06, "loss": 0.3739, "step": 4276 }, { "epoch": 3.602189781021898, "grad_norm": 0.31816425919532776, "learning_rate": 2.1951559844833655e-06, "loss": 0.3484, "step": 4277 }, { "epoch": 3.6030320044918582, "grad_norm": 0.33124062418937683, "learning_rate": 2.1927217886411074e-06, "loss": 0.3557, "step": 4278 }, { "epoch": 3.6038742279618194, "grad_norm": 0.3479458689689636, "learning_rate": 2.190288564068977e-06, "loss": 0.3636, "step": 4279 }, { "epoch": 3.6047164514317798, "grad_norm": 0.3206084668636322, "learning_rate": 2.18785631160883e-06, "loss": 0.3633, "step": 4280 }, { "epoch": 3.6055586749017405, "grad_norm": 0.34817495942115784, "learning_rate": 2.185425032102183e-06, "loss": 0.3566, "step": 4281 }, { "epoch": 3.6064008983717013, "grad_norm": 0.3455098569393158, "learning_rate": 2.1829947263902148e-06, "loss": 0.3399, "step": 4282 }, { "epoch": 3.607243121841662, "grad_norm": 0.31409749388694763, "learning_rate": 2.1805653953137705e-06, "loss": 0.3542, "step": 4283 }, { "epoch": 3.608085345311623, "grad_norm": 0.33304041624069214, "learning_rate": 2.1781370397133594e-06, "loss": 0.355, "step": 4284 }, { "epoch": 3.608927568781583, "grad_norm": 0.349978506565094, "learning_rate": 2.1757096604291493e-06, "loss": 0.3462, "step": 4285 }, { "epoch": 3.609769792251544, "grad_norm": 0.35174107551574707, "learning_rate": 2.17328325830097e-06, "loss": 0.3827, "step": 4286 }, { "epoch": 3.6106120157215047, "grad_norm": 0.353162944316864, "learning_rate": 2.170857834168317e-06, "loss": 0.3534, "step": 4287 }, { "epoch": 3.6114542391914655, "grad_norm": 0.3596218228340149, "learning_rate": 2.1684333888703457e-06, "loss": 0.3707, "step": 4288 }, { "epoch": 3.6122964626614262, "grad_norm": 0.362157940864563, "learning_rate": 2.1660099232458714e-06, "loss": 0.3555, "step": 4289 }, { "epoch": 3.613138686131387, "grad_norm": 0.3499021530151367, "learning_rate": 2.1635874381333716e-06, "loss": 0.3747, "step": 4290 }, { "epoch": 3.6139809096013478, "grad_norm": 0.37353524565696716, "learning_rate": 2.1611659343709873e-06, "loss": 0.3557, "step": 4291 }, { "epoch": 3.614823133071308, "grad_norm": 0.3528103232383728, "learning_rate": 2.1587454127965152e-06, "loss": 0.367, "step": 4292 }, { "epoch": 3.615665356541269, "grad_norm": 0.38172510266304016, "learning_rate": 2.1563258742474124e-06, "loss": 0.354, "step": 4293 }, { "epoch": 3.6165075800112296, "grad_norm": 0.3510737419128418, "learning_rate": 2.1539073195608e-06, "loss": 0.3622, "step": 4294 }, { "epoch": 3.6173498034811904, "grad_norm": 0.3482060432434082, "learning_rate": 2.1514897495734583e-06, "loss": 0.3635, "step": 4295 }, { "epoch": 3.618192026951151, "grad_norm": 0.33367085456848145, "learning_rate": 2.1490731651218222e-06, "loss": 0.3572, "step": 4296 }, { "epoch": 3.6190342504211115, "grad_norm": 0.3386695086956024, "learning_rate": 2.1466575670419875e-06, "loss": 0.3459, "step": 4297 }, { "epoch": 3.6198764738910727, "grad_norm": 0.3380354940891266, "learning_rate": 2.1442429561697116e-06, "loss": 0.3513, "step": 4298 }, { "epoch": 3.620718697361033, "grad_norm": 0.3373531401157379, "learning_rate": 2.14182933334041e-06, "loss": 0.3585, "step": 4299 }, { "epoch": 3.621560920830994, "grad_norm": 0.34339073300361633, "learning_rate": 2.139416699389153e-06, "loss": 0.3653, "step": 4300 }, { "epoch": 3.6224031443009546, "grad_norm": 0.3570534288883209, "learning_rate": 2.137005055150669e-06, "loss": 0.3497, "step": 4301 }, { "epoch": 3.6232453677709153, "grad_norm": 0.3182597756385803, "learning_rate": 2.1345944014593474e-06, "loss": 0.3651, "step": 4302 }, { "epoch": 3.624087591240876, "grad_norm": 0.3396379053592682, "learning_rate": 2.132184739149234e-06, "loss": 0.3988, "step": 4303 }, { "epoch": 3.6249298147108364, "grad_norm": 0.3426070213317871, "learning_rate": 2.1297760690540304e-06, "loss": 0.3688, "step": 4304 }, { "epoch": 3.625772038180797, "grad_norm": 0.33090734481811523, "learning_rate": 2.127368392007092e-06, "loss": 0.3461, "step": 4305 }, { "epoch": 3.626614261650758, "grad_norm": 0.32062530517578125, "learning_rate": 2.1249617088414357e-06, "loss": 0.3525, "step": 4306 }, { "epoch": 3.6274564851207187, "grad_norm": 0.3516179919242859, "learning_rate": 2.122556020389735e-06, "loss": 0.3745, "step": 4307 }, { "epoch": 3.6282987085906795, "grad_norm": 0.33987957239151, "learning_rate": 2.1201513274843133e-06, "loss": 0.3672, "step": 4308 }, { "epoch": 3.62914093206064, "grad_norm": 0.30638623237609863, "learning_rate": 2.1177476309571564e-06, "loss": 0.343, "step": 4309 }, { "epoch": 3.629983155530601, "grad_norm": 0.3406371474266052, "learning_rate": 2.115344931639898e-06, "loss": 0.3685, "step": 4310 }, { "epoch": 3.6308253790005613, "grad_norm": 0.3331175148487091, "learning_rate": 2.1129432303638354e-06, "loss": 0.3505, "step": 4311 }, { "epoch": 3.631667602470522, "grad_norm": 0.31972333788871765, "learning_rate": 2.1105425279599125e-06, "loss": 0.3591, "step": 4312 }, { "epoch": 3.632509825940483, "grad_norm": 0.3316403031349182, "learning_rate": 2.1081428252587334e-06, "loss": 0.3633, "step": 4313 }, { "epoch": 3.6333520494104437, "grad_norm": 0.32091856002807617, "learning_rate": 2.1057441230905517e-06, "loss": 0.3533, "step": 4314 }, { "epoch": 3.6341942728804044, "grad_norm": 0.3374306261539459, "learning_rate": 2.1033464222852806e-06, "loss": 0.3649, "step": 4315 }, { "epoch": 3.6350364963503647, "grad_norm": 0.334473580121994, "learning_rate": 2.1009497236724795e-06, "loss": 0.3672, "step": 4316 }, { "epoch": 3.6358787198203255, "grad_norm": 0.33112913370132446, "learning_rate": 2.098554028081369e-06, "loss": 0.3532, "step": 4317 }, { "epoch": 3.6367209432902863, "grad_norm": 0.31698158383369446, "learning_rate": 2.0961593363408154e-06, "loss": 0.3586, "step": 4318 }, { "epoch": 3.637563166760247, "grad_norm": 0.3299761712551117, "learning_rate": 2.093765649279344e-06, "loss": 0.3461, "step": 4319 }, { "epoch": 3.638405390230208, "grad_norm": 0.3536585867404938, "learning_rate": 2.091372967725126e-06, "loss": 0.3461, "step": 4320 }, { "epoch": 3.6392476137001686, "grad_norm": 0.3062922954559326, "learning_rate": 2.0889812925059922e-06, "loss": 0.3568, "step": 4321 }, { "epoch": 3.6400898371701293, "grad_norm": 0.3213004171848297, "learning_rate": 2.0865906244494166e-06, "loss": 0.3609, "step": 4322 }, { "epoch": 3.6409320606400897, "grad_norm": 0.30256277322769165, "learning_rate": 2.084200964382534e-06, "loss": 0.3433, "step": 4323 }, { "epoch": 3.6417742841100504, "grad_norm": 0.32316258549690247, "learning_rate": 2.081812313132122e-06, "loss": 0.3544, "step": 4324 }, { "epoch": 3.642616507580011, "grad_norm": 0.32566460967063904, "learning_rate": 2.0794246715246164e-06, "loss": 0.3627, "step": 4325 }, { "epoch": 3.643458731049972, "grad_norm": 0.30945396423339844, "learning_rate": 2.0770380403860968e-06, "loss": 0.3301, "step": 4326 }, { "epoch": 3.6443009545199327, "grad_norm": 0.328647643327713, "learning_rate": 2.0746524205422997e-06, "loss": 0.3419, "step": 4327 }, { "epoch": 3.645143177989893, "grad_norm": 0.31244778633117676, "learning_rate": 2.072267812818606e-06, "loss": 0.3617, "step": 4328 }, { "epoch": 3.6459854014598543, "grad_norm": 0.34087708592414856, "learning_rate": 2.069884218040052e-06, "loss": 0.3763, "step": 4329 }, { "epoch": 3.6468276249298146, "grad_norm": 0.33927613496780396, "learning_rate": 2.0675016370313165e-06, "loss": 0.3771, "step": 4330 }, { "epoch": 3.6476698483997754, "grad_norm": 0.31336402893066406, "learning_rate": 2.065120070616735e-06, "loss": 0.3355, "step": 4331 }, { "epoch": 3.648512071869736, "grad_norm": 0.33036231994628906, "learning_rate": 2.06273951962029e-06, "loss": 0.3662, "step": 4332 }, { "epoch": 3.649354295339697, "grad_norm": 0.3261284828186035, "learning_rate": 2.060359984865608e-06, "loss": 0.3338, "step": 4333 }, { "epoch": 3.6501965188096577, "grad_norm": 0.32756465673446655, "learning_rate": 2.0579814671759675e-06, "loss": 0.3459, "step": 4334 }, { "epoch": 3.651038742279618, "grad_norm": 0.33571863174438477, "learning_rate": 2.0556039673742958e-06, "loss": 0.3895, "step": 4335 }, { "epoch": 3.6518809657495788, "grad_norm": 0.32896459102630615, "learning_rate": 2.053227486283168e-06, "loss": 0.3685, "step": 4336 }, { "epoch": 3.6527231892195395, "grad_norm": 0.32682520151138306, "learning_rate": 2.0508520247248054e-06, "loss": 0.3525, "step": 4337 }, { "epoch": 3.6535654126895003, "grad_norm": 0.3353217840194702, "learning_rate": 2.048477583521075e-06, "loss": 0.3686, "step": 4338 }, { "epoch": 3.654407636159461, "grad_norm": 0.34526652097702026, "learning_rate": 2.046104163493493e-06, "loss": 0.3652, "step": 4339 }, { "epoch": 3.655249859629422, "grad_norm": 0.32464897632598877, "learning_rate": 2.0437317654632253e-06, "loss": 0.3641, "step": 4340 }, { "epoch": 3.6560920830993826, "grad_norm": 0.3379760980606079, "learning_rate": 2.0413603902510777e-06, "loss": 0.3322, "step": 4341 }, { "epoch": 3.656934306569343, "grad_norm": 0.32673555612564087, "learning_rate": 2.0389900386775073e-06, "loss": 0.3726, "step": 4342 }, { "epoch": 3.6577765300393037, "grad_norm": 0.2919549345970154, "learning_rate": 2.036620711562613e-06, "loss": 0.3521, "step": 4343 }, { "epoch": 3.6586187535092645, "grad_norm": 0.33143889904022217, "learning_rate": 2.0342524097261446e-06, "loss": 0.357, "step": 4344 }, { "epoch": 3.6594609769792252, "grad_norm": 0.3180175721645355, "learning_rate": 2.0318851339874894e-06, "loss": 0.3607, "step": 4345 }, { "epoch": 3.660303200449186, "grad_norm": 0.3016355633735657, "learning_rate": 2.0295188851656894e-06, "loss": 0.3531, "step": 4346 }, { "epoch": 3.6611454239191463, "grad_norm": 0.3550379276275635, "learning_rate": 2.0271536640794215e-06, "loss": 0.3388, "step": 4347 }, { "epoch": 3.661987647389107, "grad_norm": 0.3091687858104706, "learning_rate": 2.0247894715470153e-06, "loss": 0.3349, "step": 4348 }, { "epoch": 3.662829870859068, "grad_norm": 0.3548104763031006, "learning_rate": 2.0224263083864372e-06, "loss": 0.3757, "step": 4349 }, { "epoch": 3.6636720943290286, "grad_norm": 0.32736736536026, "learning_rate": 2.0200641754153033e-06, "loss": 0.3579, "step": 4350 }, { "epoch": 3.6645143177989894, "grad_norm": 0.3398660123348236, "learning_rate": 2.017703073450873e-06, "loss": 0.3907, "step": 4351 }, { "epoch": 3.66535654126895, "grad_norm": 0.32795754075050354, "learning_rate": 2.015343003310045e-06, "loss": 0.3606, "step": 4352 }, { "epoch": 3.666198764738911, "grad_norm": 0.32908207178115845, "learning_rate": 2.012983965809361e-06, "loss": 0.3765, "step": 4353 }, { "epoch": 3.6670409882088713, "grad_norm": 0.36095190048217773, "learning_rate": 2.01062596176501e-06, "loss": 0.3676, "step": 4354 }, { "epoch": 3.667883211678832, "grad_norm": 0.3802684545516968, "learning_rate": 2.0082689919928215e-06, "loss": 0.3705, "step": 4355 }, { "epoch": 3.668725435148793, "grad_norm": 0.3506524860858917, "learning_rate": 2.005913057308267e-06, "loss": 0.3604, "step": 4356 }, { "epoch": 3.6695676586187536, "grad_norm": 0.36697566509246826, "learning_rate": 2.0035581585264558e-06, "loss": 0.3561, "step": 4357 }, { "epoch": 3.6704098820887143, "grad_norm": 0.4115530550479889, "learning_rate": 2.0012042964621453e-06, "loss": 0.3658, "step": 4358 }, { "epoch": 3.6712521055586746, "grad_norm": 0.36406075954437256, "learning_rate": 1.998851471929733e-06, "loss": 0.3426, "step": 4359 }, { "epoch": 3.672094329028636, "grad_norm": 0.34920376539230347, "learning_rate": 1.996499685743254e-06, "loss": 0.3706, "step": 4360 }, { "epoch": 3.672936552498596, "grad_norm": 0.30285337567329407, "learning_rate": 1.9941489387163836e-06, "loss": 0.3588, "step": 4361 }, { "epoch": 3.673778775968557, "grad_norm": 0.3507077097892761, "learning_rate": 1.991799231662443e-06, "loss": 0.349, "step": 4362 }, { "epoch": 3.6746209994385177, "grad_norm": 0.38061612844467163, "learning_rate": 1.989450565394391e-06, "loss": 0.3668, "step": 4363 }, { "epoch": 3.6754632229084785, "grad_norm": 0.3671213686466217, "learning_rate": 1.987102940724825e-06, "loss": 0.3542, "step": 4364 }, { "epoch": 3.6763054463784393, "grad_norm": 0.3433200716972351, "learning_rate": 1.9847563584659807e-06, "loss": 0.3468, "step": 4365 }, { "epoch": 3.6771476698483996, "grad_norm": 0.3651235103607178, "learning_rate": 1.9824108194297378e-06, "loss": 0.3562, "step": 4366 }, { "epoch": 3.6779898933183603, "grad_norm": 0.346535325050354, "learning_rate": 1.980066324427613e-06, "loss": 0.3451, "step": 4367 }, { "epoch": 3.678832116788321, "grad_norm": 0.3499429225921631, "learning_rate": 1.977722874270761e-06, "loss": 0.3724, "step": 4368 }, { "epoch": 3.679674340258282, "grad_norm": 0.34666165709495544, "learning_rate": 1.9753804697699707e-06, "loss": 0.3796, "step": 4369 }, { "epoch": 3.6805165637282427, "grad_norm": 0.3335898220539093, "learning_rate": 1.9730391117356817e-06, "loss": 0.34, "step": 4370 }, { "epoch": 3.6813587871982034, "grad_norm": 0.32833972573280334, "learning_rate": 1.9706988009779604e-06, "loss": 0.3494, "step": 4371 }, { "epoch": 3.682201010668164, "grad_norm": 0.3341798186302185, "learning_rate": 1.9683595383065133e-06, "loss": 0.3611, "step": 4372 }, { "epoch": 3.6830432341381245, "grad_norm": 0.31329384446144104, "learning_rate": 1.9660213245306824e-06, "loss": 0.3317, "step": 4373 }, { "epoch": 3.6838854576080853, "grad_norm": 0.34783264994621277, "learning_rate": 1.9636841604594557e-06, "loss": 0.3594, "step": 4374 }, { "epoch": 3.684727681078046, "grad_norm": 0.29839226603507996, "learning_rate": 1.96134804690145e-06, "loss": 0.3697, "step": 4375 }, { "epoch": 3.685569904548007, "grad_norm": 0.32674142718315125, "learning_rate": 1.9590129846649174e-06, "loss": 0.3554, "step": 4376 }, { "epoch": 3.6864121280179676, "grad_norm": 0.33659419417381287, "learning_rate": 1.9566789745577513e-06, "loss": 0.3784, "step": 4377 }, { "epoch": 3.687254351487928, "grad_norm": 0.3173052668571472, "learning_rate": 1.9543460173874816e-06, "loss": 0.3466, "step": 4378 }, { "epoch": 3.6880965749578887, "grad_norm": 0.35424813628196716, "learning_rate": 1.9520141139612693e-06, "loss": 0.3689, "step": 4379 }, { "epoch": 3.6889387984278494, "grad_norm": 0.34807273745536804, "learning_rate": 1.9496832650859115e-06, "loss": 0.3834, "step": 4380 }, { "epoch": 3.68978102189781, "grad_norm": 0.32965368032455444, "learning_rate": 1.947353471567843e-06, "loss": 0.3787, "step": 4381 }, { "epoch": 3.690623245367771, "grad_norm": 0.32263290882110596, "learning_rate": 1.9450247342131344e-06, "loss": 0.3618, "step": 4382 }, { "epoch": 3.6914654688377317, "grad_norm": 0.3255171477794647, "learning_rate": 1.942697053827488e-06, "loss": 0.3449, "step": 4383 }, { "epoch": 3.6923076923076925, "grad_norm": 0.3486112952232361, "learning_rate": 1.940370431216239e-06, "loss": 0.3709, "step": 4384 }, { "epoch": 3.693149915777653, "grad_norm": 0.3049474060535431, "learning_rate": 1.9380448671843607e-06, "loss": 0.3612, "step": 4385 }, { "epoch": 3.6939921392476136, "grad_norm": 0.3266221582889557, "learning_rate": 1.9357203625364598e-06, "loss": 0.3422, "step": 4386 }, { "epoch": 3.6948343627175744, "grad_norm": 0.3470652103424072, "learning_rate": 1.933396918076773e-06, "loss": 0.3307, "step": 4387 }, { "epoch": 3.695676586187535, "grad_norm": 0.36286377906799316, "learning_rate": 1.9310745346091715e-06, "loss": 0.3495, "step": 4388 }, { "epoch": 3.696518809657496, "grad_norm": 0.34587645530700684, "learning_rate": 1.9287532129371617e-06, "loss": 0.3648, "step": 4389 }, { "epoch": 3.6973610331274562, "grad_norm": 0.33444949984550476, "learning_rate": 1.926432953863882e-06, "loss": 0.3676, "step": 4390 }, { "epoch": 3.6982032565974174, "grad_norm": 0.3318166434764862, "learning_rate": 1.9241137581921016e-06, "loss": 0.3658, "step": 4391 }, { "epoch": 3.6990454800673778, "grad_norm": 0.33822280168533325, "learning_rate": 1.9217956267242182e-06, "loss": 0.374, "step": 4392 }, { "epoch": 3.6998877035373385, "grad_norm": 0.3514258861541748, "learning_rate": 1.9194785602622733e-06, "loss": 0.3608, "step": 4393 }, { "epoch": 3.7007299270072993, "grad_norm": 0.3386603593826294, "learning_rate": 1.9171625596079273e-06, "loss": 0.363, "step": 4394 }, { "epoch": 3.70157215047726, "grad_norm": 0.36877238750457764, "learning_rate": 1.9148476255624766e-06, "loss": 0.3595, "step": 4395 }, { "epoch": 3.702414373947221, "grad_norm": 0.4091641306877136, "learning_rate": 1.9125337589268495e-06, "loss": 0.3692, "step": 4396 }, { "epoch": 3.703256597417181, "grad_norm": 0.34086862206459045, "learning_rate": 1.910220960501606e-06, "loss": 0.3636, "step": 4397 }, { "epoch": 3.704098820887142, "grad_norm": 0.3456082046031952, "learning_rate": 1.907909231086933e-06, "loss": 0.3564, "step": 4398 }, { "epoch": 3.7049410443571027, "grad_norm": 0.32376953959465027, "learning_rate": 1.9055985714826474e-06, "loss": 0.3461, "step": 4399 }, { "epoch": 3.7057832678270635, "grad_norm": 0.37122076749801636, "learning_rate": 1.9032889824882e-06, "loss": 0.3568, "step": 4400 }, { "epoch": 3.7066254912970242, "grad_norm": 0.31658515334129333, "learning_rate": 1.9009804649026703e-06, "loss": 0.3451, "step": 4401 }, { "epoch": 3.707467714766985, "grad_norm": 0.3201794922351837, "learning_rate": 1.8986730195247644e-06, "loss": 0.3526, "step": 4402 }, { "epoch": 3.7083099382369458, "grad_norm": 0.3488161861896515, "learning_rate": 1.8963666471528164e-06, "loss": 0.3515, "step": 4403 }, { "epoch": 3.709152161706906, "grad_norm": 0.3577723503112793, "learning_rate": 1.8940613485847943e-06, "loss": 0.3627, "step": 4404 }, { "epoch": 3.709994385176867, "grad_norm": 0.29486504197120667, "learning_rate": 1.8917571246182931e-06, "loss": 0.3767, "step": 4405 }, { "epoch": 3.7108366086468276, "grad_norm": 0.3476150929927826, "learning_rate": 1.8894539760505327e-06, "loss": 0.3577, "step": 4406 }, { "epoch": 3.7116788321167884, "grad_norm": 0.34388449788093567, "learning_rate": 1.887151903678362e-06, "loss": 0.3529, "step": 4407 }, { "epoch": 3.712521055586749, "grad_norm": 0.3377683162689209, "learning_rate": 1.8848509082982597e-06, "loss": 0.3591, "step": 4408 }, { "epoch": 3.7133632790567095, "grad_norm": 0.33659759163856506, "learning_rate": 1.8825509907063328e-06, "loss": 0.3651, "step": 4409 }, { "epoch": 3.7142055025266703, "grad_norm": 0.3695089817047119, "learning_rate": 1.880252151698312e-06, "loss": 0.3308, "step": 4410 }, { "epoch": 3.715047725996631, "grad_norm": 0.3499218225479126, "learning_rate": 1.8779543920695537e-06, "loss": 0.3517, "step": 4411 }, { "epoch": 3.715889949466592, "grad_norm": 0.35170915722846985, "learning_rate": 1.8756577126150448e-06, "loss": 0.3677, "step": 4412 }, { "epoch": 3.7167321729365526, "grad_norm": 0.3470503091812134, "learning_rate": 1.8733621141294e-06, "loss": 0.3523, "step": 4413 }, { "epoch": 3.7175743964065133, "grad_norm": 0.3154240846633911, "learning_rate": 1.8710675974068548e-06, "loss": 0.369, "step": 4414 }, { "epoch": 3.718416619876474, "grad_norm": 0.33963119983673096, "learning_rate": 1.8687741632412703e-06, "loss": 0.3791, "step": 4415 }, { "epoch": 3.7192588433464344, "grad_norm": 0.3564891517162323, "learning_rate": 1.8664818124261375e-06, "loss": 0.361, "step": 4416 }, { "epoch": 3.720101066816395, "grad_norm": 0.3469276428222656, "learning_rate": 1.8641905457545727e-06, "loss": 0.3784, "step": 4417 }, { "epoch": 3.720943290286356, "grad_norm": 0.3188013732433319, "learning_rate": 1.8619003640193106e-06, "loss": 0.3651, "step": 4418 }, { "epoch": 3.7217855137563167, "grad_norm": 0.3475021421909332, "learning_rate": 1.8596112680127188e-06, "loss": 0.3474, "step": 4419 }, { "epoch": 3.7226277372262775, "grad_norm": 0.34388962388038635, "learning_rate": 1.857323258526782e-06, "loss": 0.3969, "step": 4420 }, { "epoch": 3.723469960696238, "grad_norm": 0.33964723348617554, "learning_rate": 1.8550363363531148e-06, "loss": 0.3453, "step": 4421 }, { "epoch": 3.724312184166199, "grad_norm": 0.33038339018821716, "learning_rate": 1.8527505022829506e-06, "loss": 0.3547, "step": 4422 }, { "epoch": 3.7251544076361593, "grad_norm": 0.3368109464645386, "learning_rate": 1.8504657571071517e-06, "loss": 0.3557, "step": 4423 }, { "epoch": 3.72599663110612, "grad_norm": 0.3318595290184021, "learning_rate": 1.8481821016161971e-06, "loss": 0.3654, "step": 4424 }, { "epoch": 3.726838854576081, "grad_norm": 0.3437637984752655, "learning_rate": 1.8458995366001963e-06, "loss": 0.3862, "step": 4425 }, { "epoch": 3.7276810780460417, "grad_norm": 0.3218975365161896, "learning_rate": 1.8436180628488747e-06, "loss": 0.3484, "step": 4426 }, { "epoch": 3.7285233015160024, "grad_norm": 0.3185303211212158, "learning_rate": 1.8413376811515855e-06, "loss": 0.3577, "step": 4427 }, { "epoch": 3.7293655249859627, "grad_norm": 0.34011220932006836, "learning_rate": 1.8390583922972988e-06, "loss": 0.3745, "step": 4428 }, { "epoch": 3.7302077484559235, "grad_norm": 0.31760066747665405, "learning_rate": 1.836780197074613e-06, "loss": 0.3685, "step": 4429 }, { "epoch": 3.7310499719258843, "grad_norm": 0.31817638874053955, "learning_rate": 1.8345030962717409e-06, "loss": 0.3711, "step": 4430 }, { "epoch": 3.731892195395845, "grad_norm": 0.3519677519798279, "learning_rate": 1.8322270906765238e-06, "loss": 0.3355, "step": 4431 }, { "epoch": 3.732734418865806, "grad_norm": 0.32490453124046326, "learning_rate": 1.8299521810764171e-06, "loss": 0.3579, "step": 4432 }, { "epoch": 3.7335766423357666, "grad_norm": 0.3354076147079468, "learning_rate": 1.8276783682585043e-06, "loss": 0.3487, "step": 4433 }, { "epoch": 3.7344188658057273, "grad_norm": 0.3352147936820984, "learning_rate": 1.825405653009482e-06, "loss": 0.3713, "step": 4434 }, { "epoch": 3.7352610892756877, "grad_norm": 0.34520500898361206, "learning_rate": 1.8231340361156742e-06, "loss": 0.3599, "step": 4435 }, { "epoch": 3.7361033127456484, "grad_norm": 0.3182627260684967, "learning_rate": 1.8208635183630174e-06, "loss": 0.3558, "step": 4436 }, { "epoch": 3.736945536215609, "grad_norm": 0.3501364290714264, "learning_rate": 1.8185941005370744e-06, "loss": 0.3643, "step": 4437 }, { "epoch": 3.73778775968557, "grad_norm": 0.32007575035095215, "learning_rate": 1.8163257834230257e-06, "loss": 0.3375, "step": 4438 }, { "epoch": 3.7386299831555307, "grad_norm": 0.3282841444015503, "learning_rate": 1.8140585678056677e-06, "loss": 0.3351, "step": 4439 }, { "epoch": 3.739472206625491, "grad_norm": 0.3259499967098236, "learning_rate": 1.811792454469421e-06, "loss": 0.3519, "step": 4440 }, { "epoch": 3.740314430095452, "grad_norm": 0.33802756667137146, "learning_rate": 1.8095274441983186e-06, "loss": 0.3492, "step": 4441 }, { "epoch": 3.7411566535654126, "grad_norm": 0.3210373818874359, "learning_rate": 1.8072635377760177e-06, "loss": 0.3622, "step": 4442 }, { "epoch": 3.7419988770353734, "grad_norm": 0.32818150520324707, "learning_rate": 1.8050007359857895e-06, "loss": 0.3635, "step": 4443 }, { "epoch": 3.742841100505334, "grad_norm": 0.2965181767940521, "learning_rate": 1.8027390396105271e-06, "loss": 0.3765, "step": 4444 }, { "epoch": 3.743683323975295, "grad_norm": 0.33650845289230347, "learning_rate": 1.8004784494327354e-06, "loss": 0.3731, "step": 4445 }, { "epoch": 3.7445255474452557, "grad_norm": 0.34308135509490967, "learning_rate": 1.7982189662345428e-06, "loss": 0.336, "step": 4446 }, { "epoch": 3.745367770915216, "grad_norm": 0.31207868456840515, "learning_rate": 1.795960590797689e-06, "loss": 0.356, "step": 4447 }, { "epoch": 3.7462099943851768, "grad_norm": 0.3229331374168396, "learning_rate": 1.7937033239035361e-06, "loss": 0.3217, "step": 4448 }, { "epoch": 3.7470522178551375, "grad_norm": 0.34016740322113037, "learning_rate": 1.7914471663330562e-06, "loss": 0.3728, "step": 4449 }, { "epoch": 3.7478944413250983, "grad_norm": 0.3194488286972046, "learning_rate": 1.7891921188668454e-06, "loss": 0.3487, "step": 4450 }, { "epoch": 3.748736664795059, "grad_norm": 0.33509838581085205, "learning_rate": 1.7869381822851073e-06, "loss": 0.35, "step": 4451 }, { "epoch": 3.7495788882650194, "grad_norm": 0.33871808648109436, "learning_rate": 1.7846853573676686e-06, "loss": 0.3612, "step": 4452 }, { "epoch": 3.7504211117349806, "grad_norm": 0.33310428261756897, "learning_rate": 1.7824336448939655e-06, "loss": 0.3696, "step": 4453 }, { "epoch": 3.751263335204941, "grad_norm": 0.31736496090888977, "learning_rate": 1.7801830456430542e-06, "loss": 0.3613, "step": 4454 }, { "epoch": 3.7521055586749017, "grad_norm": 0.3323925733566284, "learning_rate": 1.7779335603936009e-06, "loss": 0.3615, "step": 4455 }, { "epoch": 3.7529477821448625, "grad_norm": 0.33774489164352417, "learning_rate": 1.7756851899238908e-06, "loss": 0.3334, "step": 4456 }, { "epoch": 3.7537900056148232, "grad_norm": 0.34044724702835083, "learning_rate": 1.7734379350118226e-06, "loss": 0.3251, "step": 4457 }, { "epoch": 3.754632229084784, "grad_norm": 0.34731730818748474, "learning_rate": 1.7711917964349063e-06, "loss": 0.3504, "step": 4458 }, { "epoch": 3.7554744525547443, "grad_norm": 0.3175167143344879, "learning_rate": 1.768946774970266e-06, "loss": 0.3627, "step": 4459 }, { "epoch": 3.756316676024705, "grad_norm": 0.32040295004844666, "learning_rate": 1.766702871394642e-06, "loss": 0.3583, "step": 4460 }, { "epoch": 3.757158899494666, "grad_norm": 0.32639825344085693, "learning_rate": 1.7644600864843886e-06, "loss": 0.3756, "step": 4461 }, { "epoch": 3.7580011229646266, "grad_norm": 0.33750471472740173, "learning_rate": 1.7622184210154692e-06, "loss": 0.3764, "step": 4462 }, { "epoch": 3.7588433464345874, "grad_norm": 0.3224203288555145, "learning_rate": 1.759977875763459e-06, "loss": 0.3673, "step": 4463 }, { "epoch": 3.759685569904548, "grad_norm": 0.3145717978477478, "learning_rate": 1.757738451503551e-06, "loss": 0.3475, "step": 4464 }, { "epoch": 3.760527793374509, "grad_norm": 0.32422688603401184, "learning_rate": 1.755500149010549e-06, "loss": 0.3566, "step": 4465 }, { "epoch": 3.7613700168444693, "grad_norm": 0.33373329043388367, "learning_rate": 1.7532629690588654e-06, "loss": 0.3556, "step": 4466 }, { "epoch": 3.76221224031443, "grad_norm": 0.39312058687210083, "learning_rate": 1.7510269124225242e-06, "loss": 0.38, "step": 4467 }, { "epoch": 3.763054463784391, "grad_norm": 0.32660338282585144, "learning_rate": 1.7487919798751645e-06, "loss": 0.3531, "step": 4468 }, { "epoch": 3.7638966872543516, "grad_norm": 0.33717241883277893, "learning_rate": 1.7465581721900372e-06, "loss": 0.3795, "step": 4469 }, { "epoch": 3.7647389107243123, "grad_norm": 0.31206828355789185, "learning_rate": 1.7443254901399986e-06, "loss": 0.3644, "step": 4470 }, { "epoch": 3.7655811341942727, "grad_norm": 0.3573628067970276, "learning_rate": 1.7420939344975173e-06, "loss": 0.3728, "step": 4471 }, { "epoch": 3.7664233576642334, "grad_norm": 0.3391473591327667, "learning_rate": 1.7398635060346746e-06, "loss": 0.352, "step": 4472 }, { "epoch": 3.767265581134194, "grad_norm": 0.3387961983680725, "learning_rate": 1.7376342055231631e-06, "loss": 0.3555, "step": 4473 }, { "epoch": 3.768107804604155, "grad_norm": 0.3593043386936188, "learning_rate": 1.7354060337342798e-06, "loss": 0.3501, "step": 4474 }, { "epoch": 3.7689500280741157, "grad_norm": 0.3592545986175537, "learning_rate": 1.7331789914389324e-06, "loss": 0.3575, "step": 4475 }, { "epoch": 3.7697922515440765, "grad_norm": 0.33347952365875244, "learning_rate": 1.7309530794076418e-06, "loss": 0.3313, "step": 4476 }, { "epoch": 3.7706344750140373, "grad_norm": 0.3394003212451935, "learning_rate": 1.7287282984105363e-06, "loss": 0.3748, "step": 4477 }, { "epoch": 3.7714766984839976, "grad_norm": 0.36696523427963257, "learning_rate": 1.7265046492173505e-06, "loss": 0.3446, "step": 4478 }, { "epoch": 3.7723189219539583, "grad_norm": 0.326893150806427, "learning_rate": 1.7242821325974258e-06, "loss": 0.3384, "step": 4479 }, { "epoch": 3.773161145423919, "grad_norm": 0.3508204519748688, "learning_rate": 1.722060749319721e-06, "loss": 0.3582, "step": 4480 }, { "epoch": 3.77400336889388, "grad_norm": 0.3123379349708557, "learning_rate": 1.719840500152794e-06, "loss": 0.3644, "step": 4481 }, { "epoch": 3.7748455923638407, "grad_norm": 0.31705784797668457, "learning_rate": 1.7176213858648105e-06, "loss": 0.3626, "step": 4482 }, { "epoch": 3.775687815833801, "grad_norm": 0.3385692536830902, "learning_rate": 1.7154034072235486e-06, "loss": 0.3953, "step": 4483 }, { "epoch": 3.776530039303762, "grad_norm": 0.31377702951431274, "learning_rate": 1.7131865649963919e-06, "loss": 0.3447, "step": 4484 }, { "epoch": 3.7773722627737225, "grad_norm": 0.3516111969947815, "learning_rate": 1.7109708599503278e-06, "loss": 0.3502, "step": 4485 }, { "epoch": 3.7782144862436833, "grad_norm": 0.35180068016052246, "learning_rate": 1.7087562928519514e-06, "loss": 0.3421, "step": 4486 }, { "epoch": 3.779056709713644, "grad_norm": 0.3479491174221039, "learning_rate": 1.7065428644674664e-06, "loss": 0.369, "step": 4487 }, { "epoch": 3.779898933183605, "grad_norm": 0.3344051241874695, "learning_rate": 1.7043305755626827e-06, "loss": 0.3416, "step": 4488 }, { "epoch": 3.7807411566535656, "grad_norm": 0.33491194248199463, "learning_rate": 1.7021194269030122e-06, "loss": 0.3743, "step": 4489 }, { "epoch": 3.781583380123526, "grad_norm": 0.3470025360584259, "learning_rate": 1.6999094192534731e-06, "loss": 0.3566, "step": 4490 }, { "epoch": 3.7824256035934867, "grad_norm": 0.3270544111728668, "learning_rate": 1.6977005533786918e-06, "loss": 0.3461, "step": 4491 }, { "epoch": 3.7832678270634474, "grad_norm": 0.31279170513153076, "learning_rate": 1.6954928300429003e-06, "loss": 0.3691, "step": 4492 }, { "epoch": 3.784110050533408, "grad_norm": 0.30023446679115295, "learning_rate": 1.69328625000993e-06, "loss": 0.3381, "step": 4493 }, { "epoch": 3.784952274003369, "grad_norm": 0.3297571539878845, "learning_rate": 1.6910808140432195e-06, "loss": 0.3614, "step": 4494 }, { "epoch": 3.7857944974733297, "grad_norm": 0.3319167196750641, "learning_rate": 1.6888765229058124e-06, "loss": 0.3673, "step": 4495 }, { "epoch": 3.7866367209432905, "grad_norm": 0.323946088552475, "learning_rate": 1.6866733773603576e-06, "loss": 0.3802, "step": 4496 }, { "epoch": 3.787478944413251, "grad_norm": 0.3483160138130188, "learning_rate": 1.6844713781691047e-06, "loss": 0.3539, "step": 4497 }, { "epoch": 3.7883211678832116, "grad_norm": 0.297738641500473, "learning_rate": 1.6822705260939038e-06, "loss": 0.3551, "step": 4498 }, { "epoch": 3.7891633913531724, "grad_norm": 0.32279151678085327, "learning_rate": 1.680070821896218e-06, "loss": 0.369, "step": 4499 }, { "epoch": 3.790005614823133, "grad_norm": 0.3148172199726105, "learning_rate": 1.6778722663371056e-06, "loss": 0.3337, "step": 4500 }, { "epoch": 3.790847838293094, "grad_norm": 0.32915425300598145, "learning_rate": 1.6756748601772272e-06, "loss": 0.3677, "step": 4501 }, { "epoch": 3.7916900617630542, "grad_norm": 0.3173002302646637, "learning_rate": 1.673478604176846e-06, "loss": 0.3393, "step": 4502 }, { "epoch": 3.792532285233015, "grad_norm": 0.3196558952331543, "learning_rate": 1.6712834990958337e-06, "loss": 0.3591, "step": 4503 }, { "epoch": 3.7933745087029758, "grad_norm": 0.30798959732055664, "learning_rate": 1.6690895456936578e-06, "loss": 0.3503, "step": 4504 }, { "epoch": 3.7942167321729365, "grad_norm": 0.3347166180610657, "learning_rate": 1.6668967447293855e-06, "loss": 0.3637, "step": 4505 }, { "epoch": 3.7950589556428973, "grad_norm": 0.30939239263534546, "learning_rate": 1.6647050969616907e-06, "loss": 0.347, "step": 4506 }, { "epoch": 3.795901179112858, "grad_norm": 0.32504379749298096, "learning_rate": 1.662514603148847e-06, "loss": 0.354, "step": 4507 }, { "epoch": 3.796743402582819, "grad_norm": 0.3412012457847595, "learning_rate": 1.6603252640487262e-06, "loss": 0.3785, "step": 4508 }, { "epoch": 3.797585626052779, "grad_norm": 0.30717769265174866, "learning_rate": 1.6581370804188007e-06, "loss": 0.3467, "step": 4509 }, { "epoch": 3.79842784952274, "grad_norm": 0.325661301612854, "learning_rate": 1.655950053016146e-06, "loss": 0.3355, "step": 4510 }, { "epoch": 3.7992700729927007, "grad_norm": 0.32622477412223816, "learning_rate": 1.6537641825974376e-06, "loss": 0.3721, "step": 4511 }, { "epoch": 3.8001122964626615, "grad_norm": 0.298071950674057, "learning_rate": 1.6515794699189481e-06, "loss": 0.3432, "step": 4512 }, { "epoch": 3.8009545199326222, "grad_norm": 0.3299024999141693, "learning_rate": 1.6493959157365487e-06, "loss": 0.382, "step": 4513 }, { "epoch": 3.8017967434025826, "grad_norm": 0.34122133255004883, "learning_rate": 1.6472135208057128e-06, "loss": 0.3789, "step": 4514 }, { "epoch": 3.8026389668725438, "grad_norm": 0.3579752445220947, "learning_rate": 1.6450322858815142e-06, "loss": 0.3512, "step": 4515 }, { "epoch": 3.803481190342504, "grad_norm": 0.31512030959129333, "learning_rate": 1.6428522117186202e-06, "loss": 0.3512, "step": 4516 }, { "epoch": 3.804323413812465, "grad_norm": 0.35222145915031433, "learning_rate": 1.6406732990712982e-06, "loss": 0.3638, "step": 4517 }, { "epoch": 3.8051656372824256, "grad_norm": 0.33476096391677856, "learning_rate": 1.6384955486934157e-06, "loss": 0.3602, "step": 4518 }, { "epoch": 3.8060078607523864, "grad_norm": 0.33606821298599243, "learning_rate": 1.636318961338439e-06, "loss": 0.3556, "step": 4519 }, { "epoch": 3.806850084222347, "grad_norm": 0.3181990683078766, "learning_rate": 1.6341435377594284e-06, "loss": 0.3672, "step": 4520 }, { "epoch": 3.8076923076923075, "grad_norm": 0.31861162185668945, "learning_rate": 1.6319692787090413e-06, "loss": 0.3393, "step": 4521 }, { "epoch": 3.8085345311622683, "grad_norm": 0.32069650292396545, "learning_rate": 1.6297961849395355e-06, "loss": 0.3615, "step": 4522 }, { "epoch": 3.809376754632229, "grad_norm": 0.34070202708244324, "learning_rate": 1.6276242572027667e-06, "loss": 0.3652, "step": 4523 }, { "epoch": 3.81021897810219, "grad_norm": 0.32550010085105896, "learning_rate": 1.62545349625018e-06, "loss": 0.3596, "step": 4524 }, { "epoch": 3.8110612015721506, "grad_norm": 0.3104510009288788, "learning_rate": 1.6232839028328261e-06, "loss": 0.4025, "step": 4525 }, { "epoch": 3.8119034250421113, "grad_norm": 0.31529709696769714, "learning_rate": 1.6211154777013432e-06, "loss": 0.3521, "step": 4526 }, { "epoch": 3.812745648512072, "grad_norm": 0.3094523549079895, "learning_rate": 1.6189482216059726e-06, "loss": 0.3738, "step": 4527 }, { "epoch": 3.8135878719820324, "grad_norm": 0.33841222524642944, "learning_rate": 1.6167821352965441e-06, "loss": 0.365, "step": 4528 }, { "epoch": 3.814430095451993, "grad_norm": 0.31779974699020386, "learning_rate": 1.6146172195224902e-06, "loss": 0.3742, "step": 4529 }, { "epoch": 3.815272318921954, "grad_norm": 0.31637316942214966, "learning_rate": 1.6124534750328313e-06, "loss": 0.3655, "step": 4530 }, { "epoch": 3.8161145423919147, "grad_norm": 0.3111618757247925, "learning_rate": 1.6102909025761888e-06, "loss": 0.3542, "step": 4531 }, { "epoch": 3.8169567658618755, "grad_norm": 0.3328016996383667, "learning_rate": 1.6081295029007727e-06, "loss": 0.3659, "step": 4532 }, { "epoch": 3.817798989331836, "grad_norm": 0.33168935775756836, "learning_rate": 1.6059692767543933e-06, "loss": 0.368, "step": 4533 }, { "epoch": 3.8186412128017966, "grad_norm": 0.32181909680366516, "learning_rate": 1.6038102248844494e-06, "loss": 0.3477, "step": 4534 }, { "epoch": 3.8194834362717573, "grad_norm": 0.3335615396499634, "learning_rate": 1.6016523480379382e-06, "loss": 0.3616, "step": 4535 }, { "epoch": 3.820325659741718, "grad_norm": 0.34055304527282715, "learning_rate": 1.5994956469614448e-06, "loss": 0.352, "step": 4536 }, { "epoch": 3.821167883211679, "grad_norm": 0.3039279282093048, "learning_rate": 1.5973401224011548e-06, "loss": 0.3693, "step": 4537 }, { "epoch": 3.8220101066816397, "grad_norm": 0.3261438310146332, "learning_rate": 1.5951857751028389e-06, "loss": 0.3484, "step": 4538 }, { "epoch": 3.8228523301516004, "grad_norm": 0.3067868947982788, "learning_rate": 1.5930326058118671e-06, "loss": 0.368, "step": 4539 }, { "epoch": 3.8236945536215607, "grad_norm": 0.3224530518054962, "learning_rate": 1.5908806152731975e-06, "loss": 0.3449, "step": 4540 }, { "epoch": 3.8245367770915215, "grad_norm": 0.32402196526527405, "learning_rate": 1.588729804231382e-06, "loss": 0.335, "step": 4541 }, { "epoch": 3.8253790005614823, "grad_norm": 0.33597391843795776, "learning_rate": 1.586580173430567e-06, "loss": 0.3633, "step": 4542 }, { "epoch": 3.826221224031443, "grad_norm": 0.3475346267223358, "learning_rate": 1.5844317236144835e-06, "loss": 0.3657, "step": 4543 }, { "epoch": 3.827063447501404, "grad_norm": 0.3203149735927582, "learning_rate": 1.5822844555264622e-06, "loss": 0.3491, "step": 4544 }, { "epoch": 3.827905670971364, "grad_norm": 0.31834760308265686, "learning_rate": 1.580138369909418e-06, "loss": 0.36, "step": 4545 }, { "epoch": 3.8287478944413254, "grad_norm": 0.3549058139324188, "learning_rate": 1.5779934675058628e-06, "loss": 0.3524, "step": 4546 }, { "epoch": 3.8295901179112857, "grad_norm": 0.3127344250679016, "learning_rate": 1.5758497490578928e-06, "loss": 0.3437, "step": 4547 }, { "epoch": 3.8304323413812464, "grad_norm": 0.33223921060562134, "learning_rate": 1.5737072153072007e-06, "loss": 0.3444, "step": 4548 }, { "epoch": 3.831274564851207, "grad_norm": 0.34163904190063477, "learning_rate": 1.5715658669950634e-06, "loss": 0.3325, "step": 4549 }, { "epoch": 3.832116788321168, "grad_norm": 0.3268188238143921, "learning_rate": 1.5694257048623545e-06, "loss": 0.3519, "step": 4550 }, { "epoch": 3.8329590117911287, "grad_norm": 0.3205500543117523, "learning_rate": 1.5672867296495297e-06, "loss": 0.348, "step": 4551 }, { "epoch": 3.833801235261089, "grad_norm": 0.3147057890892029, "learning_rate": 1.5651489420966409e-06, "loss": 0.3555, "step": 4552 }, { "epoch": 3.83464345873105, "grad_norm": 0.3349163830280304, "learning_rate": 1.563012342943323e-06, "loss": 0.3675, "step": 4553 }, { "epoch": 3.8354856822010106, "grad_norm": 0.35055750608444214, "learning_rate": 1.5608769329288054e-06, "loss": 0.3302, "step": 4554 }, { "epoch": 3.8363279056709714, "grad_norm": 0.3698115944862366, "learning_rate": 1.5587427127919008e-06, "loss": 0.3821, "step": 4555 }, { "epoch": 3.837170129140932, "grad_norm": 0.3344760835170746, "learning_rate": 1.5566096832710153e-06, "loss": 0.3623, "step": 4556 }, { "epoch": 3.838012352610893, "grad_norm": 0.31321650743484497, "learning_rate": 1.5544778451041375e-06, "loss": 0.3751, "step": 4557 }, { "epoch": 3.8388545760808537, "grad_norm": 0.31887924671173096, "learning_rate": 1.5523471990288507e-06, "loss": 0.3599, "step": 4558 }, { "epoch": 3.839696799550814, "grad_norm": 0.3395664095878601, "learning_rate": 1.550217745782318e-06, "loss": 0.3486, "step": 4559 }, { "epoch": 3.8405390230207748, "grad_norm": 0.3565104305744171, "learning_rate": 1.5480894861012973e-06, "loss": 0.365, "step": 4560 }, { "epoch": 3.8413812464907355, "grad_norm": 0.35427504777908325, "learning_rate": 1.5459624207221269e-06, "loss": 0.3523, "step": 4561 }, { "epoch": 3.8422234699606963, "grad_norm": 0.31598979234695435, "learning_rate": 1.5438365503807356e-06, "loss": 0.355, "step": 4562 }, { "epoch": 3.843065693430657, "grad_norm": 0.32776692509651184, "learning_rate": 1.5417118758126408e-06, "loss": 0.3518, "step": 4563 }, { "epoch": 3.8439079169006174, "grad_norm": 0.3381842076778412, "learning_rate": 1.5395883977529413e-06, "loss": 0.3882, "step": 4564 }, { "epoch": 3.844750140370578, "grad_norm": 0.3188408613204956, "learning_rate": 1.5374661169363225e-06, "loss": 0.3258, "step": 4565 }, { "epoch": 3.845592363840539, "grad_norm": 0.3486538529396057, "learning_rate": 1.5353450340970594e-06, "loss": 0.348, "step": 4566 }, { "epoch": 3.8464345873104997, "grad_norm": 0.3379819393157959, "learning_rate": 1.533225149969011e-06, "loss": 0.3552, "step": 4567 }, { "epoch": 3.8472768107804605, "grad_norm": 0.36643579602241516, "learning_rate": 1.5311064652856194e-06, "loss": 0.3398, "step": 4568 }, { "epoch": 3.8481190342504212, "grad_norm": 0.3207562565803528, "learning_rate": 1.5289889807799119e-06, "loss": 0.3552, "step": 4569 }, { "epoch": 3.848961257720382, "grad_norm": 0.31840336322784424, "learning_rate": 1.5268726971845038e-06, "loss": 0.3642, "step": 4570 }, { "epoch": 3.8498034811903423, "grad_norm": 0.3451004922389984, "learning_rate": 1.5247576152315935e-06, "loss": 0.3731, "step": 4571 }, { "epoch": 3.850645704660303, "grad_norm": 0.3214096426963806, "learning_rate": 1.5226437356529629e-06, "loss": 0.3643, "step": 4572 }, { "epoch": 3.851487928130264, "grad_norm": 0.33304286003112793, "learning_rate": 1.5205310591799748e-06, "loss": 0.3586, "step": 4573 }, { "epoch": 3.8523301516002246, "grad_norm": 0.32214492559432983, "learning_rate": 1.518419586543582e-06, "loss": 0.3637, "step": 4574 }, { "epoch": 3.8531723750701854, "grad_norm": 0.34517961740493774, "learning_rate": 1.5163093184743189e-06, "loss": 0.3558, "step": 4575 }, { "epoch": 3.8540145985401457, "grad_norm": 0.3190985321998596, "learning_rate": 1.5142002557023007e-06, "loss": 0.3578, "step": 4576 }, { "epoch": 3.854856822010107, "grad_norm": 0.3284756541252136, "learning_rate": 1.5120923989572246e-06, "loss": 0.368, "step": 4577 }, { "epoch": 3.8556990454800673, "grad_norm": 0.3261776566505432, "learning_rate": 1.5099857489683756e-06, "loss": 0.3621, "step": 4578 }, { "epoch": 3.856541268950028, "grad_norm": 0.3231543004512787, "learning_rate": 1.507880306464619e-06, "loss": 0.3589, "step": 4579 }, { "epoch": 3.857383492419989, "grad_norm": 0.348090797662735, "learning_rate": 1.5057760721744008e-06, "loss": 0.375, "step": 4580 }, { "epoch": 3.8582257158899496, "grad_norm": 0.3297468423843384, "learning_rate": 1.5036730468257489e-06, "loss": 0.3648, "step": 4581 }, { "epoch": 3.8590679393599103, "grad_norm": 0.33784717321395874, "learning_rate": 1.5015712311462748e-06, "loss": 0.3714, "step": 4582 }, { "epoch": 3.8599101628298707, "grad_norm": 0.33733493089675903, "learning_rate": 1.4994706258631726e-06, "loss": 0.3567, "step": 4583 }, { "epoch": 3.8607523862998314, "grad_norm": 0.3341391384601593, "learning_rate": 1.4973712317032135e-06, "loss": 0.3455, "step": 4584 }, { "epoch": 3.861594609769792, "grad_norm": 0.34344902634620667, "learning_rate": 1.4952730493927498e-06, "loss": 0.3724, "step": 4585 }, { "epoch": 3.862436833239753, "grad_norm": 0.33374103903770447, "learning_rate": 1.4931760796577222e-06, "loss": 0.3328, "step": 4586 }, { "epoch": 3.8632790567097137, "grad_norm": 0.30843308568000793, "learning_rate": 1.491080323223643e-06, "loss": 0.3804, "step": 4587 }, { "epoch": 3.8641212801796745, "grad_norm": 0.2995354235172272, "learning_rate": 1.4889857808156071e-06, "loss": 0.3458, "step": 4588 }, { "epoch": 3.8649635036496353, "grad_norm": 0.32793599367141724, "learning_rate": 1.4868924531582911e-06, "loss": 0.349, "step": 4589 }, { "epoch": 3.8658057271195956, "grad_norm": 0.33148911595344543, "learning_rate": 1.4848003409759532e-06, "loss": 0.354, "step": 4590 }, { "epoch": 3.8666479505895563, "grad_norm": 0.3623882234096527, "learning_rate": 1.482709444992425e-06, "loss": 0.3717, "step": 4591 }, { "epoch": 3.867490174059517, "grad_norm": 0.3140641450881958, "learning_rate": 1.4806197659311205e-06, "loss": 0.3451, "step": 4592 }, { "epoch": 3.868332397529478, "grad_norm": 0.3370644450187683, "learning_rate": 1.4785313045150341e-06, "loss": 0.3711, "step": 4593 }, { "epoch": 3.8691746209994387, "grad_norm": 0.3253495693206787, "learning_rate": 1.476444061466739e-06, "loss": 0.3696, "step": 4594 }, { "epoch": 3.870016844469399, "grad_norm": 0.3438718914985657, "learning_rate": 1.4743580375083838e-06, "loss": 0.3782, "step": 4595 }, { "epoch": 3.87085906793936, "grad_norm": 0.3122476041316986, "learning_rate": 1.4722732333616963e-06, "loss": 0.3749, "step": 4596 }, { "epoch": 3.8717012914093205, "grad_norm": 0.30423977971076965, "learning_rate": 1.4701896497479834e-06, "loss": 0.3435, "step": 4597 }, { "epoch": 3.8725435148792813, "grad_norm": 0.31240716576576233, "learning_rate": 1.4681072873881313e-06, "loss": 0.3686, "step": 4598 }, { "epoch": 3.873385738349242, "grad_norm": 0.3139128088951111, "learning_rate": 1.4660261470025999e-06, "loss": 0.3562, "step": 4599 }, { "epoch": 3.874227961819203, "grad_norm": 0.3502335846424103, "learning_rate": 1.4639462293114275e-06, "loss": 0.3575, "step": 4600 }, { "epoch": 3.8750701852891636, "grad_norm": 0.3182070851325989, "learning_rate": 1.4618675350342303e-06, "loss": 0.3885, "step": 4601 }, { "epoch": 3.875912408759124, "grad_norm": 0.30051136016845703, "learning_rate": 1.4597900648902036e-06, "loss": 0.3441, "step": 4602 }, { "epoch": 3.8767546322290847, "grad_norm": 0.30988839268684387, "learning_rate": 1.4577138195981138e-06, "loss": 0.3568, "step": 4603 }, { "epoch": 3.8775968556990454, "grad_norm": 0.3229370415210724, "learning_rate": 1.4556387998763038e-06, "loss": 0.3606, "step": 4604 }, { "epoch": 3.878439079169006, "grad_norm": 0.3305395841598511, "learning_rate": 1.4535650064427003e-06, "loss": 0.3376, "step": 4605 }, { "epoch": 3.879281302638967, "grad_norm": 0.28555500507354736, "learning_rate": 1.4514924400147984e-06, "loss": 0.3533, "step": 4606 }, { "epoch": 3.8801235261089273, "grad_norm": 0.2978992462158203, "learning_rate": 1.4494211013096694e-06, "loss": 0.3473, "step": 4607 }, { "epoch": 3.8809657495788885, "grad_norm": 0.32250499725341797, "learning_rate": 1.4473509910439581e-06, "loss": 0.3488, "step": 4608 }, { "epoch": 3.881807973048849, "grad_norm": 0.3350200355052948, "learning_rate": 1.4452821099338942e-06, "loss": 0.3545, "step": 4609 }, { "epoch": 3.8826501965188096, "grad_norm": 0.3322496712207794, "learning_rate": 1.4432144586952706e-06, "loss": 0.3912, "step": 4610 }, { "epoch": 3.8834924199887704, "grad_norm": 0.3196665346622467, "learning_rate": 1.441148038043459e-06, "loss": 0.3655, "step": 4611 }, { "epoch": 3.884334643458731, "grad_norm": 0.30585914850234985, "learning_rate": 1.439082848693406e-06, "loss": 0.3825, "step": 4612 }, { "epoch": 3.885176866928692, "grad_norm": 0.30947548151016235, "learning_rate": 1.4370188913596339e-06, "loss": 0.3586, "step": 4613 }, { "epoch": 3.8860190903986522, "grad_norm": 0.2999376356601715, "learning_rate": 1.4349561667562345e-06, "loss": 0.3467, "step": 4614 }, { "epoch": 3.886861313868613, "grad_norm": 0.3189385235309601, "learning_rate": 1.4328946755968742e-06, "loss": 0.3532, "step": 4615 }, { "epoch": 3.8877035373385738, "grad_norm": 0.3214626610279083, "learning_rate": 1.4308344185947948e-06, "loss": 0.367, "step": 4616 }, { "epoch": 3.8885457608085345, "grad_norm": 0.31439465284347534, "learning_rate": 1.4287753964628108e-06, "loss": 0.3214, "step": 4617 }, { "epoch": 3.8893879842784953, "grad_norm": 0.31111404299736023, "learning_rate": 1.4267176099133085e-06, "loss": 0.3621, "step": 4618 }, { "epoch": 3.890230207748456, "grad_norm": 0.35472372174263, "learning_rate": 1.4246610596582443e-06, "loss": 0.3652, "step": 4619 }, { "epoch": 3.891072431218417, "grad_norm": 0.36642906069755554, "learning_rate": 1.4226057464091508e-06, "loss": 0.3621, "step": 4620 }, { "epoch": 3.891914654688377, "grad_norm": 0.36119019985198975, "learning_rate": 1.4205516708771327e-06, "loss": 0.3634, "step": 4621 }, { "epoch": 3.892756878158338, "grad_norm": 0.35234811902046204, "learning_rate": 1.418498833772864e-06, "loss": 0.3434, "step": 4622 }, { "epoch": 3.8935991016282987, "grad_norm": 0.3334217071533203, "learning_rate": 1.4164472358065884e-06, "loss": 0.361, "step": 4623 }, { "epoch": 3.8944413250982595, "grad_norm": 0.3582606911659241, "learning_rate": 1.4143968776881261e-06, "loss": 0.368, "step": 4624 }, { "epoch": 3.8952835485682202, "grad_norm": 0.35640934109687805, "learning_rate": 1.4123477601268676e-06, "loss": 0.3681, "step": 4625 }, { "epoch": 3.8961257720381806, "grad_norm": 0.3498721122741699, "learning_rate": 1.41029988383177e-06, "loss": 0.3522, "step": 4626 }, { "epoch": 3.8969679955081418, "grad_norm": 0.31299397349357605, "learning_rate": 1.4082532495113627e-06, "loss": 0.3546, "step": 4627 }, { "epoch": 3.897810218978102, "grad_norm": 0.3011874854564667, "learning_rate": 1.406207857873747e-06, "loss": 0.3648, "step": 4628 }, { "epoch": 3.898652442448063, "grad_norm": 0.3414140045642853, "learning_rate": 1.4041637096265954e-06, "loss": 0.3614, "step": 4629 }, { "epoch": 3.8994946659180236, "grad_norm": 0.30418381094932556, "learning_rate": 1.4021208054771445e-06, "loss": 0.3496, "step": 4630 }, { "epoch": 3.9003368893879844, "grad_norm": 0.31510141491889954, "learning_rate": 1.4000791461322078e-06, "loss": 0.3523, "step": 4631 }, { "epoch": 3.901179112857945, "grad_norm": 0.3139173984527588, "learning_rate": 1.398038732298161e-06, "loss": 0.3291, "step": 4632 }, { "epoch": 3.9020213363279055, "grad_norm": 0.3078778386116028, "learning_rate": 1.395999564680955e-06, "loss": 0.3496, "step": 4633 }, { "epoch": 3.9028635597978663, "grad_norm": 0.30446916818618774, "learning_rate": 1.3939616439861043e-06, "loss": 0.3863, "step": 4634 }, { "epoch": 3.903705783267827, "grad_norm": 0.3234868049621582, "learning_rate": 1.3919249709186978e-06, "loss": 0.3377, "step": 4635 }, { "epoch": 3.904548006737788, "grad_norm": 0.28871896862983704, "learning_rate": 1.3898895461833856e-06, "loss": 0.346, "step": 4636 }, { "epoch": 3.9053902302077486, "grad_norm": 0.32479172945022583, "learning_rate": 1.3878553704843933e-06, "loss": 0.3466, "step": 4637 }, { "epoch": 3.906232453677709, "grad_norm": 0.32357269525527954, "learning_rate": 1.3858224445255082e-06, "loss": 0.3618, "step": 4638 }, { "epoch": 3.90707467714767, "grad_norm": 0.3313543498516083, "learning_rate": 1.3837907690100882e-06, "loss": 0.3476, "step": 4639 }, { "epoch": 3.9079169006176304, "grad_norm": 0.3005976378917694, "learning_rate": 1.381760344641061e-06, "loss": 0.3665, "step": 4640 }, { "epoch": 3.908759124087591, "grad_norm": 0.3326511085033417, "learning_rate": 1.3797311721209162e-06, "loss": 0.3632, "step": 4641 }, { "epoch": 3.909601347557552, "grad_norm": 0.31403452157974243, "learning_rate": 1.3777032521517113e-06, "loss": 0.3388, "step": 4642 }, { "epoch": 3.9104435710275127, "grad_norm": 0.31206196546554565, "learning_rate": 1.375676585435074e-06, "loss": 0.3554, "step": 4643 }, { "epoch": 3.9112857944974735, "grad_norm": 0.31385910511016846, "learning_rate": 1.3736511726721969e-06, "loss": 0.3495, "step": 4644 }, { "epoch": 3.912128017967434, "grad_norm": 0.3250003755092621, "learning_rate": 1.371627014563837e-06, "loss": 0.348, "step": 4645 }, { "epoch": 3.9129702414373946, "grad_norm": 0.3309669494628906, "learning_rate": 1.369604111810317e-06, "loss": 0.3575, "step": 4646 }, { "epoch": 3.9138124649073553, "grad_norm": 0.33092910051345825, "learning_rate": 1.3675824651115276e-06, "loss": 0.3485, "step": 4647 }, { "epoch": 3.914654688377316, "grad_norm": 0.3167420029640198, "learning_rate": 1.3655620751669257e-06, "loss": 0.3623, "step": 4648 }, { "epoch": 3.915496911847277, "grad_norm": 0.3092111349105835, "learning_rate": 1.3635429426755288e-06, "loss": 0.3616, "step": 4649 }, { "epoch": 3.9163391353172377, "grad_norm": 0.3081475496292114, "learning_rate": 1.3615250683359238e-06, "loss": 0.3679, "step": 4650 }, { "epoch": 3.9171813587871984, "grad_norm": 0.35832592844963074, "learning_rate": 1.3595084528462594e-06, "loss": 0.3597, "step": 4651 }, { "epoch": 3.9180235822571587, "grad_norm": 0.33081141114234924, "learning_rate": 1.3574930969042522e-06, "loss": 0.3312, "step": 4652 }, { "epoch": 3.9188658057271195, "grad_norm": 0.32139089703559875, "learning_rate": 1.3554790012071778e-06, "loss": 0.3826, "step": 4653 }, { "epoch": 3.9197080291970803, "grad_norm": 0.34394076466560364, "learning_rate": 1.3534661664518816e-06, "loss": 0.3518, "step": 4654 }, { "epoch": 3.920550252667041, "grad_norm": 0.3522562086582184, "learning_rate": 1.3514545933347671e-06, "loss": 0.3476, "step": 4655 }, { "epoch": 3.921392476137002, "grad_norm": 0.3168662488460541, "learning_rate": 1.3494442825518077e-06, "loss": 0.3517, "step": 4656 }, { "epoch": 3.922234699606962, "grad_norm": 0.31334906816482544, "learning_rate": 1.3474352347985326e-06, "loss": 0.3378, "step": 4657 }, { "epoch": 3.9230769230769234, "grad_norm": 0.3213628828525543, "learning_rate": 1.345427450770041e-06, "loss": 0.3396, "step": 4658 }, { "epoch": 3.9239191465468837, "grad_norm": 0.32370316982269287, "learning_rate": 1.3434209311609885e-06, "loss": 0.3531, "step": 4659 }, { "epoch": 3.9247613700168444, "grad_norm": 0.3244037330150604, "learning_rate": 1.3414156766656005e-06, "loss": 0.3527, "step": 4660 }, { "epoch": 3.925603593486805, "grad_norm": 0.3104284405708313, "learning_rate": 1.339411687977657e-06, "loss": 0.3617, "step": 4661 }, { "epoch": 3.926445816956766, "grad_norm": 0.3043331503868103, "learning_rate": 1.3374089657905065e-06, "loss": 0.3623, "step": 4662 }, { "epoch": 3.9272880404267267, "grad_norm": 0.30458205938339233, "learning_rate": 1.3354075107970539e-06, "loss": 0.381, "step": 4663 }, { "epoch": 3.928130263896687, "grad_norm": 0.34282615780830383, "learning_rate": 1.3334073236897715e-06, "loss": 0.3397, "step": 4664 }, { "epoch": 3.928972487366648, "grad_norm": 0.3639140725135803, "learning_rate": 1.3314084051606864e-06, "loss": 0.3456, "step": 4665 }, { "epoch": 3.9298147108366086, "grad_norm": 0.33698463439941406, "learning_rate": 1.329410755901393e-06, "loss": 0.3836, "step": 4666 }, { "epoch": 3.9306569343065694, "grad_norm": 0.37342384457588196, "learning_rate": 1.3274143766030411e-06, "loss": 0.3483, "step": 4667 }, { "epoch": 3.93149915777653, "grad_norm": 0.30536314845085144, "learning_rate": 1.325419267956346e-06, "loss": 0.3656, "step": 4668 }, { "epoch": 3.9323413812464905, "grad_norm": 0.31974783539772034, "learning_rate": 1.3234254306515793e-06, "loss": 0.3326, "step": 4669 }, { "epoch": 3.9331836047164517, "grad_norm": 0.3169264495372772, "learning_rate": 1.3214328653785768e-06, "loss": 0.3798, "step": 4670 }, { "epoch": 3.934025828186412, "grad_norm": 0.3332829177379608, "learning_rate": 1.3194415728267286e-06, "loss": 0.3463, "step": 4671 }, { "epoch": 3.9348680516563728, "grad_norm": 0.3363880217075348, "learning_rate": 1.3174515536849896e-06, "loss": 0.3442, "step": 4672 }, { "epoch": 3.9357102751263335, "grad_norm": 0.3231698274612427, "learning_rate": 1.315462808641874e-06, "loss": 0.3465, "step": 4673 }, { "epoch": 3.9365524985962943, "grad_norm": 0.3172794580459595, "learning_rate": 1.313475338385452e-06, "loss": 0.3594, "step": 4674 }, { "epoch": 3.937394722066255, "grad_norm": 0.33577781915664673, "learning_rate": 1.3114891436033521e-06, "loss": 0.3386, "step": 4675 }, { "epoch": 3.9382369455362154, "grad_norm": 0.32084742188453674, "learning_rate": 1.3095042249827662e-06, "loss": 0.3284, "step": 4676 }, { "epoch": 3.939079169006176, "grad_norm": 0.2931286096572876, "learning_rate": 1.3075205832104422e-06, "loss": 0.3693, "step": 4677 }, { "epoch": 3.939921392476137, "grad_norm": 0.36135196685791016, "learning_rate": 1.3055382189726856e-06, "loss": 0.3441, "step": 4678 }, { "epoch": 3.9407636159460977, "grad_norm": 0.31924140453338623, "learning_rate": 1.3035571329553592e-06, "loss": 0.3795, "step": 4679 }, { "epoch": 3.9416058394160585, "grad_norm": 0.3011431396007538, "learning_rate": 1.3015773258438858e-06, "loss": 0.3502, "step": 4680 }, { "epoch": 3.9424480628860192, "grad_norm": 0.3150167763233185, "learning_rate": 1.299598798323246e-06, "loss": 0.3649, "step": 4681 }, { "epoch": 3.94329028635598, "grad_norm": 0.3098214864730835, "learning_rate": 1.2976215510779755e-06, "loss": 0.3654, "step": 4682 }, { "epoch": 3.9441325098259403, "grad_norm": 0.3093997836112976, "learning_rate": 1.2956455847921657e-06, "loss": 0.3346, "step": 4683 }, { "epoch": 3.944974733295901, "grad_norm": 0.29363325238227844, "learning_rate": 1.2936709001494697e-06, "loss": 0.3604, "step": 4684 }, { "epoch": 3.945816956765862, "grad_norm": 0.3195500671863556, "learning_rate": 1.2916974978330944e-06, "loss": 0.3657, "step": 4685 }, { "epoch": 3.9466591802358226, "grad_norm": 0.3119347095489502, "learning_rate": 1.289725378525803e-06, "loss": 0.3693, "step": 4686 }, { "epoch": 3.9475014037057834, "grad_norm": 0.34296971559524536, "learning_rate": 1.2877545429099131e-06, "loss": 0.351, "step": 4687 }, { "epoch": 3.9483436271757437, "grad_norm": 0.30658623576164246, "learning_rate": 1.2857849916673016e-06, "loss": 0.3646, "step": 4688 }, { "epoch": 3.949185850645705, "grad_norm": 0.3021605908870697, "learning_rate": 1.2838167254794004e-06, "loss": 0.3487, "step": 4689 }, { "epoch": 3.9500280741156653, "grad_norm": 0.31557559967041016, "learning_rate": 1.2818497450271939e-06, "loss": 0.3588, "step": 4690 }, { "epoch": 3.950870297585626, "grad_norm": 0.28874653577804565, "learning_rate": 1.2798840509912242e-06, "loss": 0.3319, "step": 4691 }, { "epoch": 3.951712521055587, "grad_norm": 0.2972557544708252, "learning_rate": 1.277919644051589e-06, "loss": 0.3686, "step": 4692 }, { "epoch": 3.9525547445255476, "grad_norm": 0.293440043926239, "learning_rate": 1.2759565248879391e-06, "loss": 0.3687, "step": 4693 }, { "epoch": 3.9533969679955083, "grad_norm": 0.2918764054775238, "learning_rate": 1.2739946941794778e-06, "loss": 0.3538, "step": 4694 }, { "epoch": 3.9542391914654687, "grad_norm": 0.38969263434410095, "learning_rate": 1.2720341526049662e-06, "loss": 0.3327, "step": 4695 }, { "epoch": 3.9550814149354294, "grad_norm": 0.3165687322616577, "learning_rate": 1.2700749008427204e-06, "loss": 0.3469, "step": 4696 }, { "epoch": 3.95592363840539, "grad_norm": 0.31915581226348877, "learning_rate": 1.2681169395706056e-06, "loss": 0.357, "step": 4697 }, { "epoch": 3.956765861875351, "grad_norm": 0.3532717823982239, "learning_rate": 1.2661602694660414e-06, "loss": 0.3807, "step": 4698 }, { "epoch": 3.9576080853453117, "grad_norm": 0.30728408694267273, "learning_rate": 1.2642048912060045e-06, "loss": 0.3387, "step": 4699 }, { "epoch": 3.958450308815272, "grad_norm": 0.30705106258392334, "learning_rate": 1.2622508054670223e-06, "loss": 0.3534, "step": 4700 }, { "epoch": 3.9592925322852333, "grad_norm": 0.3067359924316406, "learning_rate": 1.2602980129251747e-06, "loss": 0.3782, "step": 4701 }, { "epoch": 3.9601347557551936, "grad_norm": 0.32899218797683716, "learning_rate": 1.2583465142560924e-06, "loss": 0.3571, "step": 4702 }, { "epoch": 3.9609769792251543, "grad_norm": 0.3190484344959259, "learning_rate": 1.256396310134962e-06, "loss": 0.361, "step": 4703 }, { "epoch": 3.961819202695115, "grad_norm": 0.31617605686187744, "learning_rate": 1.254447401236522e-06, "loss": 0.3576, "step": 4704 }, { "epoch": 3.962661426165076, "grad_norm": 0.31298086047172546, "learning_rate": 1.252499788235061e-06, "loss": 0.3612, "step": 4705 }, { "epoch": 3.9635036496350367, "grad_norm": 0.3004712462425232, "learning_rate": 1.2505534718044166e-06, "loss": 0.3521, "step": 4706 }, { "epoch": 3.964345873104997, "grad_norm": 0.3363568186759949, "learning_rate": 1.2486084526179838e-06, "loss": 0.3664, "step": 4707 }, { "epoch": 3.9651880965749577, "grad_norm": 0.29899242520332336, "learning_rate": 1.2466647313487062e-06, "loss": 0.3816, "step": 4708 }, { "epoch": 3.9660303200449185, "grad_norm": 0.3318082094192505, "learning_rate": 1.2447223086690774e-06, "loss": 0.3556, "step": 4709 }, { "epoch": 3.9668725435148793, "grad_norm": 0.3124702572822571, "learning_rate": 1.2427811852511396e-06, "loss": 0.3367, "step": 4710 }, { "epoch": 3.96771476698484, "grad_norm": 0.2949615716934204, "learning_rate": 1.2408413617664933e-06, "loss": 0.378, "step": 4711 }, { "epoch": 3.968556990454801, "grad_norm": 0.32309117913246155, "learning_rate": 1.238902838886281e-06, "loss": 0.3461, "step": 4712 }, { "epoch": 3.9693992139247616, "grad_norm": 0.32001665234565735, "learning_rate": 1.2369656172812e-06, "loss": 0.3691, "step": 4713 }, { "epoch": 3.970241437394722, "grad_norm": 0.3234848976135254, "learning_rate": 1.235029697621491e-06, "loss": 0.3598, "step": 4714 }, { "epoch": 3.9710836608646827, "grad_norm": 0.30215200781822205, "learning_rate": 1.233095080576956e-06, "loss": 0.3571, "step": 4715 }, { "epoch": 3.9719258843346434, "grad_norm": 0.32671383023262024, "learning_rate": 1.2311617668169361e-06, "loss": 0.3872, "step": 4716 }, { "epoch": 3.972768107804604, "grad_norm": 0.334604948759079, "learning_rate": 1.2292297570103229e-06, "loss": 0.3463, "step": 4717 }, { "epoch": 3.973610331274565, "grad_norm": 0.3250965476036072, "learning_rate": 1.2272990518255606e-06, "loss": 0.3594, "step": 4718 }, { "epoch": 3.9744525547445253, "grad_norm": 0.2948666512966156, "learning_rate": 1.2253696519306413e-06, "loss": 0.3472, "step": 4719 }, { "epoch": 3.9752947782144865, "grad_norm": 0.3236892521381378, "learning_rate": 1.223441557993103e-06, "loss": 0.3815, "step": 4720 }, { "epoch": 3.976137001684447, "grad_norm": 0.32563450932502747, "learning_rate": 1.2215147706800318e-06, "loss": 0.3547, "step": 4721 }, { "epoch": 3.9769792251544076, "grad_norm": 0.30780068039894104, "learning_rate": 1.2195892906580642e-06, "loss": 0.363, "step": 4722 }, { "epoch": 3.9778214486243684, "grad_norm": 0.31268244981765747, "learning_rate": 1.2176651185933846e-06, "loss": 0.3372, "step": 4723 }, { "epoch": 3.978663672094329, "grad_norm": 0.3242247998714447, "learning_rate": 1.2157422551517228e-06, "loss": 0.3693, "step": 4724 }, { "epoch": 3.97950589556429, "grad_norm": 0.3229404091835022, "learning_rate": 1.2138207009983544e-06, "loss": 0.361, "step": 4725 }, { "epoch": 3.9803481190342502, "grad_norm": 0.3157649636268616, "learning_rate": 1.2119004567981057e-06, "loss": 0.3849, "step": 4726 }, { "epoch": 3.981190342504211, "grad_norm": 0.31488585472106934, "learning_rate": 1.20998152321535e-06, "loss": 0.3476, "step": 4727 }, { "epoch": 3.9820325659741718, "grad_norm": 0.3146711587905884, "learning_rate": 1.2080639009140039e-06, "loss": 0.3667, "step": 4728 }, { "epoch": 3.9828747894441325, "grad_norm": 0.28562623262405396, "learning_rate": 1.2061475905575309e-06, "loss": 0.3647, "step": 4729 }, { "epoch": 3.9837170129140933, "grad_norm": 0.3163255751132965, "learning_rate": 1.2042325928089422e-06, "loss": 0.364, "step": 4730 }, { "epoch": 3.9845592363840536, "grad_norm": 0.338783323764801, "learning_rate": 1.202318908330795e-06, "loss": 0.3518, "step": 4731 }, { "epoch": 3.985401459854015, "grad_norm": 0.31538766622543335, "learning_rate": 1.200406537785192e-06, "loss": 0.3621, "step": 4732 }, { "epoch": 3.986243683323975, "grad_norm": 0.31235966086387634, "learning_rate": 1.1984954818337774e-06, "loss": 0.3582, "step": 4733 }, { "epoch": 3.987085906793936, "grad_norm": 0.3409157395362854, "learning_rate": 1.1965857411377457e-06, "loss": 0.3372, "step": 4734 }, { "epoch": 3.9879281302638967, "grad_norm": 0.319913774728775, "learning_rate": 1.1946773163578363e-06, "loss": 0.3542, "step": 4735 }, { "epoch": 3.9887703537338575, "grad_norm": 0.29448917508125305, "learning_rate": 1.1927702081543279e-06, "loss": 0.3478, "step": 4736 }, { "epoch": 3.9896125772038182, "grad_norm": 0.33356985449790955, "learning_rate": 1.1908644171870504e-06, "loss": 0.3483, "step": 4737 }, { "epoch": 3.9904548006737786, "grad_norm": 0.3180762827396393, "learning_rate": 1.1889599441153722e-06, "loss": 0.3897, "step": 4738 }, { "epoch": 3.9912970241437393, "grad_norm": 0.3237883448600769, "learning_rate": 1.187056789598211e-06, "loss": 0.3464, "step": 4739 }, { "epoch": 3.9921392476137, "grad_norm": 0.3234178125858307, "learning_rate": 1.1851549542940223e-06, "loss": 0.3496, "step": 4740 }, { "epoch": 3.992981471083661, "grad_norm": 0.3092823922634125, "learning_rate": 1.1832544388608109e-06, "loss": 0.3578, "step": 4741 }, { "epoch": 3.9938236945536216, "grad_norm": 0.3536873161792755, "learning_rate": 1.1813552439561232e-06, "loss": 0.365, "step": 4742 }, { "epoch": 3.9946659180235824, "grad_norm": 0.3060668408870697, "learning_rate": 1.1794573702370464e-06, "loss": 0.3451, "step": 4743 }, { "epoch": 3.995508141493543, "grad_norm": 0.3214830458164215, "learning_rate": 1.1775608183602121e-06, "loss": 0.3571, "step": 4744 }, { "epoch": 3.9963503649635035, "grad_norm": 0.3224519193172455, "learning_rate": 1.1756655889817952e-06, "loss": 0.3866, "step": 4745 }, { "epoch": 3.9971925884334643, "grad_norm": 0.31845560669898987, "learning_rate": 1.1737716827575141e-06, "loss": 0.3553, "step": 4746 }, { "epoch": 3.998034811903425, "grad_norm": 0.3298132121562958, "learning_rate": 1.1718791003426267e-06, "loss": 0.3876, "step": 4747 }, { "epoch": 3.998877035373386, "grad_norm": 0.30267229676246643, "learning_rate": 1.1699878423919325e-06, "loss": 0.3366, "step": 4748 }, { "epoch": 3.9997192588433466, "grad_norm": 0.3069247305393219, "learning_rate": 1.1680979095597755e-06, "loss": 0.37, "step": 4749 }, { "epoch": 4.000561482313307, "grad_norm": 0.626861572265625, "learning_rate": 1.1662093025000415e-06, "loss": 0.5154, "step": 4750 }, { "epoch": 4.001403705783268, "grad_norm": 0.3226439952850342, "learning_rate": 1.1643220218661555e-06, "loss": 0.3568, "step": 4751 }, { "epoch": 4.002245929253228, "grad_norm": 0.3114364445209503, "learning_rate": 1.162436068311082e-06, "loss": 0.3039, "step": 4752 }, { "epoch": 4.00308815272319, "grad_norm": 0.33753687143325806, "learning_rate": 1.1605514424873304e-06, "loss": 0.3203, "step": 4753 }, { "epoch": 4.00393037619315, "grad_norm": 0.33062347769737244, "learning_rate": 1.15866814504695e-06, "loss": 0.3644, "step": 4754 }, { "epoch": 4.00477259966311, "grad_norm": 0.3226526975631714, "learning_rate": 1.1567861766415268e-06, "loss": 0.3536, "step": 4755 }, { "epoch": 4.0056148231330715, "grad_norm": 0.3182262182235718, "learning_rate": 1.1549055379221923e-06, "loss": 0.3048, "step": 4756 }, { "epoch": 4.006457046603032, "grad_norm": 0.3435400426387787, "learning_rate": 1.1530262295396133e-06, "loss": 0.3448, "step": 4757 }, { "epoch": 4.007299270072993, "grad_norm": 0.3388313055038452, "learning_rate": 1.1511482521439998e-06, "loss": 0.3455, "step": 4758 }, { "epoch": 4.008141493542953, "grad_norm": 0.3278298079967499, "learning_rate": 1.1492716063850973e-06, "loss": 0.3312, "step": 4759 }, { "epoch": 4.008983717012914, "grad_norm": 0.3166085481643677, "learning_rate": 1.1473962929121968e-06, "loss": 0.3206, "step": 4760 }, { "epoch": 4.009825940482875, "grad_norm": 0.30389201641082764, "learning_rate": 1.1455223123741204e-06, "loss": 0.3187, "step": 4761 }, { "epoch": 4.010668163952835, "grad_norm": 0.35824429988861084, "learning_rate": 1.1436496654192368e-06, "loss": 0.3642, "step": 4762 }, { "epoch": 4.011510387422796, "grad_norm": 0.33321037888526917, "learning_rate": 1.1417783526954474e-06, "loss": 0.3295, "step": 4763 }, { "epoch": 4.012352610892757, "grad_norm": 0.3231672942638397, "learning_rate": 1.1399083748501966e-06, "loss": 0.301, "step": 4764 }, { "epoch": 4.013194834362718, "grad_norm": 0.3385957181453705, "learning_rate": 1.1380397325304614e-06, "loss": 0.3618, "step": 4765 }, { "epoch": 4.014037057832678, "grad_norm": 0.3307839035987854, "learning_rate": 1.1361724263827633e-06, "loss": 0.3456, "step": 4766 }, { "epoch": 4.014879281302639, "grad_norm": 0.31745079159736633, "learning_rate": 1.1343064570531552e-06, "loss": 0.3038, "step": 4767 }, { "epoch": 4.0157215047726, "grad_norm": 0.31787943840026855, "learning_rate": 1.1324418251872342e-06, "loss": 0.3438, "step": 4768 }, { "epoch": 4.01656372824256, "grad_norm": 0.3365257978439331, "learning_rate": 1.1305785314301271e-06, "loss": 0.3358, "step": 4769 }, { "epoch": 4.017405951712521, "grad_norm": 0.3429902195930481, "learning_rate": 1.1287165764265045e-06, "loss": 0.3828, "step": 4770 }, { "epoch": 4.018248175182482, "grad_norm": 0.3003966510295868, "learning_rate": 1.1268559608205681e-06, "loss": 0.3234, "step": 4771 }, { "epoch": 4.019090398652443, "grad_norm": 0.32748088240623474, "learning_rate": 1.124996685256063e-06, "loss": 0.3243, "step": 4772 }, { "epoch": 4.019932622122403, "grad_norm": 0.35587161779403687, "learning_rate": 1.123138750376262e-06, "loss": 0.3595, "step": 4773 }, { "epoch": 4.0207748455923635, "grad_norm": 0.3249056339263916, "learning_rate": 1.1212821568239822e-06, "loss": 0.3445, "step": 4774 }, { "epoch": 4.021617069062325, "grad_norm": 0.335587739944458, "learning_rate": 1.1194269052415714e-06, "loss": 0.3525, "step": 4775 }, { "epoch": 4.022459292532285, "grad_norm": 0.322501003742218, "learning_rate": 1.1175729962709165e-06, "loss": 0.3548, "step": 4776 }, { "epoch": 4.023301516002246, "grad_norm": 0.32869043946266174, "learning_rate": 1.1157204305534352e-06, "loss": 0.3533, "step": 4777 }, { "epoch": 4.024143739472207, "grad_norm": 0.2929961681365967, "learning_rate": 1.1138692087300856e-06, "loss": 0.3204, "step": 4778 }, { "epoch": 4.024985962942167, "grad_norm": 0.31837838888168335, "learning_rate": 1.11201933144136e-06, "loss": 0.3391, "step": 4779 }, { "epoch": 4.025828186412128, "grad_norm": 0.322224885225296, "learning_rate": 1.1101707993272826e-06, "loss": 0.3528, "step": 4780 }, { "epoch": 4.0266704098820885, "grad_norm": 0.30950281023979187, "learning_rate": 1.1083236130274128e-06, "loss": 0.3179, "step": 4781 }, { "epoch": 4.02751263335205, "grad_norm": 0.3194299638271332, "learning_rate": 1.1064777731808463e-06, "loss": 0.3501, "step": 4782 }, { "epoch": 4.02835485682201, "grad_norm": 0.31974413990974426, "learning_rate": 1.1046332804262138e-06, "loss": 0.3186, "step": 4783 }, { "epoch": 4.029197080291971, "grad_norm": 0.3259528875350952, "learning_rate": 1.1027901354016763e-06, "loss": 0.3606, "step": 4784 }, { "epoch": 4.0300393037619315, "grad_norm": 0.28838983178138733, "learning_rate": 1.1009483387449294e-06, "loss": 0.3264, "step": 4785 }, { "epoch": 4.030881527231892, "grad_norm": 0.3033713698387146, "learning_rate": 1.0991078910932047e-06, "loss": 0.3534, "step": 4786 }, { "epoch": 4.031723750701853, "grad_norm": 0.3081444799900055, "learning_rate": 1.097268793083266e-06, "loss": 0.3383, "step": 4787 }, { "epoch": 4.032565974171813, "grad_norm": 0.31056734919548035, "learning_rate": 1.0954310453514083e-06, "loss": 0.3711, "step": 4788 }, { "epoch": 4.033408197641775, "grad_norm": 0.3115420341491699, "learning_rate": 1.0935946485334625e-06, "loss": 0.3358, "step": 4789 }, { "epoch": 4.034250421111735, "grad_norm": 0.31970125436782837, "learning_rate": 1.0917596032647882e-06, "loss": 0.3674, "step": 4790 }, { "epoch": 4.035092644581695, "grad_norm": 0.3173792362213135, "learning_rate": 1.0899259101802818e-06, "loss": 0.3542, "step": 4791 }, { "epoch": 4.0359348680516565, "grad_norm": 0.32067549228668213, "learning_rate": 1.0880935699143675e-06, "loss": 0.3208, "step": 4792 }, { "epoch": 4.036777091521617, "grad_norm": 0.299493670463562, "learning_rate": 1.086262583101006e-06, "loss": 0.3297, "step": 4793 }, { "epoch": 4.037619314991578, "grad_norm": 0.31630685925483704, "learning_rate": 1.084432950373685e-06, "loss": 0.3213, "step": 4794 }, { "epoch": 4.038461538461538, "grad_norm": 0.3160959780216217, "learning_rate": 1.0826046723654283e-06, "loss": 0.3673, "step": 4795 }, { "epoch": 4.0393037619314995, "grad_norm": 0.30073750019073486, "learning_rate": 1.0807777497087863e-06, "loss": 0.3467, "step": 4796 }, { "epoch": 4.04014598540146, "grad_norm": 0.3033844232559204, "learning_rate": 1.078952183035844e-06, "loss": 0.3475, "step": 4797 }, { "epoch": 4.04098820887142, "grad_norm": 0.3089721202850342, "learning_rate": 1.077127972978218e-06, "loss": 0.3252, "step": 4798 }, { "epoch": 4.041830432341381, "grad_norm": 0.29699498414993286, "learning_rate": 1.0753051201670517e-06, "loss": 0.3059, "step": 4799 }, { "epoch": 4.042672655811342, "grad_norm": 0.3175121545791626, "learning_rate": 1.0734836252330204e-06, "loss": 0.3847, "step": 4800 }, { "epoch": 4.043514879281303, "grad_norm": 0.31369271874427795, "learning_rate": 1.071663488806331e-06, "loss": 0.3534, "step": 4801 }, { "epoch": 4.044357102751263, "grad_norm": 0.3323385715484619, "learning_rate": 1.0698447115167204e-06, "loss": 0.3459, "step": 4802 }, { "epoch": 4.0451993262212245, "grad_norm": 0.3301713764667511, "learning_rate": 1.0680272939934538e-06, "loss": 0.3594, "step": 4803 }, { "epoch": 4.046041549691185, "grad_norm": 0.302642822265625, "learning_rate": 1.066211236865325e-06, "loss": 0.2919, "step": 4804 }, { "epoch": 4.046883773161145, "grad_norm": 0.3422060012817383, "learning_rate": 1.0643965407606594e-06, "loss": 0.3458, "step": 4805 }, { "epoch": 4.047725996631106, "grad_norm": 0.3323655426502228, "learning_rate": 1.0625832063073127e-06, "loss": 0.3271, "step": 4806 }, { "epoch": 4.048568220101067, "grad_norm": 0.31639498472213745, "learning_rate": 1.0607712341326658e-06, "loss": 0.323, "step": 4807 }, { "epoch": 4.049410443571028, "grad_norm": 0.2975011169910431, "learning_rate": 1.0589606248636291e-06, "loss": 0.3469, "step": 4808 }, { "epoch": 4.050252667040988, "grad_norm": 0.32699882984161377, "learning_rate": 1.0571513791266436e-06, "loss": 0.3431, "step": 4809 }, { "epoch": 4.0510948905109485, "grad_norm": 0.31289881467819214, "learning_rate": 1.0553434975476784e-06, "loss": 0.336, "step": 4810 }, { "epoch": 4.05193711398091, "grad_norm": 0.31244710087776184, "learning_rate": 1.0535369807522278e-06, "loss": 0.3586, "step": 4811 }, { "epoch": 4.05277933745087, "grad_norm": 0.31501176953315735, "learning_rate": 1.0517318293653151e-06, "loss": 0.3242, "step": 4812 }, { "epoch": 4.053621560920831, "grad_norm": 0.30620598793029785, "learning_rate": 1.0499280440114923e-06, "loss": 0.3354, "step": 4813 }, { "epoch": 4.054463784390792, "grad_norm": 0.31723326444625854, "learning_rate": 1.0481256253148403e-06, "loss": 0.3404, "step": 4814 }, { "epoch": 4.055306007860753, "grad_norm": 0.3001113831996918, "learning_rate": 1.0463245738989636e-06, "loss": 0.3505, "step": 4815 }, { "epoch": 4.056148231330713, "grad_norm": 0.3188074827194214, "learning_rate": 1.044524890386992e-06, "loss": 0.3554, "step": 4816 }, { "epoch": 4.056990454800673, "grad_norm": 0.32508692145347595, "learning_rate": 1.0427265754015907e-06, "loss": 0.3354, "step": 4817 }, { "epoch": 4.057832678270635, "grad_norm": 0.3351089358329773, "learning_rate": 1.0409296295649435e-06, "loss": 0.3152, "step": 4818 }, { "epoch": 4.058674901740595, "grad_norm": 0.300353080034256, "learning_rate": 1.0391340534987621e-06, "loss": 0.3442, "step": 4819 }, { "epoch": 4.059517125210556, "grad_norm": 0.33542975783348083, "learning_rate": 1.037339847824283e-06, "loss": 0.331, "step": 4820 }, { "epoch": 4.0603593486805165, "grad_norm": 0.3273381292819977, "learning_rate": 1.0355470131622759e-06, "loss": 0.375, "step": 4821 }, { "epoch": 4.061201572150477, "grad_norm": 0.3210098445415497, "learning_rate": 1.0337555501330283e-06, "loss": 0.3698, "step": 4822 }, { "epoch": 4.062043795620438, "grad_norm": 0.2996540069580078, "learning_rate": 1.0319654593563533e-06, "loss": 0.2964, "step": 4823 }, { "epoch": 4.062886019090398, "grad_norm": 0.32569757103919983, "learning_rate": 1.0301767414515945e-06, "loss": 0.3286, "step": 4824 }, { "epoch": 4.06372824256036, "grad_norm": 0.3310258388519287, "learning_rate": 1.0283893970376175e-06, "loss": 0.3551, "step": 4825 }, { "epoch": 4.06457046603032, "grad_norm": 0.31328678131103516, "learning_rate": 1.0266034267328129e-06, "loss": 0.3463, "step": 4826 }, { "epoch": 4.065412689500281, "grad_norm": 0.3199225962162018, "learning_rate": 1.0248188311550934e-06, "loss": 0.3825, "step": 4827 }, { "epoch": 4.066254912970241, "grad_norm": 0.29482266306877136, "learning_rate": 1.0230356109218996e-06, "loss": 0.2949, "step": 4828 }, { "epoch": 4.067097136440202, "grad_norm": 0.3361750543117523, "learning_rate": 1.0212537666501976e-06, "loss": 0.3775, "step": 4829 }, { "epoch": 4.067939359910163, "grad_norm": 0.31981709599494934, "learning_rate": 1.0194732989564733e-06, "loss": 0.3101, "step": 4830 }, { "epoch": 4.068781583380123, "grad_norm": 0.345583438873291, "learning_rate": 1.017694208456736e-06, "loss": 0.3166, "step": 4831 }, { "epoch": 4.0696238068500845, "grad_norm": 0.32028669118881226, "learning_rate": 1.0159164957665224e-06, "loss": 0.3726, "step": 4832 }, { "epoch": 4.070466030320045, "grad_norm": 0.30975088477134705, "learning_rate": 1.0141401615008917e-06, "loss": 0.334, "step": 4833 }, { "epoch": 4.071308253790006, "grad_norm": 0.306618869304657, "learning_rate": 1.0123652062744243e-06, "loss": 0.3716, "step": 4834 }, { "epoch": 4.072150477259966, "grad_norm": 0.30990171432495117, "learning_rate": 1.010591630701222e-06, "loss": 0.3521, "step": 4835 }, { "epoch": 4.072992700729927, "grad_norm": 0.2923038899898529, "learning_rate": 1.0088194353949137e-06, "loss": 0.3184, "step": 4836 }, { "epoch": 4.073834924199888, "grad_norm": 0.319573312997818, "learning_rate": 1.0070486209686491e-06, "loss": 0.3762, "step": 4837 }, { "epoch": 4.074677147669848, "grad_norm": 0.3115522563457489, "learning_rate": 1.0052791880350992e-06, "loss": 0.3338, "step": 4838 }, { "epoch": 4.075519371139809, "grad_norm": 0.3130648732185364, "learning_rate": 1.0035111372064548e-06, "loss": 0.3041, "step": 4839 }, { "epoch": 4.07636159460977, "grad_norm": 0.3012271523475647, "learning_rate": 1.0017444690944356e-06, "loss": 0.3361, "step": 4840 }, { "epoch": 4.07720381807973, "grad_norm": 0.3045131266117096, "learning_rate": 9.999791843102763e-07, "loss": 0.3172, "step": 4841 }, { "epoch": 4.078046041549691, "grad_norm": 0.32677775621414185, "learning_rate": 9.98215283464734e-07, "loss": 0.3762, "step": 4842 }, { "epoch": 4.078888265019652, "grad_norm": 0.3192811608314514, "learning_rate": 9.96452767168089e-07, "loss": 0.3389, "step": 4843 }, { "epoch": 4.079730488489613, "grad_norm": 0.301280677318573, "learning_rate": 9.946916360301435e-07, "loss": 0.317, "step": 4844 }, { "epoch": 4.080572711959573, "grad_norm": 0.30540165305137634, "learning_rate": 9.929318906602176e-07, "loss": 0.3686, "step": 4845 }, { "epoch": 4.081414935429534, "grad_norm": 0.317220538854599, "learning_rate": 9.91173531667151e-07, "loss": 0.3453, "step": 4846 }, { "epoch": 4.082257158899495, "grad_norm": 0.3179939389228821, "learning_rate": 9.894165596593074e-07, "loss": 0.3674, "step": 4847 }, { "epoch": 4.083099382369455, "grad_norm": 0.31590595841407776, "learning_rate": 9.876609752445705e-07, "loss": 0.3246, "step": 4848 }, { "epoch": 4.083941605839416, "grad_norm": 0.33918797969818115, "learning_rate": 9.859067790303406e-07, "loss": 0.3716, "step": 4849 }, { "epoch": 4.084783829309377, "grad_norm": 0.3060058653354645, "learning_rate": 9.841539716235387e-07, "loss": 0.3274, "step": 4850 }, { "epoch": 4.085626052779338, "grad_norm": 0.3173372745513916, "learning_rate": 9.82402553630607e-07, "loss": 0.3442, "step": 4851 }, { "epoch": 4.086468276249298, "grad_norm": 0.3097417950630188, "learning_rate": 9.806525256575079e-07, "loss": 0.3269, "step": 4852 }, { "epoch": 4.087310499719258, "grad_norm": 0.344524621963501, "learning_rate": 9.78903888309719e-07, "loss": 0.3878, "step": 4853 }, { "epoch": 4.08815272318922, "grad_norm": 0.3366200029850006, "learning_rate": 9.771566421922386e-07, "loss": 0.3502, "step": 4854 }, { "epoch": 4.08899494665918, "grad_norm": 0.3100210130214691, "learning_rate": 9.754107879095847e-07, "loss": 0.3001, "step": 4855 }, { "epoch": 4.089837170129141, "grad_norm": 0.35410135984420776, "learning_rate": 9.736663260657936e-07, "loss": 0.3679, "step": 4856 }, { "epoch": 4.0906793935991015, "grad_norm": 0.2905329763889313, "learning_rate": 9.719232572644189e-07, "loss": 0.2965, "step": 4857 }, { "epoch": 4.091521617069063, "grad_norm": 0.3383554220199585, "learning_rate": 9.70181582108531e-07, "loss": 0.3719, "step": 4858 }, { "epoch": 4.092363840539023, "grad_norm": 0.30619245767593384, "learning_rate": 9.684413012007205e-07, "loss": 0.2943, "step": 4859 }, { "epoch": 4.093206064008983, "grad_norm": 0.3315677046775818, "learning_rate": 9.66702415143096e-07, "loss": 0.3674, "step": 4860 }, { "epoch": 4.094048287478945, "grad_norm": 0.3252440094947815, "learning_rate": 9.649649245372816e-07, "loss": 0.3699, "step": 4861 }, { "epoch": 4.094890510948905, "grad_norm": 0.3042856752872467, "learning_rate": 9.632288299844173e-07, "loss": 0.3153, "step": 4862 }, { "epoch": 4.095732734418866, "grad_norm": 0.3319932818412781, "learning_rate": 9.614941320851634e-07, "loss": 0.3518, "step": 4863 }, { "epoch": 4.096574957888826, "grad_norm": 0.3244350552558899, "learning_rate": 9.597608314396978e-07, "loss": 0.3563, "step": 4864 }, { "epoch": 4.097417181358788, "grad_norm": 0.3124767541885376, "learning_rate": 9.580289286477097e-07, "loss": 0.3492, "step": 4865 }, { "epoch": 4.098259404828748, "grad_norm": 0.31974127888679504, "learning_rate": 9.562984243084094e-07, "loss": 0.3395, "step": 4866 }, { "epoch": 4.099101628298708, "grad_norm": 0.3037610650062561, "learning_rate": 9.545693190205208e-07, "loss": 0.3366, "step": 4867 }, { "epoch": 4.0999438517686695, "grad_norm": 0.3018365204334259, "learning_rate": 9.52841613382286e-07, "loss": 0.343, "step": 4868 }, { "epoch": 4.10078607523863, "grad_norm": 0.32013243436813354, "learning_rate": 9.511153079914598e-07, "loss": 0.3699, "step": 4869 }, { "epoch": 4.101628298708591, "grad_norm": 0.32082870602607727, "learning_rate": 9.493904034453161e-07, "loss": 0.3414, "step": 4870 }, { "epoch": 4.102470522178551, "grad_norm": 0.3256252110004425, "learning_rate": 9.476669003406403e-07, "loss": 0.3674, "step": 4871 }, { "epoch": 4.103312745648512, "grad_norm": 0.29743099212646484, "learning_rate": 9.459447992737369e-07, "loss": 0.3532, "step": 4872 }, { "epoch": 4.104154969118473, "grad_norm": 0.2851738929748535, "learning_rate": 9.442241008404213e-07, "loss": 0.308, "step": 4873 }, { "epoch": 4.104997192588433, "grad_norm": 0.31436511874198914, "learning_rate": 9.425048056360286e-07, "loss": 0.3557, "step": 4874 }, { "epoch": 4.105839416058394, "grad_norm": 0.32802441716194153, "learning_rate": 9.407869142554016e-07, "loss": 0.3357, "step": 4875 }, { "epoch": 4.106681639528355, "grad_norm": 0.30117371678352356, "learning_rate": 9.390704272929052e-07, "loss": 0.3138, "step": 4876 }, { "epoch": 4.107523862998316, "grad_norm": 0.33110666275024414, "learning_rate": 9.373553453424106e-07, "loss": 0.3475, "step": 4877 }, { "epoch": 4.108366086468276, "grad_norm": 0.3351551592350006, "learning_rate": 9.356416689973108e-07, "loss": 0.3454, "step": 4878 }, { "epoch": 4.109208309938237, "grad_norm": 0.31951186060905457, "learning_rate": 9.339293988505044e-07, "loss": 0.3505, "step": 4879 }, { "epoch": 4.110050533408198, "grad_norm": 0.32250919938087463, "learning_rate": 9.322185354944107e-07, "loss": 0.3697, "step": 4880 }, { "epoch": 4.110892756878158, "grad_norm": 0.3088934123516083, "learning_rate": 9.305090795209571e-07, "loss": 0.3459, "step": 4881 }, { "epoch": 4.111734980348119, "grad_norm": 0.3009852170944214, "learning_rate": 9.288010315215879e-07, "loss": 0.31, "step": 4882 }, { "epoch": 4.11257720381808, "grad_norm": 0.3570581376552582, "learning_rate": 9.27094392087256e-07, "loss": 0.3638, "step": 4883 }, { "epoch": 4.11341942728804, "grad_norm": 0.3218142092227936, "learning_rate": 9.253891618084304e-07, "loss": 0.3458, "step": 4884 }, { "epoch": 4.114261650758001, "grad_norm": 0.2963237166404724, "learning_rate": 9.236853412750935e-07, "loss": 0.3344, "step": 4885 }, { "epoch": 4.1151038742279615, "grad_norm": 0.31391942501068115, "learning_rate": 9.219829310767365e-07, "loss": 0.3533, "step": 4886 }, { "epoch": 4.115946097697923, "grad_norm": 0.34267112612724304, "learning_rate": 9.202819318023625e-07, "loss": 0.3405, "step": 4887 }, { "epoch": 4.116788321167883, "grad_norm": 0.30576422810554504, "learning_rate": 9.185823440404895e-07, "loss": 0.336, "step": 4888 }, { "epoch": 4.117630544637844, "grad_norm": 0.330409973859787, "learning_rate": 9.168841683791469e-07, "loss": 0.3264, "step": 4889 }, { "epoch": 4.118472768107805, "grad_norm": 0.34570688009262085, "learning_rate": 9.151874054058718e-07, "loss": 0.3508, "step": 4890 }, { "epoch": 4.119314991577765, "grad_norm": 0.29635804891586304, "learning_rate": 9.134920557077182e-07, "loss": 0.2983, "step": 4891 }, { "epoch": 4.120157215047726, "grad_norm": 0.3438546061515808, "learning_rate": 9.11798119871245e-07, "loss": 0.3789, "step": 4892 }, { "epoch": 4.1209994385176865, "grad_norm": 0.2987477779388428, "learning_rate": 9.10105598482528e-07, "loss": 0.3275, "step": 4893 }, { "epoch": 4.121841661987648, "grad_norm": 0.3159482479095459, "learning_rate": 9.084144921271471e-07, "loss": 0.3713, "step": 4894 }, { "epoch": 4.122683885457608, "grad_norm": 0.31872886419296265, "learning_rate": 9.067248013901992e-07, "loss": 0.3717, "step": 4895 }, { "epoch": 4.123526108927569, "grad_norm": 0.29322177171707153, "learning_rate": 9.050365268562861e-07, "loss": 0.3527, "step": 4896 }, { "epoch": 4.1243683323975295, "grad_norm": 0.29845964908599854, "learning_rate": 9.03349669109524e-07, "loss": 0.3245, "step": 4897 }, { "epoch": 4.12521055586749, "grad_norm": 0.32082322239875793, "learning_rate": 9.016642287335336e-07, "loss": 0.3727, "step": 4898 }, { "epoch": 4.126052779337451, "grad_norm": 0.302968829870224, "learning_rate": 8.999802063114522e-07, "loss": 0.3285, "step": 4899 }, { "epoch": 4.126895002807411, "grad_norm": 0.30419886112213135, "learning_rate": 8.982976024259193e-07, "loss": 0.3472, "step": 4900 }, { "epoch": 4.127737226277373, "grad_norm": 0.3330878019332886, "learning_rate": 8.966164176590896e-07, "loss": 0.3562, "step": 4901 }, { "epoch": 4.128579449747333, "grad_norm": 0.31581249833106995, "learning_rate": 8.949366525926223e-07, "loss": 0.3361, "step": 4902 }, { "epoch": 4.129421673217293, "grad_norm": 0.31526869535446167, "learning_rate": 8.932583078076879e-07, "loss": 0.3091, "step": 4903 }, { "epoch": 4.1302638966872545, "grad_norm": 0.33025676012039185, "learning_rate": 8.915813838849662e-07, "loss": 0.3689, "step": 4904 }, { "epoch": 4.131106120157215, "grad_norm": 0.32728925347328186, "learning_rate": 8.89905881404643e-07, "loss": 0.3242, "step": 4905 }, { "epoch": 4.131948343627176, "grad_norm": 0.3083949387073517, "learning_rate": 8.882318009464124e-07, "loss": 0.318, "step": 4906 }, { "epoch": 4.132790567097136, "grad_norm": 0.3447399139404297, "learning_rate": 8.865591430894788e-07, "loss": 0.3584, "step": 4907 }, { "epoch": 4.1336327905670975, "grad_norm": 0.320787638425827, "learning_rate": 8.848879084125539e-07, "loss": 0.3171, "step": 4908 }, { "epoch": 4.134475014037058, "grad_norm": 0.31647956371307373, "learning_rate": 8.832180974938553e-07, "loss": 0.3416, "step": 4909 }, { "epoch": 4.135317237507018, "grad_norm": 0.29488474130630493, "learning_rate": 8.815497109111077e-07, "loss": 0.334, "step": 4910 }, { "epoch": 4.136159460976979, "grad_norm": 0.3064696788787842, "learning_rate": 8.798827492415457e-07, "loss": 0.3674, "step": 4911 }, { "epoch": 4.13700168444694, "grad_norm": 0.3184245824813843, "learning_rate": 8.782172130619105e-07, "loss": 0.3422, "step": 4912 }, { "epoch": 4.137843907916901, "grad_norm": 0.3136856257915497, "learning_rate": 8.765531029484475e-07, "loss": 0.3008, "step": 4913 }, { "epoch": 4.138686131386861, "grad_norm": 0.3310632109642029, "learning_rate": 8.748904194769093e-07, "loss": 0.3785, "step": 4914 }, { "epoch": 4.139528354856822, "grad_norm": 0.3199516832828522, "learning_rate": 8.73229163222557e-07, "loss": 0.3607, "step": 4915 }, { "epoch": 4.140370578326783, "grad_norm": 0.3144189715385437, "learning_rate": 8.715693347601584e-07, "loss": 0.3205, "step": 4916 }, { "epoch": 4.141212801796743, "grad_norm": 0.3150256872177124, "learning_rate": 8.699109346639845e-07, "loss": 0.3572, "step": 4917 }, { "epoch": 4.142055025266704, "grad_norm": 0.28272107243537903, "learning_rate": 8.682539635078114e-07, "loss": 0.3253, "step": 4918 }, { "epoch": 4.142897248736665, "grad_norm": 0.30826181173324585, "learning_rate": 8.665984218649243e-07, "loss": 0.3397, "step": 4919 }, { "epoch": 4.143739472206626, "grad_norm": 0.30821722745895386, "learning_rate": 8.64944310308114e-07, "loss": 0.3003, "step": 4920 }, { "epoch": 4.144581695676586, "grad_norm": 0.33038610219955444, "learning_rate": 8.632916294096728e-07, "loss": 0.3817, "step": 4921 }, { "epoch": 4.1454239191465465, "grad_norm": 0.3423934876918793, "learning_rate": 8.616403797413969e-07, "loss": 0.3357, "step": 4922 }, { "epoch": 4.146266142616508, "grad_norm": 0.3054860830307007, "learning_rate": 8.599905618745969e-07, "loss": 0.312, "step": 4923 }, { "epoch": 4.147108366086468, "grad_norm": 0.3051191568374634, "learning_rate": 8.583421763800781e-07, "loss": 0.341, "step": 4924 }, { "epoch": 4.147950589556429, "grad_norm": 0.319261372089386, "learning_rate": 8.566952238281533e-07, "loss": 0.3799, "step": 4925 }, { "epoch": 4.14879281302639, "grad_norm": 0.2900123596191406, "learning_rate": 8.55049704788638e-07, "loss": 0.3081, "step": 4926 }, { "epoch": 4.149635036496351, "grad_norm": 0.3165195882320404, "learning_rate": 8.534056198308582e-07, "loss": 0.3199, "step": 4927 }, { "epoch": 4.150477259966311, "grad_norm": 0.3265880048274994, "learning_rate": 8.517629695236373e-07, "loss": 0.3479, "step": 4928 }, { "epoch": 4.151319483436271, "grad_norm": 0.31807368993759155, "learning_rate": 8.501217544353018e-07, "loss": 0.3542, "step": 4929 }, { "epoch": 4.152161706906233, "grad_norm": 0.3044387400150299, "learning_rate": 8.484819751336859e-07, "loss": 0.2945, "step": 4930 }, { "epoch": 4.153003930376193, "grad_norm": 0.3068995475769043, "learning_rate": 8.468436321861261e-07, "loss": 0.3724, "step": 4931 }, { "epoch": 4.153846153846154, "grad_norm": 0.31577539443969727, "learning_rate": 8.452067261594599e-07, "loss": 0.3697, "step": 4932 }, { "epoch": 4.1546883773161145, "grad_norm": 0.29867568612098694, "learning_rate": 8.435712576200278e-07, "loss": 0.3233, "step": 4933 }, { "epoch": 4.155530600786075, "grad_norm": 0.31112736463546753, "learning_rate": 8.419372271336746e-07, "loss": 0.3479, "step": 4934 }, { "epoch": 4.156372824256036, "grad_norm": 0.3277389109134674, "learning_rate": 8.403046352657479e-07, "loss": 0.3609, "step": 4935 }, { "epoch": 4.157215047725996, "grad_norm": 0.2855316698551178, "learning_rate": 8.38673482581096e-07, "loss": 0.3137, "step": 4936 }, { "epoch": 4.158057271195958, "grad_norm": 0.2930185794830322, "learning_rate": 8.370437696440675e-07, "loss": 0.3467, "step": 4937 }, { "epoch": 4.158899494665918, "grad_norm": 0.3082510530948639, "learning_rate": 8.354154970185174e-07, "loss": 0.3436, "step": 4938 }, { "epoch": 4.159741718135879, "grad_norm": 0.2962001860141754, "learning_rate": 8.337886652678013e-07, "loss": 0.3285, "step": 4939 }, { "epoch": 4.160583941605839, "grad_norm": 0.31687992811203003, "learning_rate": 8.321632749547726e-07, "loss": 0.3577, "step": 4940 }, { "epoch": 4.1614261650758, "grad_norm": 0.29812124371528625, "learning_rate": 8.305393266417888e-07, "loss": 0.3352, "step": 4941 }, { "epoch": 4.162268388545761, "grad_norm": 0.31153780221939087, "learning_rate": 8.289168208907083e-07, "loss": 0.3207, "step": 4942 }, { "epoch": 4.163110612015721, "grad_norm": 0.32465091347694397, "learning_rate": 8.272957582628921e-07, "loss": 0.3748, "step": 4943 }, { "epoch": 4.1639528354856825, "grad_norm": 0.33032727241516113, "learning_rate": 8.256761393191987e-07, "loss": 0.3292, "step": 4944 }, { "epoch": 4.164795058955643, "grad_norm": 0.30177041888237, "learning_rate": 8.240579646199859e-07, "loss": 0.355, "step": 4945 }, { "epoch": 4.165637282425603, "grad_norm": 0.3104805648326874, "learning_rate": 8.224412347251193e-07, "loss": 0.3675, "step": 4946 }, { "epoch": 4.166479505895564, "grad_norm": 0.30979159474372864, "learning_rate": 8.208259501939575e-07, "loss": 0.3114, "step": 4947 }, { "epoch": 4.167321729365525, "grad_norm": 0.3319506049156189, "learning_rate": 8.192121115853601e-07, "loss": 0.368, "step": 4948 }, { "epoch": 4.168163952835486, "grad_norm": 0.2933095395565033, "learning_rate": 8.175997194576885e-07, "loss": 0.3224, "step": 4949 }, { "epoch": 4.169006176305446, "grad_norm": 0.3215898275375366, "learning_rate": 8.159887743688039e-07, "loss": 0.3937, "step": 4950 }, { "epoch": 4.169848399775407, "grad_norm": 0.2931530177593231, "learning_rate": 8.14379276876065e-07, "loss": 0.3272, "step": 4951 }, { "epoch": 4.170690623245368, "grad_norm": 0.32366761565208435, "learning_rate": 8.127712275363281e-07, "loss": 0.348, "step": 4952 }, { "epoch": 4.171532846715328, "grad_norm": 0.3254413306713104, "learning_rate": 8.111646269059531e-07, "loss": 0.3722, "step": 4953 }, { "epoch": 4.172375070185289, "grad_norm": 0.30617254972457886, "learning_rate": 8.095594755407971e-07, "loss": 0.2952, "step": 4954 }, { "epoch": 4.17321729365525, "grad_norm": 0.31338226795196533, "learning_rate": 8.079557739962129e-07, "loss": 0.3258, "step": 4955 }, { "epoch": 4.174059517125211, "grad_norm": 0.3170274794101715, "learning_rate": 8.063535228270535e-07, "loss": 0.3463, "step": 4956 }, { "epoch": 4.174901740595171, "grad_norm": 0.296432226896286, "learning_rate": 8.047527225876717e-07, "loss": 0.3158, "step": 4957 }, { "epoch": 4.175743964065132, "grad_norm": 0.3095460832118988, "learning_rate": 8.031533738319175e-07, "loss": 0.3175, "step": 4958 }, { "epoch": 4.176586187535093, "grad_norm": 0.3146760165691376, "learning_rate": 8.015554771131368e-07, "loss": 0.3681, "step": 4959 }, { "epoch": 4.177428411005053, "grad_norm": 0.2957116365432739, "learning_rate": 7.999590329841738e-07, "loss": 0.3312, "step": 4960 }, { "epoch": 4.178270634475014, "grad_norm": 0.302371084690094, "learning_rate": 7.983640419973715e-07, "loss": 0.3367, "step": 4961 }, { "epoch": 4.179112857944975, "grad_norm": 0.32339611649513245, "learning_rate": 7.967705047045715e-07, "loss": 0.3607, "step": 4962 }, { "epoch": 4.179955081414936, "grad_norm": 0.31681761145591736, "learning_rate": 7.951784216571085e-07, "loss": 0.3254, "step": 4963 }, { "epoch": 4.180797304884896, "grad_norm": 0.32897162437438965, "learning_rate": 7.935877934058145e-07, "loss": 0.3356, "step": 4964 }, { "epoch": 4.181639528354856, "grad_norm": 0.3077234625816345, "learning_rate": 7.919986205010217e-07, "loss": 0.3713, "step": 4965 }, { "epoch": 4.182481751824818, "grad_norm": 0.30855870246887207, "learning_rate": 7.904109034925567e-07, "loss": 0.3492, "step": 4966 }, { "epoch": 4.183323975294778, "grad_norm": 0.31602340936660767, "learning_rate": 7.888246429297419e-07, "loss": 0.3457, "step": 4967 }, { "epoch": 4.184166198764739, "grad_norm": 0.30332860350608826, "learning_rate": 7.87239839361395e-07, "loss": 0.3726, "step": 4968 }, { "epoch": 4.1850084222346995, "grad_norm": 0.3078354299068451, "learning_rate": 7.856564933358324e-07, "loss": 0.3495, "step": 4969 }, { "epoch": 4.185850645704661, "grad_norm": 0.2935662269592285, "learning_rate": 7.840746054008653e-07, "loss": 0.3206, "step": 4970 }, { "epoch": 4.186692869174621, "grad_norm": 0.31102505326271057, "learning_rate": 7.824941761037974e-07, "loss": 0.3439, "step": 4971 }, { "epoch": 4.187535092644581, "grad_norm": 0.32220420241355896, "learning_rate": 7.809152059914327e-07, "loss": 0.3419, "step": 4972 }, { "epoch": 4.188377316114543, "grad_norm": 0.30084964632987976, "learning_rate": 7.793376956100657e-07, "loss": 0.3161, "step": 4973 }, { "epoch": 4.189219539584503, "grad_norm": 0.31022030115127563, "learning_rate": 7.777616455054899e-07, "loss": 0.3403, "step": 4974 }, { "epoch": 4.190061763054464, "grad_norm": 0.318313866853714, "learning_rate": 7.761870562229895e-07, "loss": 0.3052, "step": 4975 }, { "epoch": 4.190903986524424, "grad_norm": 0.31629401445388794, "learning_rate": 7.746139283073473e-07, "loss": 0.3663, "step": 4976 }, { "epoch": 4.191746209994385, "grad_norm": 0.3056858777999878, "learning_rate": 7.730422623028372e-07, "loss": 0.3633, "step": 4977 }, { "epoch": 4.192588433464346, "grad_norm": 0.3021262288093567, "learning_rate": 7.714720587532299e-07, "loss": 0.3217, "step": 4978 }, { "epoch": 4.193430656934306, "grad_norm": 0.30933162569999695, "learning_rate": 7.699033182017873e-07, "loss": 0.3077, "step": 4979 }, { "epoch": 4.1942728804042675, "grad_norm": 0.3357297480106354, "learning_rate": 7.683360411912688e-07, "loss": 0.3642, "step": 4980 }, { "epoch": 4.195115103874228, "grad_norm": 0.285759836435318, "learning_rate": 7.667702282639228e-07, "loss": 0.3143, "step": 4981 }, { "epoch": 4.195957327344189, "grad_norm": 0.3341926336288452, "learning_rate": 7.65205879961497e-07, "loss": 0.3296, "step": 4982 }, { "epoch": 4.196799550814149, "grad_norm": 0.3307144343852997, "learning_rate": 7.636429968252257e-07, "loss": 0.3397, "step": 4983 }, { "epoch": 4.19764177428411, "grad_norm": 0.2945469915866852, "learning_rate": 7.62081579395842e-07, "loss": 0.3453, "step": 4984 }, { "epoch": 4.198483997754071, "grad_norm": 0.3291681408882141, "learning_rate": 7.60521628213568e-07, "loss": 0.374, "step": 4985 }, { "epoch": 4.199326221224031, "grad_norm": 0.3340815305709839, "learning_rate": 7.589631438181222e-07, "loss": 0.3403, "step": 4986 }, { "epoch": 4.200168444693992, "grad_norm": 0.3119966387748718, "learning_rate": 7.57406126748711e-07, "loss": 0.3591, "step": 4987 }, { "epoch": 4.201010668163953, "grad_norm": 0.30597418546676636, "learning_rate": 7.558505775440367e-07, "loss": 0.3432, "step": 4988 }, { "epoch": 4.201852891633914, "grad_norm": 0.29927706718444824, "learning_rate": 7.542964967422933e-07, "loss": 0.3245, "step": 4989 }, { "epoch": 4.202695115103874, "grad_norm": 0.313686341047287, "learning_rate": 7.527438848811653e-07, "loss": 0.3638, "step": 4990 }, { "epoch": 4.203537338573835, "grad_norm": 0.3620133697986603, "learning_rate": 7.511927424978305e-07, "loss": 0.3291, "step": 4991 }, { "epoch": 4.204379562043796, "grad_norm": 0.3163236081600189, "learning_rate": 7.496430701289558e-07, "loss": 0.3095, "step": 4992 }, { "epoch": 4.205221785513756, "grad_norm": 0.3064352571964264, "learning_rate": 7.480948683107042e-07, "loss": 0.3206, "step": 4993 }, { "epoch": 4.206064008983717, "grad_norm": 0.3307116627693176, "learning_rate": 7.465481375787242e-07, "loss": 0.3514, "step": 4994 }, { "epoch": 4.206906232453678, "grad_norm": 0.30652308464050293, "learning_rate": 7.450028784681601e-07, "loss": 0.3304, "step": 4995 }, { "epoch": 4.207748455923638, "grad_norm": 0.3266957998275757, "learning_rate": 7.43459091513643e-07, "loss": 0.372, "step": 4996 }, { "epoch": 4.208590679393599, "grad_norm": 0.30559903383255005, "learning_rate": 7.419167772492991e-07, "loss": 0.3349, "step": 4997 }, { "epoch": 4.2094329028635595, "grad_norm": 0.31911036372184753, "learning_rate": 7.403759362087404e-07, "loss": 0.3514, "step": 4998 }, { "epoch": 4.210275126333521, "grad_norm": 0.29893842339515686, "learning_rate": 7.388365689250737e-07, "loss": 0.3314, "step": 4999 }, { "epoch": 4.211117349803481, "grad_norm": 0.29777297377586365, "learning_rate": 7.372986759308914e-07, "loss": 0.3473, "step": 5000 }, { "epoch": 4.211959573273442, "grad_norm": 0.29028141498565674, "learning_rate": 7.357622577582801e-07, "loss": 0.3114, "step": 5001 }, { "epoch": 4.212801796743403, "grad_norm": 0.29833364486694336, "learning_rate": 7.34227314938813e-07, "loss": 0.365, "step": 5002 }, { "epoch": 4.213644020213363, "grad_norm": 0.29235848784446716, "learning_rate": 7.326938480035545e-07, "loss": 0.346, "step": 5003 }, { "epoch": 4.214486243683324, "grad_norm": 0.3107306957244873, "learning_rate": 7.31161857483057e-07, "loss": 0.3532, "step": 5004 }, { "epoch": 4.2153284671532845, "grad_norm": 0.30233556032180786, "learning_rate": 7.296313439073649e-07, "loss": 0.3393, "step": 5005 }, { "epoch": 4.216170690623246, "grad_norm": 0.3066336512565613, "learning_rate": 7.281023078060073e-07, "loss": 0.3143, "step": 5006 }, { "epoch": 4.217012914093206, "grad_norm": 0.3131508231163025, "learning_rate": 7.265747497080061e-07, "loss": 0.343, "step": 5007 }, { "epoch": 4.217855137563166, "grad_norm": 0.3219795823097229, "learning_rate": 7.250486701418691e-07, "loss": 0.3453, "step": 5008 }, { "epoch": 4.2186973610331275, "grad_norm": 0.290932297706604, "learning_rate": 7.235240696355944e-07, "loss": 0.3208, "step": 5009 }, { "epoch": 4.219539584503088, "grad_norm": 0.33101263642311096, "learning_rate": 7.220009487166679e-07, "loss": 0.3373, "step": 5010 }, { "epoch": 4.220381807973049, "grad_norm": 0.30183884501457214, "learning_rate": 7.204793079120636e-07, "loss": 0.3244, "step": 5011 }, { "epoch": 4.221224031443009, "grad_norm": 0.32207953929901123, "learning_rate": 7.18959147748241e-07, "loss": 0.3541, "step": 5012 }, { "epoch": 4.222066254912971, "grad_norm": 0.3047855496406555, "learning_rate": 7.174404687511505e-07, "loss": 0.3366, "step": 5013 }, { "epoch": 4.222908478382931, "grad_norm": 0.333682656288147, "learning_rate": 7.159232714462311e-07, "loss": 0.3576, "step": 5014 }, { "epoch": 4.223750701852891, "grad_norm": 0.314919650554657, "learning_rate": 7.144075563584052e-07, "loss": 0.2984, "step": 5015 }, { "epoch": 4.2245929253228525, "grad_norm": 0.32651033997535706, "learning_rate": 7.128933240120827e-07, "loss": 0.3673, "step": 5016 }, { "epoch": 4.225435148792813, "grad_norm": 0.3143424689769745, "learning_rate": 7.113805749311642e-07, "loss": 0.3203, "step": 5017 }, { "epoch": 4.226277372262774, "grad_norm": 0.29164743423461914, "learning_rate": 7.098693096390358e-07, "loss": 0.3467, "step": 5018 }, { "epoch": 4.227119595732734, "grad_norm": 0.31956595182418823, "learning_rate": 7.083595286585676e-07, "loss": 0.346, "step": 5019 }, { "epoch": 4.2279618192026955, "grad_norm": 0.32236552238464355, "learning_rate": 7.068512325121168e-07, "loss": 0.3226, "step": 5020 }, { "epoch": 4.228804042672656, "grad_norm": 0.32371652126312256, "learning_rate": 7.053444217215305e-07, "loss": 0.3139, "step": 5021 }, { "epoch": 4.229646266142616, "grad_norm": 0.3100232779979706, "learning_rate": 7.038390968081388e-07, "loss": 0.3339, "step": 5022 }, { "epoch": 4.230488489612577, "grad_norm": 0.31486040353775024, "learning_rate": 7.023352582927584e-07, "loss": 0.377, "step": 5023 }, { "epoch": 4.231330713082538, "grad_norm": 0.3058784604072571, "learning_rate": 7.008329066956898e-07, "loss": 0.3305, "step": 5024 }, { "epoch": 4.232172936552499, "grad_norm": 0.28409454226493835, "learning_rate": 6.993320425367222e-07, "loss": 0.3336, "step": 5025 }, { "epoch": 4.233015160022459, "grad_norm": 0.2991652190685272, "learning_rate": 6.978326663351303e-07, "loss": 0.3481, "step": 5026 }, { "epoch": 4.23385738349242, "grad_norm": 0.31203001737594604, "learning_rate": 6.963347786096714e-07, "loss": 0.334, "step": 5027 }, { "epoch": 4.234699606962381, "grad_norm": 0.3319263160228729, "learning_rate": 6.948383798785873e-07, "loss": 0.3549, "step": 5028 }, { "epoch": 4.235541830432341, "grad_norm": 0.3014014959335327, "learning_rate": 6.933434706596076e-07, "loss": 0.3243, "step": 5029 }, { "epoch": 4.236384053902302, "grad_norm": 0.303652286529541, "learning_rate": 6.918500514699472e-07, "loss": 0.3891, "step": 5030 }, { "epoch": 4.237226277372263, "grad_norm": 0.31965532898902893, "learning_rate": 6.903581228263012e-07, "loss": 0.3547, "step": 5031 }, { "epoch": 4.238068500842224, "grad_norm": 0.3107907474040985, "learning_rate": 6.888676852448501e-07, "loss": 0.3396, "step": 5032 }, { "epoch": 4.238910724312184, "grad_norm": 0.32413437962532043, "learning_rate": 6.873787392412634e-07, "loss": 0.3471, "step": 5033 }, { "epoch": 4.2397529477821445, "grad_norm": 0.314765065908432, "learning_rate": 6.858912853306893e-07, "loss": 0.3158, "step": 5034 }, { "epoch": 4.240595171252106, "grad_norm": 0.30767402052879333, "learning_rate": 6.844053240277598e-07, "loss": 0.3478, "step": 5035 }, { "epoch": 4.241437394722066, "grad_norm": 0.3073284327983856, "learning_rate": 6.829208558465939e-07, "loss": 0.3097, "step": 5036 }, { "epoch": 4.242279618192027, "grad_norm": 0.31700655817985535, "learning_rate": 6.814378813007921e-07, "loss": 0.3676, "step": 5037 }, { "epoch": 4.243121841661988, "grad_norm": 0.3190152943134308, "learning_rate": 6.799564009034376e-07, "loss": 0.3523, "step": 5038 }, { "epoch": 4.243964065131948, "grad_norm": 0.31908679008483887, "learning_rate": 6.784764151670964e-07, "loss": 0.3265, "step": 5039 }, { "epoch": 4.244806288601909, "grad_norm": 0.3009735941886902, "learning_rate": 6.769979246038189e-07, "loss": 0.345, "step": 5040 }, { "epoch": 4.245648512071869, "grad_norm": 0.31360670924186707, "learning_rate": 6.755209297251386e-07, "loss": 0.3925, "step": 5041 }, { "epoch": 4.246490735541831, "grad_norm": 0.31901028752326965, "learning_rate": 6.740454310420697e-07, "loss": 0.3646, "step": 5042 }, { "epoch": 4.247332959011791, "grad_norm": 0.2940385043621063, "learning_rate": 6.725714290651081e-07, "loss": 0.3217, "step": 5043 }, { "epoch": 4.248175182481752, "grad_norm": 0.29774829745292664, "learning_rate": 6.710989243042337e-07, "loss": 0.3562, "step": 5044 }, { "epoch": 4.2490174059517125, "grad_norm": 0.3184093236923218, "learning_rate": 6.696279172689102e-07, "loss": 0.339, "step": 5045 }, { "epoch": 4.249859629421673, "grad_norm": 0.33004727959632874, "learning_rate": 6.681584084680787e-07, "loss": 0.3372, "step": 5046 }, { "epoch": 4.250701852891634, "grad_norm": 0.31911659240722656, "learning_rate": 6.666903984101635e-07, "loss": 0.3202, "step": 5047 }, { "epoch": 4.251544076361594, "grad_norm": 0.31983041763305664, "learning_rate": 6.65223887603072e-07, "loss": 0.3358, "step": 5048 }, { "epoch": 4.252386299831556, "grad_norm": 0.3103150427341461, "learning_rate": 6.637588765541925e-07, "loss": 0.3444, "step": 5049 }, { "epoch": 4.253228523301516, "grad_norm": 0.31225088238716125, "learning_rate": 6.622953657703934e-07, "loss": 0.3577, "step": 5050 }, { "epoch": 4.254070746771477, "grad_norm": 0.29581218957901, "learning_rate": 6.60833355758021e-07, "loss": 0.3306, "step": 5051 }, { "epoch": 4.254912970241437, "grad_norm": 0.32097503542900085, "learning_rate": 6.593728470229105e-07, "loss": 0.3575, "step": 5052 }, { "epoch": 4.255755193711398, "grad_norm": 0.309303879737854, "learning_rate": 6.579138400703716e-07, "loss": 0.3052, "step": 5053 }, { "epoch": 4.256597417181359, "grad_norm": 0.295785516500473, "learning_rate": 6.564563354051945e-07, "loss": 0.3269, "step": 5054 }, { "epoch": 4.257439640651319, "grad_norm": 0.31205251812934875, "learning_rate": 6.55000333531649e-07, "loss": 0.3689, "step": 5055 }, { "epoch": 4.2582818641212805, "grad_norm": 0.27422162890434265, "learning_rate": 6.535458349534912e-07, "loss": 0.2849, "step": 5056 }, { "epoch": 4.259124087591241, "grad_norm": 0.3366714417934418, "learning_rate": 6.520928401739506e-07, "loss": 0.3903, "step": 5057 }, { "epoch": 4.259966311061201, "grad_norm": 0.3066287934780121, "learning_rate": 6.506413496957375e-07, "loss": 0.3383, "step": 5058 }, { "epoch": 4.260808534531162, "grad_norm": 0.3051040768623352, "learning_rate": 6.49191364021044e-07, "loss": 0.3382, "step": 5059 }, { "epoch": 4.261650758001123, "grad_norm": 0.30920296907424927, "learning_rate": 6.477428836515404e-07, "loss": 0.3445, "step": 5060 }, { "epoch": 4.262492981471084, "grad_norm": 0.3122831881046295, "learning_rate": 6.462959090883758e-07, "loss": 0.3355, "step": 5061 }, { "epoch": 4.263335204941044, "grad_norm": 0.3185219168663025, "learning_rate": 6.448504408321771e-07, "loss": 0.355, "step": 5062 }, { "epoch": 4.264177428411005, "grad_norm": 0.28588464856147766, "learning_rate": 6.43406479383053e-07, "loss": 0.3084, "step": 5063 }, { "epoch": 4.265019651880966, "grad_norm": 0.317183256149292, "learning_rate": 6.419640252405901e-07, "loss": 0.3559, "step": 5064 }, { "epoch": 4.265861875350926, "grad_norm": 0.31796106696128845, "learning_rate": 6.405230789038519e-07, "loss": 0.325, "step": 5065 }, { "epoch": 4.266704098820887, "grad_norm": 0.3329648971557617, "learning_rate": 6.390836408713802e-07, "loss": 0.3591, "step": 5066 }, { "epoch": 4.267546322290848, "grad_norm": 0.29436254501342773, "learning_rate": 6.376457116411971e-07, "loss": 0.3213, "step": 5067 }, { "epoch": 4.268388545760809, "grad_norm": 0.3042960464954376, "learning_rate": 6.362092917108021e-07, "loss": 0.3477, "step": 5068 }, { "epoch": 4.269230769230769, "grad_norm": 0.30826207995414734, "learning_rate": 6.347743815771707e-07, "loss": 0.3224, "step": 5069 }, { "epoch": 4.2700729927007295, "grad_norm": 0.31548044085502625, "learning_rate": 6.333409817367564e-07, "loss": 0.3458, "step": 5070 }, { "epoch": 4.270915216170691, "grad_norm": 0.3035666048526764, "learning_rate": 6.319090926854921e-07, "loss": 0.3756, "step": 5071 }, { "epoch": 4.271757439640651, "grad_norm": 0.2933618426322937, "learning_rate": 6.304787149187874e-07, "loss": 0.3204, "step": 5072 }, { "epoch": 4.272599663110612, "grad_norm": 0.3370901048183441, "learning_rate": 6.290498489315283e-07, "loss": 0.3476, "step": 5073 }, { "epoch": 4.273441886580573, "grad_norm": 0.2997503876686096, "learning_rate": 6.276224952180765e-07, "loss": 0.3109, "step": 5074 }, { "epoch": 4.274284110050534, "grad_norm": 0.3139159679412842, "learning_rate": 6.261966542722731e-07, "loss": 0.3401, "step": 5075 }, { "epoch": 4.275126333520494, "grad_norm": 0.30729615688323975, "learning_rate": 6.247723265874351e-07, "loss": 0.3192, "step": 5076 }, { "epoch": 4.275968556990454, "grad_norm": 0.3152543306350708, "learning_rate": 6.233495126563538e-07, "loss": 0.3457, "step": 5077 }, { "epoch": 4.276810780460416, "grad_norm": 0.3472512662410736, "learning_rate": 6.219282129713006e-07, "loss": 0.3437, "step": 5078 }, { "epoch": 4.277653003930376, "grad_norm": 0.31246304512023926, "learning_rate": 6.205084280240182e-07, "loss": 0.3163, "step": 5079 }, { "epoch": 4.278495227400337, "grad_norm": 0.3177039623260498, "learning_rate": 6.190901583057307e-07, "loss": 0.3451, "step": 5080 }, { "epoch": 4.2793374508702975, "grad_norm": 0.29623740911483765, "learning_rate": 6.176734043071319e-07, "loss": 0.3463, "step": 5081 }, { "epoch": 4.280179674340259, "grad_norm": 0.3014947474002838, "learning_rate": 6.162581665183981e-07, "loss": 0.2993, "step": 5082 }, { "epoch": 4.281021897810219, "grad_norm": 0.2933516502380371, "learning_rate": 6.148444454291741e-07, "loss": 0.3075, "step": 5083 }, { "epoch": 4.281864121280179, "grad_norm": 0.3130274713039398, "learning_rate": 6.134322415285854e-07, "loss": 0.3616, "step": 5084 }, { "epoch": 4.282706344750141, "grad_norm": 0.312862366437912, "learning_rate": 6.120215553052283e-07, "loss": 0.3482, "step": 5085 }, { "epoch": 4.283548568220101, "grad_norm": 0.2861306369304657, "learning_rate": 6.10612387247178e-07, "loss": 0.3382, "step": 5086 }, { "epoch": 4.284390791690062, "grad_norm": 0.3017825782299042, "learning_rate": 6.092047378419808e-07, "loss": 0.3666, "step": 5087 }, { "epoch": 4.285233015160022, "grad_norm": 0.299189031124115, "learning_rate": 6.077986075766613e-07, "loss": 0.3503, "step": 5088 }, { "epoch": 4.286075238629983, "grad_norm": 0.2954254150390625, "learning_rate": 6.063939969377136e-07, "loss": 0.3502, "step": 5089 }, { "epoch": 4.286917462099944, "grad_norm": 0.28878775238990784, "learning_rate": 6.049909064111109e-07, "loss": 0.3319, "step": 5090 }, { "epoch": 4.287759685569904, "grad_norm": 0.3057447671890259, "learning_rate": 6.035893364822992e-07, "loss": 0.3219, "step": 5091 }, { "epoch": 4.2886019090398655, "grad_norm": 0.3151351511478424, "learning_rate": 6.021892876361962e-07, "loss": 0.3365, "step": 5092 }, { "epoch": 4.289444132509826, "grad_norm": 0.3008064925670624, "learning_rate": 6.007907603571944e-07, "loss": 0.3521, "step": 5093 }, { "epoch": 4.290286355979787, "grad_norm": 0.3010386824607849, "learning_rate": 5.993937551291606e-07, "loss": 0.3337, "step": 5094 }, { "epoch": 4.291128579449747, "grad_norm": 0.3285367786884308, "learning_rate": 5.979982724354366e-07, "loss": 0.3395, "step": 5095 }, { "epoch": 4.291970802919708, "grad_norm": 0.30890214443206787, "learning_rate": 5.966043127588328e-07, "loss": 0.3131, "step": 5096 }, { "epoch": 4.292813026389669, "grad_norm": 0.32889679074287415, "learning_rate": 5.952118765816372e-07, "loss": 0.3493, "step": 5097 }, { "epoch": 4.293655249859629, "grad_norm": 0.2963213622570038, "learning_rate": 5.938209643856075e-07, "loss": 0.3041, "step": 5098 }, { "epoch": 4.29449747332959, "grad_norm": 0.317220002412796, "learning_rate": 5.924315766519773e-07, "loss": 0.3448, "step": 5099 }, { "epoch": 4.295339696799551, "grad_norm": 0.3046329617500305, "learning_rate": 5.910437138614483e-07, "loss": 0.3216, "step": 5100 }, { "epoch": 4.296181920269511, "grad_norm": 0.3105696141719818, "learning_rate": 5.896573764941999e-07, "loss": 0.3325, "step": 5101 }, { "epoch": 4.297024143739472, "grad_norm": 0.3275417387485504, "learning_rate": 5.882725650298787e-07, "loss": 0.3502, "step": 5102 }, { "epoch": 4.297866367209433, "grad_norm": 0.31570377945899963, "learning_rate": 5.868892799476079e-07, "loss": 0.3362, "step": 5103 }, { "epoch": 4.298708590679394, "grad_norm": 0.30592209100723267, "learning_rate": 5.855075217259781e-07, "loss": 0.3142, "step": 5104 }, { "epoch": 4.299550814149354, "grad_norm": 0.3118080496788025, "learning_rate": 5.841272908430567e-07, "loss": 0.3483, "step": 5105 }, { "epoch": 4.300393037619315, "grad_norm": 0.318092405796051, "learning_rate": 5.827485877763772e-07, "loss": 0.3622, "step": 5106 }, { "epoch": 4.301235261089276, "grad_norm": 0.30980759859085083, "learning_rate": 5.813714130029496e-07, "loss": 0.3168, "step": 5107 }, { "epoch": 4.302077484559236, "grad_norm": 0.3125462532043457, "learning_rate": 5.799957669992501e-07, "loss": 0.3337, "step": 5108 }, { "epoch": 4.302919708029197, "grad_norm": 0.2955458462238312, "learning_rate": 5.786216502412317e-07, "loss": 0.3437, "step": 5109 }, { "epoch": 4.3037619314991575, "grad_norm": 0.3087269365787506, "learning_rate": 5.772490632043127e-07, "loss": 0.3433, "step": 5110 }, { "epoch": 4.304604154969119, "grad_norm": 0.3062915802001953, "learning_rate": 5.758780063633868e-07, "loss": 0.3273, "step": 5111 }, { "epoch": 4.305446378439079, "grad_norm": 0.3164283335208893, "learning_rate": 5.745084801928136e-07, "loss": 0.356, "step": 5112 }, { "epoch": 4.30628860190904, "grad_norm": 0.3205604553222656, "learning_rate": 5.731404851664285e-07, "loss": 0.3354, "step": 5113 }, { "epoch": 4.307130825379001, "grad_norm": 0.3506121337413788, "learning_rate": 5.717740217575313e-07, "loss": 0.3929, "step": 5114 }, { "epoch": 4.307973048848961, "grad_norm": 0.2979818880558014, "learning_rate": 5.70409090438897e-07, "loss": 0.3385, "step": 5115 }, { "epoch": 4.308815272318922, "grad_norm": 0.3079604506492615, "learning_rate": 5.690456916827691e-07, "loss": 0.3219, "step": 5116 }, { "epoch": 4.3096574957888825, "grad_norm": 0.30006837844848633, "learning_rate": 5.676838259608591e-07, "loss": 0.3182, "step": 5117 }, { "epoch": 4.310499719258844, "grad_norm": 0.2947372794151306, "learning_rate": 5.66323493744348e-07, "loss": 0.3473, "step": 5118 }, { "epoch": 4.311341942728804, "grad_norm": 0.305723637342453, "learning_rate": 5.649646955038884e-07, "loss": 0.3261, "step": 5119 }, { "epoch": 4.312184166198764, "grad_norm": 0.33107587695121765, "learning_rate": 5.636074317096024e-07, "loss": 0.3244, "step": 5120 }, { "epoch": 4.3130263896687255, "grad_norm": 0.3430156707763672, "learning_rate": 5.622517028310792e-07, "loss": 0.402, "step": 5121 }, { "epoch": 4.313868613138686, "grad_norm": 0.3043144643306732, "learning_rate": 5.608975093373764e-07, "loss": 0.3227, "step": 5122 }, { "epoch": 4.314710836608647, "grad_norm": 0.3312627077102661, "learning_rate": 5.595448516970231e-07, "loss": 0.347, "step": 5123 }, { "epoch": 4.315553060078607, "grad_norm": 0.3083966374397278, "learning_rate": 5.58193730378016e-07, "loss": 0.3513, "step": 5124 }, { "epoch": 4.316395283548569, "grad_norm": 0.31208547949790955, "learning_rate": 5.568441458478197e-07, "loss": 0.3563, "step": 5125 }, { "epoch": 4.317237507018529, "grad_norm": 0.3119768798351288, "learning_rate": 5.554960985733654e-07, "loss": 0.34, "step": 5126 }, { "epoch": 4.318079730488489, "grad_norm": 0.3033098578453064, "learning_rate": 5.541495890210563e-07, "loss": 0.3384, "step": 5127 }, { "epoch": 4.3189219539584505, "grad_norm": 0.29800572991371155, "learning_rate": 5.52804617656762e-07, "loss": 0.3039, "step": 5128 }, { "epoch": 4.319764177428411, "grad_norm": 0.3438248634338379, "learning_rate": 5.514611849458191e-07, "loss": 0.3314, "step": 5129 }, { "epoch": 4.320606400898372, "grad_norm": 0.3373839259147644, "learning_rate": 5.501192913530301e-07, "loss": 0.3528, "step": 5130 }, { "epoch": 4.321448624368332, "grad_norm": 0.33435794711112976, "learning_rate": 5.487789373426694e-07, "loss": 0.3527, "step": 5131 }, { "epoch": 4.322290847838293, "grad_norm": 0.304360568523407, "learning_rate": 5.474401233784771e-07, "loss": 0.3374, "step": 5132 }, { "epoch": 4.323133071308254, "grad_norm": 0.3066233992576599, "learning_rate": 5.461028499236593e-07, "loss": 0.3525, "step": 5133 }, { "epoch": 4.323975294778214, "grad_norm": 0.29657965898513794, "learning_rate": 5.447671174408875e-07, "loss": 0.3382, "step": 5134 }, { "epoch": 4.324817518248175, "grad_norm": 0.3078662157058716, "learning_rate": 5.434329263923043e-07, "loss": 0.3546, "step": 5135 }, { "epoch": 4.325659741718136, "grad_norm": 0.29383087158203125, "learning_rate": 5.421002772395179e-07, "loss": 0.305, "step": 5136 }, { "epoch": 4.326501965188097, "grad_norm": 0.3400653600692749, "learning_rate": 5.407691704435991e-07, "loss": 0.3655, "step": 5137 }, { "epoch": 4.327344188658057, "grad_norm": 0.3215368986129761, "learning_rate": 5.394396064650903e-07, "loss": 0.3125, "step": 5138 }, { "epoch": 4.328186412128018, "grad_norm": 0.32114356756210327, "learning_rate": 5.38111585763998e-07, "loss": 0.3367, "step": 5139 }, { "epoch": 4.329028635597979, "grad_norm": 0.3363726735115051, "learning_rate": 5.367851087997939e-07, "loss": 0.3847, "step": 5140 }, { "epoch": 4.329870859067939, "grad_norm": 0.3217226564884186, "learning_rate": 5.354601760314148e-07, "loss": 0.3347, "step": 5141 }, { "epoch": 4.3307130825379, "grad_norm": 0.30846622586250305, "learning_rate": 5.341367879172665e-07, "loss": 0.3532, "step": 5142 }, { "epoch": 4.331555306007861, "grad_norm": 0.3296859860420227, "learning_rate": 5.328149449152192e-07, "loss": 0.3408, "step": 5143 }, { "epoch": 4.332397529477822, "grad_norm": 0.3105439841747284, "learning_rate": 5.314946474826066e-07, "loss": 0.3397, "step": 5144 }, { "epoch": 4.333239752947782, "grad_norm": 0.30605897307395935, "learning_rate": 5.301758960762288e-07, "loss": 0.3513, "step": 5145 }, { "epoch": 4.3340819764177425, "grad_norm": 0.3059791922569275, "learning_rate": 5.288586911523514e-07, "loss": 0.3157, "step": 5146 }, { "epoch": 4.334924199887704, "grad_norm": 0.29087769985198975, "learning_rate": 5.275430331667064e-07, "loss": 0.3206, "step": 5147 }, { "epoch": 4.335766423357664, "grad_norm": 0.3152013123035431, "learning_rate": 5.262289225744871e-07, "loss": 0.3542, "step": 5148 }, { "epoch": 4.336608646827625, "grad_norm": 0.31706467270851135, "learning_rate": 5.249163598303525e-07, "loss": 0.3635, "step": 5149 }, { "epoch": 4.337450870297586, "grad_norm": 0.30148279666900635, "learning_rate": 5.236053453884282e-07, "loss": 0.3196, "step": 5150 }, { "epoch": 4.338293093767547, "grad_norm": 0.3217896521091461, "learning_rate": 5.222958797023036e-07, "loss": 0.3499, "step": 5151 }, { "epoch": 4.339135317237507, "grad_norm": 0.31320255994796753, "learning_rate": 5.209879632250303e-07, "loss": 0.3549, "step": 5152 }, { "epoch": 4.339977540707467, "grad_norm": 0.30328553915023804, "learning_rate": 5.196815964091239e-07, "loss": 0.3305, "step": 5153 }, { "epoch": 4.340819764177429, "grad_norm": 0.3207125961780548, "learning_rate": 5.183767797065659e-07, "loss": 0.352, "step": 5154 }, { "epoch": 4.341661987647389, "grad_norm": 0.29066357016563416, "learning_rate": 5.17073513568801e-07, "loss": 0.3121, "step": 5155 }, { "epoch": 4.34250421111735, "grad_norm": 0.33669766783714294, "learning_rate": 5.15771798446737e-07, "loss": 0.3423, "step": 5156 }, { "epoch": 4.3433464345873105, "grad_norm": 0.30317220091819763, "learning_rate": 5.144716347907424e-07, "loss": 0.3257, "step": 5157 }, { "epoch": 4.344188658057271, "grad_norm": 0.29082557559013367, "learning_rate": 5.13173023050656e-07, "loss": 0.3258, "step": 5158 }, { "epoch": 4.345030881527232, "grad_norm": 0.30219411849975586, "learning_rate": 5.11875963675772e-07, "loss": 0.3537, "step": 5159 }, { "epoch": 4.345873104997192, "grad_norm": 0.29965928196907043, "learning_rate": 5.10580457114852e-07, "loss": 0.326, "step": 5160 }, { "epoch": 4.346715328467154, "grad_norm": 0.3122439980506897, "learning_rate": 5.092865038161171e-07, "loss": 0.3571, "step": 5161 }, { "epoch": 4.347557551937114, "grad_norm": 0.30716419219970703, "learning_rate": 5.079941042272557e-07, "loss": 0.3305, "step": 5162 }, { "epoch": 4.348399775407074, "grad_norm": 0.3133905529975891, "learning_rate": 5.067032587954146e-07, "loss": 0.3378, "step": 5163 }, { "epoch": 4.349241998877035, "grad_norm": 0.3438853621482849, "learning_rate": 5.054139679672037e-07, "loss": 0.3717, "step": 5164 }, { "epoch": 4.350084222346996, "grad_norm": 0.30291327834129333, "learning_rate": 5.041262321886958e-07, "loss": 0.2902, "step": 5165 }, { "epoch": 4.350926445816957, "grad_norm": 0.32928770780563354, "learning_rate": 5.028400519054267e-07, "loss": 0.3561, "step": 5166 }, { "epoch": 4.351768669286917, "grad_norm": 0.30723974108695984, "learning_rate": 5.015554275623918e-07, "loss": 0.3373, "step": 5167 }, { "epoch": 4.3526108927568785, "grad_norm": 0.31297820806503296, "learning_rate": 5.002723596040477e-07, "loss": 0.3307, "step": 5168 }, { "epoch": 4.353453116226839, "grad_norm": 0.30674538016319275, "learning_rate": 4.989908484743155e-07, "loss": 0.3841, "step": 5169 }, { "epoch": 4.354295339696799, "grad_norm": 0.28893524408340454, "learning_rate": 4.977108946165765e-07, "loss": 0.3266, "step": 5170 }, { "epoch": 4.35513756316676, "grad_norm": 0.30467382073402405, "learning_rate": 4.964324984736723e-07, "loss": 0.3494, "step": 5171 }, { "epoch": 4.355979786636721, "grad_norm": 0.2950119078159332, "learning_rate": 4.951556604879049e-07, "loss": 0.3294, "step": 5172 }, { "epoch": 4.356822010106682, "grad_norm": 0.3194197416305542, "learning_rate": 4.93880381101039e-07, "loss": 0.3543, "step": 5173 }, { "epoch": 4.357664233576642, "grad_norm": 0.3328314423561096, "learning_rate": 4.926066607543006e-07, "loss": 0.3417, "step": 5174 }, { "epoch": 4.358506457046603, "grad_norm": 0.3321813941001892, "learning_rate": 4.91334499888374e-07, "loss": 0.3419, "step": 5175 }, { "epoch": 4.359348680516564, "grad_norm": 0.30503928661346436, "learning_rate": 4.900638989434042e-07, "loss": 0.3028, "step": 5176 }, { "epoch": 4.360190903986524, "grad_norm": 0.332373708486557, "learning_rate": 4.887948583589986e-07, "loss": 0.3366, "step": 5177 }, { "epoch": 4.361033127456485, "grad_norm": 0.2988923490047455, "learning_rate": 4.875273785742241e-07, "loss": 0.3372, "step": 5178 }, { "epoch": 4.361875350926446, "grad_norm": 0.2968159317970276, "learning_rate": 4.862614600276061e-07, "loss": 0.3471, "step": 5179 }, { "epoch": 4.362717574396407, "grad_norm": 0.3112739026546478, "learning_rate": 4.849971031571299e-07, "loss": 0.3253, "step": 5180 }, { "epoch": 4.363559797866367, "grad_norm": 0.31047412753105164, "learning_rate": 4.837343084002422e-07, "loss": 0.3621, "step": 5181 }, { "epoch": 4.364402021336328, "grad_norm": 0.32333579659461975, "learning_rate": 4.82473076193849e-07, "loss": 0.3527, "step": 5182 }, { "epoch": 4.365244244806289, "grad_norm": 0.30966272950172424, "learning_rate": 4.812134069743135e-07, "loss": 0.3513, "step": 5183 }, { "epoch": 4.366086468276249, "grad_norm": 0.2906617522239685, "learning_rate": 4.79955301177461e-07, "loss": 0.3166, "step": 5184 }, { "epoch": 4.36692869174621, "grad_norm": 0.30543068051338196, "learning_rate": 4.786987592385733e-07, "loss": 0.3396, "step": 5185 }, { "epoch": 4.367770915216171, "grad_norm": 0.301078200340271, "learning_rate": 4.774437815923937e-07, "loss": 0.3438, "step": 5186 }, { "epoch": 4.368613138686132, "grad_norm": 0.297883003950119, "learning_rate": 4.761903686731212e-07, "loss": 0.343, "step": 5187 }, { "epoch": 4.369455362156092, "grad_norm": 0.3029603958129883, "learning_rate": 4.749385209144164e-07, "loss": 0.3404, "step": 5188 }, { "epoch": 4.370297585626052, "grad_norm": 0.32393765449523926, "learning_rate": 4.736882387493985e-07, "loss": 0.3584, "step": 5189 }, { "epoch": 4.371139809096014, "grad_norm": 0.3144909739494324, "learning_rate": 4.7243952261064154e-07, "loss": 0.3567, "step": 5190 }, { "epoch": 4.371982032565974, "grad_norm": 0.29127976298332214, "learning_rate": 4.711923729301798e-07, "loss": 0.3242, "step": 5191 }, { "epoch": 4.372824256035935, "grad_norm": 0.311185359954834, "learning_rate": 4.6994679013950606e-07, "loss": 0.3267, "step": 5192 }, { "epoch": 4.3736664795058955, "grad_norm": 0.3135044574737549, "learning_rate": 4.6870277466957273e-07, "loss": 0.3329, "step": 5193 }, { "epoch": 4.374508702975856, "grad_norm": 0.29664111137390137, "learning_rate": 4.674603269507855e-07, "loss": 0.3601, "step": 5194 }, { "epoch": 4.375350926445817, "grad_norm": 0.31377965211868286, "learning_rate": 4.662194474130094e-07, "loss": 0.3614, "step": 5195 }, { "epoch": 4.376193149915777, "grad_norm": 0.31617799401283264, "learning_rate": 4.649801364855694e-07, "loss": 0.3694, "step": 5196 }, { "epoch": 4.377035373385739, "grad_norm": 0.28284960985183716, "learning_rate": 4.6374239459724526e-07, "loss": 0.3336, "step": 5197 }, { "epoch": 4.377877596855699, "grad_norm": 0.3098202049732208, "learning_rate": 4.6250622217627495e-07, "loss": 0.3649, "step": 5198 }, { "epoch": 4.37871982032566, "grad_norm": 0.3153405785560608, "learning_rate": 4.6127161965035084e-07, "loss": 0.3791, "step": 5199 }, { "epoch": 4.37956204379562, "grad_norm": 0.2976747751235962, "learning_rate": 4.6003858744662564e-07, "loss": 0.3322, "step": 5200 }, { "epoch": 4.380404267265581, "grad_norm": 0.31077444553375244, "learning_rate": 4.588071259917082e-07, "loss": 0.3608, "step": 5201 }, { "epoch": 4.381246490735542, "grad_norm": 0.31420350074768066, "learning_rate": 4.5757723571166144e-07, "loss": 0.3293, "step": 5202 }, { "epoch": 4.382088714205502, "grad_norm": 0.31779080629348755, "learning_rate": 4.563489170320079e-07, "loss": 0.352, "step": 5203 }, { "epoch": 4.3829309376754635, "grad_norm": 0.32003501057624817, "learning_rate": 4.551221703777231e-07, "loss": 0.3634, "step": 5204 }, { "epoch": 4.383773161145424, "grad_norm": 0.3126109838485718, "learning_rate": 4.5389699617324255e-07, "loss": 0.3644, "step": 5205 }, { "epoch": 4.384615384615385, "grad_norm": 0.3118753731250763, "learning_rate": 4.5267339484245274e-07, "loss": 0.3235, "step": 5206 }, { "epoch": 4.385457608085345, "grad_norm": 0.28941595554351807, "learning_rate": 4.514513668087012e-07, "loss": 0.3173, "step": 5207 }, { "epoch": 4.386299831555306, "grad_norm": 0.3082727789878845, "learning_rate": 4.502309124947868e-07, "loss": 0.3934, "step": 5208 }, { "epoch": 4.387142055025267, "grad_norm": 0.27935823798179626, "learning_rate": 4.49012032322968e-07, "loss": 0.3423, "step": 5209 }, { "epoch": 4.387984278495227, "grad_norm": 0.30645549297332764, "learning_rate": 4.477947267149535e-07, "loss": 0.3365, "step": 5210 }, { "epoch": 4.388826501965188, "grad_norm": 0.3370758295059204, "learning_rate": 4.465789960919131e-07, "loss": 0.3669, "step": 5211 }, { "epoch": 4.389668725435149, "grad_norm": 0.2920403778553009, "learning_rate": 4.4536484087446654e-07, "loss": 0.3372, "step": 5212 }, { "epoch": 4.39051094890511, "grad_norm": 0.3078870177268982, "learning_rate": 4.4415226148269264e-07, "loss": 0.3421, "step": 5213 }, { "epoch": 4.39135317237507, "grad_norm": 0.2982083857059479, "learning_rate": 4.429412583361209e-07, "loss": 0.3391, "step": 5214 }, { "epoch": 4.392195395845031, "grad_norm": 0.32195740938186646, "learning_rate": 4.417318318537395e-07, "loss": 0.3282, "step": 5215 }, { "epoch": 4.393037619314992, "grad_norm": 0.3245623707771301, "learning_rate": 4.405239824539881e-07, "loss": 0.3728, "step": 5216 }, { "epoch": 4.393879842784952, "grad_norm": 0.2990025281906128, "learning_rate": 4.39317710554763e-07, "loss": 0.3109, "step": 5217 }, { "epoch": 4.394722066254913, "grad_norm": 0.31452426314353943, "learning_rate": 4.381130165734121e-07, "loss": 0.332, "step": 5218 }, { "epoch": 4.395564289724874, "grad_norm": 0.29194915294647217, "learning_rate": 4.3690990092674025e-07, "loss": 0.3161, "step": 5219 }, { "epoch": 4.396406513194834, "grad_norm": 0.32330015301704407, "learning_rate": 4.357083640310039e-07, "loss": 0.3762, "step": 5220 }, { "epoch": 4.397248736664795, "grad_norm": 0.30773788690567017, "learning_rate": 4.3450840630191503e-07, "loss": 0.3277, "step": 5221 }, { "epoch": 4.3980909601347555, "grad_norm": 0.29910165071487427, "learning_rate": 4.333100281546376e-07, "loss": 0.2921, "step": 5222 }, { "epoch": 4.398933183604717, "grad_norm": 0.3216507136821747, "learning_rate": 4.3211323000379115e-07, "loss": 0.3707, "step": 5223 }, { "epoch": 4.399775407074677, "grad_norm": 0.30252882838249207, "learning_rate": 4.3091801226344486e-07, "loss": 0.342, "step": 5224 }, { "epoch": 4.400617630544637, "grad_norm": 0.3351913392543793, "learning_rate": 4.2972437534712586e-07, "loss": 0.3651, "step": 5225 }, { "epoch": 4.401459854014599, "grad_norm": 0.31005924940109253, "learning_rate": 4.2853231966781203e-07, "loss": 0.3289, "step": 5226 }, { "epoch": 4.402302077484559, "grad_norm": 0.3086715042591095, "learning_rate": 4.273418456379341e-07, "loss": 0.3435, "step": 5227 }, { "epoch": 4.40314430095452, "grad_norm": 0.29292818903923035, "learning_rate": 4.261529536693737e-07, "loss": 0.2965, "step": 5228 }, { "epoch": 4.4039865244244805, "grad_norm": 0.3141501843929291, "learning_rate": 4.2496564417346843e-07, "loss": 0.3492, "step": 5229 }, { "epoch": 4.404828747894442, "grad_norm": 0.3094610571861267, "learning_rate": 4.23779917561008e-07, "loss": 0.358, "step": 5230 }, { "epoch": 4.405670971364402, "grad_norm": 0.3257288336753845, "learning_rate": 4.2259577424223264e-07, "loss": 0.3339, "step": 5231 }, { "epoch": 4.406513194834362, "grad_norm": 0.30853596329689026, "learning_rate": 4.2141321462683516e-07, "loss": 0.3351, "step": 5232 }, { "epoch": 4.4073554183043235, "grad_norm": 0.2954862415790558, "learning_rate": 4.202322391239605e-07, "loss": 0.3157, "step": 5233 }, { "epoch": 4.408197641774284, "grad_norm": 0.2979133427143097, "learning_rate": 4.1905284814220793e-07, "loss": 0.3358, "step": 5234 }, { "epoch": 4.409039865244245, "grad_norm": 0.3095383644104004, "learning_rate": 4.178750420896255e-07, "loss": 0.3106, "step": 5235 }, { "epoch": 4.409882088714205, "grad_norm": 0.32535842061042786, "learning_rate": 4.1669882137371277e-07, "loss": 0.3762, "step": 5236 }, { "epoch": 4.410724312184167, "grad_norm": 0.3095881938934326, "learning_rate": 4.155241864014231e-07, "loss": 0.3095, "step": 5237 }, { "epoch": 4.411566535654127, "grad_norm": 0.3356349766254425, "learning_rate": 4.1435113757916036e-07, "loss": 0.3518, "step": 5238 }, { "epoch": 4.412408759124087, "grad_norm": 0.327846497297287, "learning_rate": 4.131796753127781e-07, "loss": 0.3391, "step": 5239 }, { "epoch": 4.4132509825940485, "grad_norm": 0.2967770993709564, "learning_rate": 4.1200980000758397e-07, "loss": 0.3264, "step": 5240 }, { "epoch": 4.414093206064009, "grad_norm": 0.30664369463920593, "learning_rate": 4.1084151206833287e-07, "loss": 0.3585, "step": 5241 }, { "epoch": 4.41493542953397, "grad_norm": 0.30320027470588684, "learning_rate": 4.0967481189923386e-07, "loss": 0.3178, "step": 5242 }, { "epoch": 4.41577765300393, "grad_norm": 0.2881230413913727, "learning_rate": 4.085096999039434e-07, "loss": 0.3398, "step": 5243 }, { "epoch": 4.4166198764738915, "grad_norm": 0.3120655417442322, "learning_rate": 4.0734617648557186e-07, "loss": 0.3714, "step": 5244 }, { "epoch": 4.417462099943852, "grad_norm": 0.30112168192863464, "learning_rate": 4.0618424204667886e-07, "loss": 0.3348, "step": 5245 }, { "epoch": 4.418304323413812, "grad_norm": 0.31265658140182495, "learning_rate": 4.050238969892728e-07, "loss": 0.3459, "step": 5246 }, { "epoch": 4.419146546883773, "grad_norm": 0.3037571310997009, "learning_rate": 4.0386514171481204e-07, "loss": 0.3808, "step": 5247 }, { "epoch": 4.419988770353734, "grad_norm": 0.3273394703865051, "learning_rate": 4.027079766242076e-07, "loss": 0.3284, "step": 5248 }, { "epoch": 4.420830993823695, "grad_norm": 0.3189765512943268, "learning_rate": 4.0155240211781966e-07, "loss": 0.3233, "step": 5249 }, { "epoch": 4.421673217293655, "grad_norm": 0.31536367535591125, "learning_rate": 4.0039841859545515e-07, "loss": 0.3864, "step": 5250 }, { "epoch": 4.422515440763616, "grad_norm": 0.31042537093162537, "learning_rate": 3.99246026456373e-07, "loss": 0.3389, "step": 5251 }, { "epoch": 4.423357664233577, "grad_norm": 0.2992699444293976, "learning_rate": 3.980952260992815e-07, "loss": 0.3038, "step": 5252 }, { "epoch": 4.424199887703537, "grad_norm": 0.30274227261543274, "learning_rate": 3.969460179223389e-07, "loss": 0.3424, "step": 5253 }, { "epoch": 4.425042111173498, "grad_norm": 0.30435675382614136, "learning_rate": 3.957984023231498e-07, "loss": 0.3656, "step": 5254 }, { "epoch": 4.425884334643459, "grad_norm": 0.3262827694416046, "learning_rate": 3.9465237969877e-07, "loss": 0.3705, "step": 5255 }, { "epoch": 4.426726558113419, "grad_norm": 0.30323728919029236, "learning_rate": 3.9350795044570345e-07, "loss": 0.291, "step": 5256 }, { "epoch": 4.42756878158338, "grad_norm": 0.3165193200111389, "learning_rate": 3.9236511495990503e-07, "loss": 0.3638, "step": 5257 }, { "epoch": 4.4284110050533405, "grad_norm": 0.3112754821777344, "learning_rate": 3.9122387363677406e-07, "loss": 0.3222, "step": 5258 }, { "epoch": 4.429253228523302, "grad_norm": 0.3332512378692627, "learning_rate": 3.900842268711602e-07, "loss": 0.3385, "step": 5259 }, { "epoch": 4.430095451993262, "grad_norm": 0.3044538199901581, "learning_rate": 3.8894617505736255e-07, "loss": 0.3139, "step": 5260 }, { "epoch": 4.430937675463223, "grad_norm": 0.3278732895851135, "learning_rate": 3.8780971858912884e-07, "loss": 0.3619, "step": 5261 }, { "epoch": 4.431779898933184, "grad_norm": 0.3224804103374481, "learning_rate": 3.866748578596519e-07, "loss": 0.3501, "step": 5262 }, { "epoch": 4.432622122403144, "grad_norm": 0.2857128381729126, "learning_rate": 3.8554159326157304e-07, "loss": 0.3015, "step": 5263 }, { "epoch": 4.433464345873105, "grad_norm": 0.3064214289188385, "learning_rate": 3.8440992518698483e-07, "loss": 0.3584, "step": 5264 }, { "epoch": 4.434306569343065, "grad_norm": 0.32259684801101685, "learning_rate": 3.832798540274246e-07, "loss": 0.3601, "step": 5265 }, { "epoch": 4.435148792813027, "grad_norm": 0.2961054742336273, "learning_rate": 3.821513801738763e-07, "loss": 0.3278, "step": 5266 }, { "epoch": 4.435991016282987, "grad_norm": 0.3122723400592804, "learning_rate": 3.8102450401677203e-07, "loss": 0.3454, "step": 5267 }, { "epoch": 4.436833239752948, "grad_norm": 0.3114301264286041, "learning_rate": 3.798992259459944e-07, "loss": 0.3459, "step": 5268 }, { "epoch": 4.4376754632229085, "grad_norm": 0.29892823100090027, "learning_rate": 3.7877554635086857e-07, "loss": 0.3246, "step": 5269 }, { "epoch": 4.438517686692869, "grad_norm": 0.30595600605010986, "learning_rate": 3.776534656201675e-07, "loss": 0.3403, "step": 5270 }, { "epoch": 4.43935991016283, "grad_norm": 0.29530584812164307, "learning_rate": 3.7653298414211336e-07, "loss": 0.3358, "step": 5271 }, { "epoch": 4.44020213363279, "grad_norm": 0.30045750737190247, "learning_rate": 3.7541410230437335e-07, "loss": 0.3496, "step": 5272 }, { "epoch": 4.441044357102752, "grad_norm": 0.2963896095752716, "learning_rate": 3.742968204940617e-07, "loss": 0.3377, "step": 5273 }, { "epoch": 4.441886580572712, "grad_norm": 0.32234448194503784, "learning_rate": 3.731811390977369e-07, "loss": 0.3686, "step": 5274 }, { "epoch": 4.442728804042673, "grad_norm": 0.31149303913116455, "learning_rate": 3.7206705850140744e-07, "loss": 0.3202, "step": 5275 }, { "epoch": 4.443571027512633, "grad_norm": 0.3264768421649933, "learning_rate": 3.7095457909052554e-07, "loss": 0.3669, "step": 5276 }, { "epoch": 4.444413250982594, "grad_norm": 0.287827730178833, "learning_rate": 3.6984370124999056e-07, "loss": 0.3115, "step": 5277 }, { "epoch": 4.445255474452555, "grad_norm": 0.30336910486221313, "learning_rate": 3.6873442536414614e-07, "loss": 0.3731, "step": 5278 }, { "epoch": 4.446097697922515, "grad_norm": 0.31313127279281616, "learning_rate": 3.676267518167825e-07, "loss": 0.3447, "step": 5279 }, { "epoch": 4.4469399213924765, "grad_norm": 0.2870131731033325, "learning_rate": 3.6652068099113756e-07, "loss": 0.3289, "step": 5280 }, { "epoch": 4.447782144862437, "grad_norm": 0.30209285020828247, "learning_rate": 3.6541621326989183e-07, "loss": 0.3584, "step": 5281 }, { "epoch": 4.448624368332397, "grad_norm": 0.3073284924030304, "learning_rate": 3.6431334903517133e-07, "loss": 0.3342, "step": 5282 }, { "epoch": 4.449466591802358, "grad_norm": 0.2901993691921234, "learning_rate": 3.6321208866854864e-07, "loss": 0.3212, "step": 5283 }, { "epoch": 4.450308815272319, "grad_norm": 0.3525388538837433, "learning_rate": 3.6211243255104225e-07, "loss": 0.413, "step": 5284 }, { "epoch": 4.45115103874228, "grad_norm": 0.28819039463996887, "learning_rate": 3.6101438106311336e-07, "loss": 0.3199, "step": 5285 }, { "epoch": 4.45199326221224, "grad_norm": 0.3171965181827545, "learning_rate": 3.599179345846676e-07, "loss": 0.3504, "step": 5286 }, { "epoch": 4.4528354856822006, "grad_norm": 0.3152416944503784, "learning_rate": 3.588230934950587e-07, "loss": 0.3553, "step": 5287 }, { "epoch": 4.453677709152162, "grad_norm": 0.28408974409103394, "learning_rate": 3.577298581730826e-07, "loss": 0.3223, "step": 5288 }, { "epoch": 4.454519932622122, "grad_norm": 0.31032517552375793, "learning_rate": 3.566382289969789e-07, "loss": 0.359, "step": 5289 }, { "epoch": 4.455362156092083, "grad_norm": 0.30128079652786255, "learning_rate": 3.555482063444332e-07, "loss": 0.3265, "step": 5290 }, { "epoch": 4.456204379562044, "grad_norm": 0.31641846895217896, "learning_rate": 3.544597905925751e-07, "loss": 0.3481, "step": 5291 }, { "epoch": 4.457046603032005, "grad_norm": 0.33388280868530273, "learning_rate": 3.5337298211797767e-07, "loss": 0.3493, "step": 5292 }, { "epoch": 4.457888826501965, "grad_norm": 0.31085801124572754, "learning_rate": 3.5228778129665686e-07, "loss": 0.3633, "step": 5293 }, { "epoch": 4.4587310499719255, "grad_norm": 0.30988964438438416, "learning_rate": 3.5120418850407457e-07, "loss": 0.3521, "step": 5294 }, { "epoch": 4.459573273441887, "grad_norm": 0.3015978932380676, "learning_rate": 3.501222041151359e-07, "loss": 0.3164, "step": 5295 }, { "epoch": 4.460415496911847, "grad_norm": 0.33690667152404785, "learning_rate": 3.4904182850418855e-07, "loss": 0.3933, "step": 5296 }, { "epoch": 4.461257720381808, "grad_norm": 0.31388935446739197, "learning_rate": 3.4796306204502196e-07, "loss": 0.324, "step": 5297 }, { "epoch": 4.462099943851769, "grad_norm": 0.2973192036151886, "learning_rate": 3.4688590511087304e-07, "loss": 0.3409, "step": 5298 }, { "epoch": 4.46294216732173, "grad_norm": 0.30549588799476624, "learning_rate": 3.4581035807441933e-07, "loss": 0.3357, "step": 5299 }, { "epoch": 4.46378439079169, "grad_norm": 0.3235492408275604, "learning_rate": 3.447364213077814e-07, "loss": 0.3251, "step": 5300 }, { "epoch": 4.46462661426165, "grad_norm": 0.3013160526752472, "learning_rate": 3.4366409518252266e-07, "loss": 0.3369, "step": 5301 }, { "epoch": 4.465468837731612, "grad_norm": 0.30494368076324463, "learning_rate": 3.4259338006964906e-07, "loss": 0.313, "step": 5302 }, { "epoch": 4.466311061201572, "grad_norm": 0.2869059443473816, "learning_rate": 3.4152427633961204e-07, "loss": 0.3149, "step": 5303 }, { "epoch": 4.467153284671533, "grad_norm": 0.31892311573028564, "learning_rate": 3.4045678436230077e-07, "loss": 0.3475, "step": 5304 }, { "epoch": 4.4679955081414935, "grad_norm": 0.32766684889793396, "learning_rate": 3.3939090450704924e-07, "loss": 0.3425, "step": 5305 }, { "epoch": 4.468837731611455, "grad_norm": 0.30636030435562134, "learning_rate": 3.383266371426347e-07, "loss": 0.3391, "step": 5306 }, { "epoch": 4.469679955081415, "grad_norm": 0.2834102213382721, "learning_rate": 3.3726398263727533e-07, "loss": 0.3553, "step": 5307 }, { "epoch": 4.470522178551375, "grad_norm": 0.3106828033924103, "learning_rate": 3.362029413586315e-07, "loss": 0.3303, "step": 5308 }, { "epoch": 4.471364402021337, "grad_norm": 0.3178477883338928, "learning_rate": 3.35143513673804e-07, "loss": 0.3463, "step": 5309 }, { "epoch": 4.472206625491297, "grad_norm": 0.28327083587646484, "learning_rate": 3.340856999493375e-07, "loss": 0.329, "step": 5310 }, { "epoch": 4.473048848961258, "grad_norm": 0.30245062708854675, "learning_rate": 3.3302950055121796e-07, "loss": 0.3715, "step": 5311 }, { "epoch": 4.473891072431218, "grad_norm": 0.29377344250679016, "learning_rate": 3.3197491584487096e-07, "loss": 0.3113, "step": 5312 }, { "epoch": 4.474733295901179, "grad_norm": 0.31630390882492065, "learning_rate": 3.3092194619516626e-07, "loss": 0.3102, "step": 5313 }, { "epoch": 4.47557551937114, "grad_norm": 0.31633907556533813, "learning_rate": 3.2987059196641134e-07, "loss": 0.3395, "step": 5314 }, { "epoch": 4.4764177428411, "grad_norm": 0.3224036991596222, "learning_rate": 3.2882085352235916e-07, "loss": 0.3541, "step": 5315 }, { "epoch": 4.4772599663110615, "grad_norm": 0.2892427146434784, "learning_rate": 3.277727312261991e-07, "loss": 0.3259, "step": 5316 }, { "epoch": 4.478102189781022, "grad_norm": 0.30580413341522217, "learning_rate": 3.2672622544056564e-07, "loss": 0.3514, "step": 5317 }, { "epoch": 4.478944413250982, "grad_norm": 0.3008541464805603, "learning_rate": 3.2568133652752907e-07, "loss": 0.3593, "step": 5318 }, { "epoch": 4.479786636720943, "grad_norm": 0.30700984597206116, "learning_rate": 3.2463806484860583e-07, "loss": 0.3207, "step": 5319 }, { "epoch": 4.480628860190904, "grad_norm": 0.31028351187705994, "learning_rate": 3.235964107647477e-07, "loss": 0.383, "step": 5320 }, { "epoch": 4.481471083660865, "grad_norm": 0.28650397062301636, "learning_rate": 3.225563746363508e-07, "loss": 0.3425, "step": 5321 }, { "epoch": 4.482313307130825, "grad_norm": 0.28042376041412354, "learning_rate": 3.215179568232485e-07, "loss": 0.304, "step": 5322 }, { "epoch": 4.483155530600786, "grad_norm": 0.31750231981277466, "learning_rate": 3.204811576847172e-07, "loss": 0.3473, "step": 5323 }, { "epoch": 4.483997754070747, "grad_norm": 0.4230107069015503, "learning_rate": 3.1944597757947005e-07, "loss": 0.3466, "step": 5324 }, { "epoch": 4.484839977540707, "grad_norm": 0.30203092098236084, "learning_rate": 3.1841241686566273e-07, "loss": 0.3679, "step": 5325 }, { "epoch": 4.485682201010668, "grad_norm": 0.308300256729126, "learning_rate": 3.1738047590088807e-07, "loss": 0.3366, "step": 5326 }, { "epoch": 4.486524424480629, "grad_norm": 0.3093225359916687, "learning_rate": 3.1635015504218216e-07, "loss": 0.3346, "step": 5327 }, { "epoch": 4.48736664795059, "grad_norm": 0.3168126046657562, "learning_rate": 3.153214546460159e-07, "loss": 0.3427, "step": 5328 }, { "epoch": 4.48820887142055, "grad_norm": 0.31416764855384827, "learning_rate": 3.142943750683042e-07, "loss": 0.3555, "step": 5329 }, { "epoch": 4.489051094890511, "grad_norm": 0.30646616220474243, "learning_rate": 3.1326891666439706e-07, "loss": 0.362, "step": 5330 }, { "epoch": 4.489893318360472, "grad_norm": 0.29597651958465576, "learning_rate": 3.1224507978908636e-07, "loss": 0.3321, "step": 5331 }, { "epoch": 4.490735541830432, "grad_norm": 0.3001215159893036, "learning_rate": 3.112228647966031e-07, "loss": 0.3568, "step": 5332 }, { "epoch": 4.491577765300393, "grad_norm": 0.30107295513153076, "learning_rate": 3.10202272040615e-07, "loss": 0.3025, "step": 5333 }, { "epoch": 4.4924199887703535, "grad_norm": 0.29265889525413513, "learning_rate": 3.091833018742296e-07, "loss": 0.3252, "step": 5334 }, { "epoch": 4.493262212240315, "grad_norm": 0.3089595139026642, "learning_rate": 3.081659546499927e-07, "loss": 0.3724, "step": 5335 }, { "epoch": 4.494104435710275, "grad_norm": 0.31906452775001526, "learning_rate": 3.071502307198909e-07, "loss": 0.3524, "step": 5336 }, { "epoch": 4.494946659180236, "grad_norm": 0.3142894506454468, "learning_rate": 3.06136130435345e-07, "loss": 0.3485, "step": 5337 }, { "epoch": 4.495788882650197, "grad_norm": 0.28886187076568604, "learning_rate": 3.05123654147218e-07, "loss": 0.3487, "step": 5338 }, { "epoch": 4.496631106120157, "grad_norm": 0.3064599931240082, "learning_rate": 3.041128022058082e-07, "loss": 0.3411, "step": 5339 }, { "epoch": 4.497473329590118, "grad_norm": 0.31888461112976074, "learning_rate": 3.0310357496085406e-07, "loss": 0.3634, "step": 5340 }, { "epoch": 4.4983155530600785, "grad_norm": 0.3035995364189148, "learning_rate": 3.0209597276152967e-07, "loss": 0.3154, "step": 5341 }, { "epoch": 4.49915777653004, "grad_norm": 0.2984228730201721, "learning_rate": 3.0108999595645006e-07, "loss": 0.332, "step": 5342 }, { "epoch": 4.5, "grad_norm": 0.297006219625473, "learning_rate": 3.0008564489366344e-07, "loss": 0.3301, "step": 5343 }, { "epoch": 4.50084222346996, "grad_norm": 0.3105577230453491, "learning_rate": 2.9908291992065965e-07, "loss": 0.3593, "step": 5344 }, { "epoch": 4.5016844469399215, "grad_norm": 0.31018853187561035, "learning_rate": 2.9808182138436393e-07, "loss": 0.3503, "step": 5345 }, { "epoch": 4.502526670409882, "grad_norm": 0.28495320677757263, "learning_rate": 2.970823496311398e-07, "loss": 0.3163, "step": 5346 }, { "epoch": 4.503368893879843, "grad_norm": 0.29453733563423157, "learning_rate": 2.9608450500678566e-07, "loss": 0.3411, "step": 5347 }, { "epoch": 4.504211117349803, "grad_norm": 0.29224586486816406, "learning_rate": 2.9508828785654085e-07, "loss": 0.3492, "step": 5348 }, { "epoch": 4.505053340819764, "grad_norm": 0.29138681292533875, "learning_rate": 2.940936985250781e-07, "loss": 0.3543, "step": 5349 }, { "epoch": 4.505895564289725, "grad_norm": 0.3059431314468384, "learning_rate": 2.9310073735650814e-07, "loss": 0.385, "step": 5350 }, { "epoch": 4.506737787759685, "grad_norm": 0.30467864871025085, "learning_rate": 2.9210940469437955e-07, "loss": 0.3254, "step": 5351 }, { "epoch": 4.5075800112296465, "grad_norm": 0.3259301781654358, "learning_rate": 2.9111970088167575e-07, "loss": 0.3647, "step": 5352 }, { "epoch": 4.508422234699607, "grad_norm": 0.2972586154937744, "learning_rate": 2.901316262608167e-07, "loss": 0.3048, "step": 5353 }, { "epoch": 4.509264458169568, "grad_norm": 0.2964493930339813, "learning_rate": 2.891451811736601e-07, "loss": 0.3688, "step": 5354 }, { "epoch": 4.510106681639528, "grad_norm": 0.299431711435318, "learning_rate": 2.8816036596149956e-07, "loss": 0.2963, "step": 5355 }, { "epoch": 4.510948905109489, "grad_norm": 0.32226645946502686, "learning_rate": 2.8717718096506307e-07, "loss": 0.3759, "step": 5356 }, { "epoch": 4.51179112857945, "grad_norm": 0.30358821153640747, "learning_rate": 2.861956265245164e-07, "loss": 0.3507, "step": 5357 }, { "epoch": 4.51263335204941, "grad_norm": 0.31764164566993713, "learning_rate": 2.8521570297945944e-07, "loss": 0.3478, "step": 5358 }, { "epoch": 4.513475575519371, "grad_norm": 0.2958790063858032, "learning_rate": 2.842374106689316e-07, "loss": 0.3405, "step": 5359 }, { "epoch": 4.514317798989332, "grad_norm": 0.31899911165237427, "learning_rate": 2.8326074993140275e-07, "loss": 0.3427, "step": 5360 }, { "epoch": 4.515160022459293, "grad_norm": 0.30223432183265686, "learning_rate": 2.822857211047814e-07, "loss": 0.3074, "step": 5361 }, { "epoch": 4.516002245929253, "grad_norm": 0.3131091594696045, "learning_rate": 2.813123245264104e-07, "loss": 0.3556, "step": 5362 }, { "epoch": 4.516844469399214, "grad_norm": 0.29149705171585083, "learning_rate": 2.8034056053307e-07, "loss": 0.3283, "step": 5363 }, { "epoch": 4.517686692869175, "grad_norm": 0.3051091134548187, "learning_rate": 2.793704294609728e-07, "loss": 0.3327, "step": 5364 }, { "epoch": 4.518528916339135, "grad_norm": 0.31097638607025146, "learning_rate": 2.7840193164576645e-07, "loss": 0.3239, "step": 5365 }, { "epoch": 4.519371139809096, "grad_norm": 0.3310883939266205, "learning_rate": 2.774350674225357e-07, "loss": 0.3547, "step": 5366 }, { "epoch": 4.520213363279057, "grad_norm": 0.2880393862724304, "learning_rate": 2.7646983712579956e-07, "loss": 0.3479, "step": 5367 }, { "epoch": 4.521055586749018, "grad_norm": 0.31122517585754395, "learning_rate": 2.755062410895104e-07, "loss": 0.3381, "step": 5368 }, { "epoch": 4.521897810218978, "grad_norm": 0.3122255206108093, "learning_rate": 2.7454427964705485e-07, "loss": 0.284, "step": 5369 }, { "epoch": 4.5227400336889385, "grad_norm": 0.3142257034778595, "learning_rate": 2.7358395313125785e-07, "loss": 0.3577, "step": 5370 }, { "epoch": 4.5235822571589, "grad_norm": 0.2946200668811798, "learning_rate": 2.7262526187437357e-07, "loss": 0.3431, "step": 5371 }, { "epoch": 4.52442448062886, "grad_norm": 0.30014172196388245, "learning_rate": 2.7166820620809387e-07, "loss": 0.3261, "step": 5372 }, { "epoch": 4.525266704098821, "grad_norm": 0.27955248951911926, "learning_rate": 2.707127864635417e-07, "loss": 0.3422, "step": 5373 }, { "epoch": 4.526108927568782, "grad_norm": 0.30673643946647644, "learning_rate": 2.697590029712793e-07, "loss": 0.3722, "step": 5374 }, { "epoch": 4.526951151038742, "grad_norm": 0.32467278838157654, "learning_rate": 2.6880685606129666e-07, "loss": 0.3572, "step": 5375 }, { "epoch": 4.527793374508703, "grad_norm": 0.30916815996170044, "learning_rate": 2.6785634606302126e-07, "loss": 0.3188, "step": 5376 }, { "epoch": 4.528635597978663, "grad_norm": 0.31201863288879395, "learning_rate": 2.6690747330531286e-07, "loss": 0.3432, "step": 5377 }, { "epoch": 4.529477821448625, "grad_norm": 0.32246696949005127, "learning_rate": 2.6596023811646554e-07, "loss": 0.3846, "step": 5378 }, { "epoch": 4.530320044918585, "grad_norm": 0.29790711402893066, "learning_rate": 2.650146408242071e-07, "loss": 0.3205, "step": 5379 }, { "epoch": 4.531162268388545, "grad_norm": 0.3139197826385498, "learning_rate": 2.640706817556965e-07, "loss": 0.366, "step": 5380 }, { "epoch": 4.5320044918585065, "grad_norm": 0.31875503063201904, "learning_rate": 2.6312836123752793e-07, "loss": 0.339, "step": 5381 }, { "epoch": 4.532846715328467, "grad_norm": 0.311514675617218, "learning_rate": 2.62187679595729e-07, "loss": 0.3344, "step": 5382 }, { "epoch": 4.533688938798428, "grad_norm": 0.2995169460773468, "learning_rate": 2.612486371557588e-07, "loss": 0.361, "step": 5383 }, { "epoch": 4.534531162268388, "grad_norm": 0.2933889925479889, "learning_rate": 2.6031123424250904e-07, "loss": 0.3354, "step": 5384 }, { "epoch": 4.53537338573835, "grad_norm": 0.3216431140899658, "learning_rate": 2.593754711803059e-07, "loss": 0.3392, "step": 5385 }, { "epoch": 4.53621560920831, "grad_norm": 0.2923242151737213, "learning_rate": 2.584413482929082e-07, "loss": 0.3317, "step": 5386 }, { "epoch": 4.53705783267827, "grad_norm": 0.2720673382282257, "learning_rate": 2.575088659035052e-07, "loss": 0.3088, "step": 5387 }, { "epoch": 4.537900056148231, "grad_norm": 0.3199785351753235, "learning_rate": 2.565780243347199e-07, "loss": 0.3736, "step": 5388 }, { "epoch": 4.538742279618192, "grad_norm": 0.3121381402015686, "learning_rate": 2.55648823908608e-07, "loss": 0.3721, "step": 5389 }, { "epoch": 4.539584503088153, "grad_norm": 0.303064227104187, "learning_rate": 2.547212649466568e-07, "loss": 0.3543, "step": 5390 }, { "epoch": 4.540426726558113, "grad_norm": 0.2875792682170868, "learning_rate": 2.537953477697863e-07, "loss": 0.3027, "step": 5391 }, { "epoch": 4.5412689500280745, "grad_norm": 0.326001912355423, "learning_rate": 2.528710726983458e-07, "loss": 0.3842, "step": 5392 }, { "epoch": 4.542111173498035, "grad_norm": 0.3071518838405609, "learning_rate": 2.519484400521216e-07, "loss": 0.3475, "step": 5393 }, { "epoch": 4.542953396967995, "grad_norm": 0.3052944242954254, "learning_rate": 2.510274501503274e-07, "loss": 0.3396, "step": 5394 }, { "epoch": 4.543795620437956, "grad_norm": 0.31249043345451355, "learning_rate": 2.501081033116093e-07, "loss": 0.3739, "step": 5395 }, { "epoch": 4.544637843907917, "grad_norm": 0.30214786529541016, "learning_rate": 2.4919039985404626e-07, "loss": 0.324, "step": 5396 }, { "epoch": 4.545480067377878, "grad_norm": 0.31525176763534546, "learning_rate": 2.4827434009514817e-07, "loss": 0.3158, "step": 5397 }, { "epoch": 4.546322290847838, "grad_norm": 0.3164271414279938, "learning_rate": 2.4735992435185595e-07, "loss": 0.3533, "step": 5398 }, { "epoch": 4.5471645143177994, "grad_norm": 0.33017510175704956, "learning_rate": 2.46447152940541e-07, "loss": 0.312, "step": 5399 }, { "epoch": 4.54800673778776, "grad_norm": 0.316776841878891, "learning_rate": 2.455360261770068e-07, "loss": 0.3481, "step": 5400 }, { "epoch": 4.54884896125772, "grad_norm": 0.2970413565635681, "learning_rate": 2.4462654437648836e-07, "loss": 0.3358, "step": 5401 }, { "epoch": 4.549691184727681, "grad_norm": 0.31356024742126465, "learning_rate": 2.437187078536507e-07, "loss": 0.3435, "step": 5402 }, { "epoch": 4.550533408197642, "grad_norm": 0.31034380197525024, "learning_rate": 2.4281251692258814e-07, "loss": 0.3282, "step": 5403 }, { "epoch": 4.551375631667603, "grad_norm": 0.30724942684173584, "learning_rate": 2.4190797189682866e-07, "loss": 0.3518, "step": 5404 }, { "epoch": 4.552217855137563, "grad_norm": 0.31542980670928955, "learning_rate": 2.410050730893299e-07, "loss": 0.373, "step": 5405 }, { "epoch": 4.5530600786075235, "grad_norm": 0.3025630712509155, "learning_rate": 2.401038208124784e-07, "loss": 0.342, "step": 5406 }, { "epoch": 4.553902302077485, "grad_norm": 0.27948758006095886, "learning_rate": 2.39204215378091e-07, "loss": 0.313, "step": 5407 }, { "epoch": 4.554744525547445, "grad_norm": 0.4117429852485657, "learning_rate": 2.3830625709741707e-07, "loss": 0.3589, "step": 5408 }, { "epoch": 4.555586749017406, "grad_norm": 0.29828357696533203, "learning_rate": 2.3740994628113523e-07, "loss": 0.3427, "step": 5409 }, { "epoch": 4.556428972487367, "grad_norm": 0.3158494532108307, "learning_rate": 2.3651528323935303e-07, "loss": 0.3592, "step": 5410 }, { "epoch": 4.557271195957327, "grad_norm": 0.3102034032344818, "learning_rate": 2.3562226828160784e-07, "loss": 0.3236, "step": 5411 }, { "epoch": 4.558113419427288, "grad_norm": 0.30195146799087524, "learning_rate": 2.3473090171686807e-07, "loss": 0.3481, "step": 5412 }, { "epoch": 4.558955642897248, "grad_norm": 0.282296359539032, "learning_rate": 2.3384118385353206e-07, "loss": 0.3276, "step": 5413 }, { "epoch": 4.55979786636721, "grad_norm": 0.2988031506538391, "learning_rate": 2.3295311499942574e-07, "loss": 0.354, "step": 5414 }, { "epoch": 4.56064008983717, "grad_norm": 0.3153878152370453, "learning_rate": 2.3206669546180616e-07, "loss": 0.3494, "step": 5415 }, { "epoch": 4.561482313307131, "grad_norm": 0.2885936200618744, "learning_rate": 2.3118192554735907e-07, "loss": 0.3298, "step": 5416 }, { "epoch": 4.5623245367770915, "grad_norm": 0.2962367534637451, "learning_rate": 2.3029880556220075e-07, "loss": 0.3576, "step": 5417 }, { "epoch": 4.563166760247052, "grad_norm": 0.28445538878440857, "learning_rate": 2.2941733581187398e-07, "loss": 0.3304, "step": 5418 }, { "epoch": 4.564008983717013, "grad_norm": 0.3014557957649231, "learning_rate": 2.285375166013537e-07, "loss": 0.3632, "step": 5419 }, { "epoch": 4.564851207186973, "grad_norm": 0.28545570373535156, "learning_rate": 2.2765934823504088e-07, "loss": 0.2984, "step": 5420 }, { "epoch": 4.565693430656935, "grad_norm": 0.2879563271999359, "learning_rate": 2.2678283101676802e-07, "loss": 0.3486, "step": 5421 }, { "epoch": 4.566535654126895, "grad_norm": 0.30047282576560974, "learning_rate": 2.2590796524979363e-07, "loss": 0.3806, "step": 5422 }, { "epoch": 4.567377877596856, "grad_norm": 0.2952415347099304, "learning_rate": 2.2503475123680786e-07, "loss": 0.323, "step": 5423 }, { "epoch": 4.568220101066816, "grad_norm": 0.3491751253604889, "learning_rate": 2.241631892799262e-07, "loss": 0.3681, "step": 5424 }, { "epoch": 4.569062324536777, "grad_norm": 0.29904282093048096, "learning_rate": 2.232932796806958e-07, "loss": 0.3185, "step": 5425 }, { "epoch": 4.569904548006738, "grad_norm": 0.323837012052536, "learning_rate": 2.224250227400887e-07, "loss": 0.3765, "step": 5426 }, { "epoch": 4.570746771476698, "grad_norm": 0.27647772431373596, "learning_rate": 2.215584187585085e-07, "loss": 0.3023, "step": 5427 }, { "epoch": 4.5715889949466595, "grad_norm": 0.3288542926311493, "learning_rate": 2.2069346803578418e-07, "loss": 0.3636, "step": 5428 }, { "epoch": 4.57243121841662, "grad_norm": 0.2931191623210907, "learning_rate": 2.1983017087117475e-07, "loss": 0.3566, "step": 5429 }, { "epoch": 4.573273441886581, "grad_norm": 0.2879174053668976, "learning_rate": 2.1896852756336517e-07, "loss": 0.3272, "step": 5430 }, { "epoch": 4.574115665356541, "grad_norm": 0.29109764099121094, "learning_rate": 2.1810853841047032e-07, "loss": 0.3395, "step": 5431 }, { "epoch": 4.574957888826502, "grad_norm": 0.2892135977745056, "learning_rate": 2.1725020371003046e-07, "loss": 0.3355, "step": 5432 }, { "epoch": 4.575800112296463, "grad_norm": 0.3256883919239044, "learning_rate": 2.1639352375901645e-07, "loss": 0.3639, "step": 5433 }, { "epoch": 4.576642335766423, "grad_norm": 0.3247363865375519, "learning_rate": 2.1553849885382283e-07, "loss": 0.348, "step": 5434 }, { "epoch": 4.577484559236384, "grad_norm": 0.3070462644100189, "learning_rate": 2.1468512929027574e-07, "loss": 0.3252, "step": 5435 }, { "epoch": 4.578326782706345, "grad_norm": 0.30333906412124634, "learning_rate": 2.1383341536362402e-07, "loss": 0.3513, "step": 5436 }, { "epoch": 4.579169006176305, "grad_norm": 0.3165588676929474, "learning_rate": 2.1298335736854803e-07, "loss": 0.3219, "step": 5437 }, { "epoch": 4.580011229646266, "grad_norm": 0.2880454361438751, "learning_rate": 2.1213495559915255e-07, "loss": 0.2917, "step": 5438 }, { "epoch": 4.580853453116227, "grad_norm": 0.3349563181400299, "learning_rate": 2.1128821034896995e-07, "loss": 0.3671, "step": 5439 }, { "epoch": 4.581695676586188, "grad_norm": 0.3314633369445801, "learning_rate": 2.1044312191095984e-07, "loss": 0.3652, "step": 5440 }, { "epoch": 4.582537900056148, "grad_norm": 0.2980303466320038, "learning_rate": 2.0959969057750774e-07, "loss": 0.3456, "step": 5441 }, { "epoch": 4.5833801235261085, "grad_norm": 0.3009263277053833, "learning_rate": 2.0875791664042743e-07, "loss": 0.3574, "step": 5442 }, { "epoch": 4.58422234699607, "grad_norm": 0.2941194772720337, "learning_rate": 2.0791780039095765e-07, "loss": 0.347, "step": 5443 }, { "epoch": 4.58506457046603, "grad_norm": 0.28724706172943115, "learning_rate": 2.070793421197642e-07, "loss": 0.3487, "step": 5444 }, { "epoch": 4.585906793935991, "grad_norm": 0.28741997480392456, "learning_rate": 2.0624254211693894e-07, "loss": 0.3421, "step": 5445 }, { "epoch": 4.5867490174059515, "grad_norm": 0.2810497283935547, "learning_rate": 2.0540740067200082e-07, "loss": 0.3284, "step": 5446 }, { "epoch": 4.587591240875913, "grad_norm": 0.31864792108535767, "learning_rate": 2.0457391807389426e-07, "loss": 0.3851, "step": 5447 }, { "epoch": 4.588433464345873, "grad_norm": 0.3437565267086029, "learning_rate": 2.0374209461099083e-07, "loss": 0.3348, "step": 5448 }, { "epoch": 4.589275687815833, "grad_norm": 0.3100665807723999, "learning_rate": 2.0291193057108528e-07, "loss": 0.3181, "step": 5449 }, { "epoch": 4.590117911285795, "grad_norm": 0.2903996407985687, "learning_rate": 2.020834262414023e-07, "loss": 0.3351, "step": 5450 }, { "epoch": 4.590960134755755, "grad_norm": 0.31161946058273315, "learning_rate": 2.012565819085882e-07, "loss": 0.3635, "step": 5451 }, { "epoch": 4.591802358225716, "grad_norm": 0.30150845646858215, "learning_rate": 2.0043139785871857e-07, "loss": 0.3125, "step": 5452 }, { "epoch": 4.5926445816956765, "grad_norm": 0.32099223136901855, "learning_rate": 1.9960787437729168e-07, "loss": 0.3239, "step": 5453 }, { "epoch": 4.593486805165638, "grad_norm": 0.3315921425819397, "learning_rate": 1.9878601174923352e-07, "loss": 0.3628, "step": 5454 }, { "epoch": 4.594329028635598, "grad_norm": 0.2962494492530823, "learning_rate": 1.9796581025889328e-07, "loss": 0.3176, "step": 5455 }, { "epoch": 4.595171252105558, "grad_norm": 0.3065938353538513, "learning_rate": 1.9714727019004787e-07, "loss": 0.3557, "step": 5456 }, { "epoch": 4.5960134755755195, "grad_norm": 0.31097525358200073, "learning_rate": 1.9633039182589797e-07, "loss": 0.3161, "step": 5457 }, { "epoch": 4.59685569904548, "grad_norm": 0.317865252494812, "learning_rate": 1.9551517544906917e-07, "loss": 0.355, "step": 5458 }, { "epoch": 4.597697922515441, "grad_norm": 0.30842310190200806, "learning_rate": 1.9470162134161143e-07, "loss": 0.3389, "step": 5459 }, { "epoch": 4.598540145985401, "grad_norm": 0.29784998297691345, "learning_rate": 1.9388972978500175e-07, "loss": 0.3396, "step": 5460 }, { "epoch": 4.599382369455363, "grad_norm": 0.3066953122615814, "learning_rate": 1.9307950106014106e-07, "loss": 0.3522, "step": 5461 }, { "epoch": 4.600224592925323, "grad_norm": 0.2840117812156677, "learning_rate": 1.9227093544735398e-07, "loss": 0.3245, "step": 5462 }, { "epoch": 4.601066816395283, "grad_norm": 0.287533164024353, "learning_rate": 1.914640332263895e-07, "loss": 0.3285, "step": 5463 }, { "epoch": 4.6019090398652445, "grad_norm": 0.2916044592857361, "learning_rate": 1.9065879467642268e-07, "loss": 0.3372, "step": 5464 }, { "epoch": 4.602751263335205, "grad_norm": 0.2785119414329529, "learning_rate": 1.8985522007605284e-07, "loss": 0.3558, "step": 5465 }, { "epoch": 4.603593486805166, "grad_norm": 0.28416621685028076, "learning_rate": 1.890533097033026e-07, "loss": 0.316, "step": 5466 }, { "epoch": 4.604435710275126, "grad_norm": 0.29600003361701965, "learning_rate": 1.8825306383561836e-07, "loss": 0.3273, "step": 5467 }, { "epoch": 4.605277933745087, "grad_norm": 0.30537083745002747, "learning_rate": 1.87454482749872e-07, "loss": 0.3482, "step": 5468 }, { "epoch": 4.606120157215048, "grad_norm": 0.29285457730293274, "learning_rate": 1.8665756672236025e-07, "loss": 0.3507, "step": 5469 }, { "epoch": 4.606962380685008, "grad_norm": 0.30526286363601685, "learning_rate": 1.8586231602880035e-07, "loss": 0.3571, "step": 5470 }, { "epoch": 4.607804604154969, "grad_norm": 0.3078595995903015, "learning_rate": 1.8506873094433663e-07, "loss": 0.3605, "step": 5471 }, { "epoch": 4.60864682762493, "grad_norm": 0.3043935298919678, "learning_rate": 1.8427681174353506e-07, "loss": 0.3631, "step": 5472 }, { "epoch": 4.60948905109489, "grad_norm": 0.3398006558418274, "learning_rate": 1.83486558700387e-07, "loss": 0.3425, "step": 5473 }, { "epoch": 4.610331274564851, "grad_norm": 0.32063549757003784, "learning_rate": 1.8269797208830654e-07, "loss": 0.3256, "step": 5474 }, { "epoch": 4.611173498034812, "grad_norm": 0.30710041522979736, "learning_rate": 1.8191105218012928e-07, "loss": 0.3246, "step": 5475 }, { "epoch": 4.612015721504773, "grad_norm": 0.3245845139026642, "learning_rate": 1.8112579924811801e-07, "loss": 0.3566, "step": 5476 }, { "epoch": 4.612857944974733, "grad_norm": 0.32014885544776917, "learning_rate": 1.8034221356395654e-07, "loss": 0.3587, "step": 5477 }, { "epoch": 4.613700168444694, "grad_norm": 0.2782115936279297, "learning_rate": 1.7956029539875186e-07, "loss": 0.3119, "step": 5478 }, { "epoch": 4.614542391914655, "grad_norm": 0.2995143234729767, "learning_rate": 1.787800450230326e-07, "loss": 0.3641, "step": 5479 }, { "epoch": 4.615384615384615, "grad_norm": 0.2945175766944885, "learning_rate": 1.78001462706755e-07, "loss": 0.3576, "step": 5480 }, { "epoch": 4.616226838854576, "grad_norm": 0.2925828695297241, "learning_rate": 1.7722454871929363e-07, "loss": 0.3561, "step": 5481 }, { "epoch": 4.6170690623245365, "grad_norm": 0.30720579624176025, "learning_rate": 1.7644930332944678e-07, "loss": 0.3529, "step": 5482 }, { "epoch": 4.617911285794498, "grad_norm": 0.29181933403015137, "learning_rate": 1.7567572680543654e-07, "loss": 0.3111, "step": 5483 }, { "epoch": 4.618753509264458, "grad_norm": 0.31283241510391235, "learning_rate": 1.749038194149083e-07, "loss": 0.3421, "step": 5484 }, { "epoch": 4.619595732734419, "grad_norm": 0.33428558707237244, "learning_rate": 1.7413358142492732e-07, "loss": 0.4146, "step": 5485 }, { "epoch": 4.62043795620438, "grad_norm": 0.29247400164604187, "learning_rate": 1.7336501310198318e-07, "loss": 0.3132, "step": 5486 }, { "epoch": 4.62128017967434, "grad_norm": 0.313398540019989, "learning_rate": 1.7259811471198705e-07, "loss": 0.3455, "step": 5487 }, { "epoch": 4.622122403144301, "grad_norm": 0.3281310200691223, "learning_rate": 1.718328865202734e-07, "loss": 0.3871, "step": 5488 }, { "epoch": 4.622964626614261, "grad_norm": 0.2939252555370331, "learning_rate": 1.7106932879159822e-07, "loss": 0.3413, "step": 5489 }, { "epoch": 4.623806850084223, "grad_norm": 0.3085804879665375, "learning_rate": 1.703074417901379e-07, "loss": 0.3611, "step": 5490 }, { "epoch": 4.624649073554183, "grad_norm": 0.2995326519012451, "learning_rate": 1.695472257794928e-07, "loss": 0.3347, "step": 5491 }, { "epoch": 4.625491297024144, "grad_norm": 0.2874888777732849, "learning_rate": 1.6878868102268632e-07, "loss": 0.3457, "step": 5492 }, { "epoch": 4.6263335204941045, "grad_norm": 0.30546337366104126, "learning_rate": 1.6803180778215967e-07, "loss": 0.3452, "step": 5493 }, { "epoch": 4.627175743964065, "grad_norm": 0.30731621384620667, "learning_rate": 1.6727660631977894e-07, "loss": 0.3684, "step": 5494 }, { "epoch": 4.628017967434026, "grad_norm": 0.2870134711265564, "learning_rate": 1.6652307689683123e-07, "loss": 0.3262, "step": 5495 }, { "epoch": 4.628860190903986, "grad_norm": 0.32518669962882996, "learning_rate": 1.6577121977402467e-07, "loss": 0.3572, "step": 5496 }, { "epoch": 4.629702414373948, "grad_norm": 0.3162247836589813, "learning_rate": 1.6502103521148893e-07, "loss": 0.2975, "step": 5497 }, { "epoch": 4.630544637843908, "grad_norm": 0.31598180532455444, "learning_rate": 1.642725234687731e-07, "loss": 0.3512, "step": 5498 }, { "epoch": 4.631386861313868, "grad_norm": 0.2881428897380829, "learning_rate": 1.6352568480485277e-07, "loss": 0.3194, "step": 5499 }, { "epoch": 4.632229084783829, "grad_norm": 0.325639933347702, "learning_rate": 1.6278051947811956e-07, "loss": 0.3645, "step": 5500 }, { "epoch": 4.63307130825379, "grad_norm": 0.29929524660110474, "learning_rate": 1.6203702774638842e-07, "loss": 0.3836, "step": 5501 }, { "epoch": 4.633913531723751, "grad_norm": 0.2904515564441681, "learning_rate": 1.612952098668924e-07, "loss": 0.322, "step": 5502 }, { "epoch": 4.634755755193711, "grad_norm": 0.29685714840888977, "learning_rate": 1.6055506609629123e-07, "loss": 0.3359, "step": 5503 }, { "epoch": 4.635597978663672, "grad_norm": 0.31809327006340027, "learning_rate": 1.5981659669066063e-07, "loss": 0.342, "step": 5504 }, { "epoch": 4.636440202133633, "grad_norm": 0.2770005166530609, "learning_rate": 1.5907980190549732e-07, "loss": 0.2875, "step": 5505 }, { "epoch": 4.637282425603593, "grad_norm": 0.3110273778438568, "learning_rate": 1.5834468199572073e-07, "loss": 0.3825, "step": 5506 }, { "epoch": 4.638124649073554, "grad_norm": 0.30720245838165283, "learning_rate": 1.5761123721567019e-07, "loss": 0.3459, "step": 5507 }, { "epoch": 4.638966872543515, "grad_norm": 0.32818081974983215, "learning_rate": 1.568794678191038e-07, "loss": 0.3118, "step": 5508 }, { "epoch": 4.639809096013476, "grad_norm": 0.3156241178512573, "learning_rate": 1.5614937405920184e-07, "loss": 0.348, "step": 5509 }, { "epoch": 4.640651319483436, "grad_norm": 0.2988746464252472, "learning_rate": 1.5542095618856335e-07, "loss": 0.3091, "step": 5510 }, { "epoch": 4.641493542953397, "grad_norm": 0.35503536462783813, "learning_rate": 1.5469421445921063e-07, "loss": 0.3624, "step": 5511 }, { "epoch": 4.642335766423358, "grad_norm": 0.33254143595695496, "learning_rate": 1.5396914912258198e-07, "loss": 0.3252, "step": 5512 }, { "epoch": 4.643177989893318, "grad_norm": 0.3049752116203308, "learning_rate": 1.5324576042953731e-07, "loss": 0.3588, "step": 5513 }, { "epoch": 4.644020213363279, "grad_norm": 0.30581000447273254, "learning_rate": 1.5252404863035807e-07, "loss": 0.3345, "step": 5514 }, { "epoch": 4.64486243683324, "grad_norm": 0.29759591817855835, "learning_rate": 1.5180401397474343e-07, "loss": 0.3415, "step": 5515 }, { "epoch": 4.645704660303201, "grad_norm": 0.2957257032394409, "learning_rate": 1.510856567118135e-07, "loss": 0.3448, "step": 5516 }, { "epoch": 4.646546883773161, "grad_norm": 0.32667672634124756, "learning_rate": 1.503689770901068e-07, "loss": 0.3702, "step": 5517 }, { "epoch": 4.6473891072431215, "grad_norm": 0.2923109531402588, "learning_rate": 1.4965397535758265e-07, "loss": 0.3651, "step": 5518 }, { "epoch": 4.648231330713083, "grad_norm": 0.2883763015270233, "learning_rate": 1.4894065176161988e-07, "loss": 0.3178, "step": 5519 }, { "epoch": 4.649073554183043, "grad_norm": 0.29692956805229187, "learning_rate": 1.4822900654901551e-07, "loss": 0.341, "step": 5520 }, { "epoch": 4.649915777653004, "grad_norm": 0.3008793890476227, "learning_rate": 1.4751903996598593e-07, "loss": 0.3523, "step": 5521 }, { "epoch": 4.650758001122965, "grad_norm": 0.309303879737854, "learning_rate": 1.4681075225816855e-07, "loss": 0.3832, "step": 5522 }, { "epoch": 4.651600224592926, "grad_norm": 0.2908450663089752, "learning_rate": 1.4610414367061897e-07, "loss": 0.3431, "step": 5523 }, { "epoch": 4.652442448062886, "grad_norm": 0.296745240688324, "learning_rate": 1.4539921444781112e-07, "loss": 0.3178, "step": 5524 }, { "epoch": 4.653284671532846, "grad_norm": 0.30107203125953674, "learning_rate": 1.4469596483363825e-07, "loss": 0.3334, "step": 5525 }, { "epoch": 4.654126895002808, "grad_norm": 0.29503026604652405, "learning_rate": 1.439943950714129e-07, "loss": 0.3398, "step": 5526 }, { "epoch": 4.654969118472768, "grad_norm": 0.2951544523239136, "learning_rate": 1.4329450540386647e-07, "loss": 0.315, "step": 5527 }, { "epoch": 4.655811341942729, "grad_norm": 0.381742924451828, "learning_rate": 1.4259629607314752e-07, "loss": 0.3308, "step": 5528 }, { "epoch": 4.6566535654126895, "grad_norm": 0.29856863617897034, "learning_rate": 1.4189976732082667e-07, "loss": 0.3333, "step": 5529 }, { "epoch": 4.65749578888265, "grad_norm": 0.32264235615730286, "learning_rate": 1.4120491938788894e-07, "loss": 0.3582, "step": 5530 }, { "epoch": 4.658338012352611, "grad_norm": 0.3017025887966156, "learning_rate": 1.4051175251474092e-07, "loss": 0.3671, "step": 5531 }, { "epoch": 4.659180235822571, "grad_norm": 0.30550631880760193, "learning_rate": 1.3982026694120576e-07, "loss": 0.3465, "step": 5532 }, { "epoch": 4.660022459292533, "grad_norm": 0.32052212953567505, "learning_rate": 1.3913046290652598e-07, "loss": 0.372, "step": 5533 }, { "epoch": 4.660864682762493, "grad_norm": 0.29193115234375, "learning_rate": 1.384423406493618e-07, "loss": 0.3215, "step": 5534 }, { "epoch": 4.661706906232453, "grad_norm": 0.31294137239456177, "learning_rate": 1.3775590040779275e-07, "loss": 0.3353, "step": 5535 }, { "epoch": 4.662549129702414, "grad_norm": 0.31451183557510376, "learning_rate": 1.370711424193133e-07, "loss": 0.3826, "step": 5536 }, { "epoch": 4.663391353172375, "grad_norm": 0.3189631700515747, "learning_rate": 1.363880669208395e-07, "loss": 0.3617, "step": 5537 }, { "epoch": 4.664233576642336, "grad_norm": 0.3189268112182617, "learning_rate": 1.3570667414870399e-07, "loss": 0.3427, "step": 5538 }, { "epoch": 4.665075800112296, "grad_norm": 0.2970408499240875, "learning_rate": 1.3502696433865646e-07, "loss": 0.3367, "step": 5539 }, { "epoch": 4.6659180235822575, "grad_norm": 0.2992413640022278, "learning_rate": 1.3434893772586387e-07, "loss": 0.3425, "step": 5540 }, { "epoch": 4.666760247052218, "grad_norm": 0.2947779595851898, "learning_rate": 1.33672594544913e-07, "loss": 0.3202, "step": 5541 }, { "epoch": 4.667602470522178, "grad_norm": 0.3081522583961487, "learning_rate": 1.3299793502980774e-07, "loss": 0.3151, "step": 5542 }, { "epoch": 4.668444693992139, "grad_norm": 0.30582451820373535, "learning_rate": 1.323249594139664e-07, "loss": 0.3698, "step": 5543 }, { "epoch": 4.6692869174621, "grad_norm": 0.3060275912284851, "learning_rate": 1.3165366793022938e-07, "loss": 0.3424, "step": 5544 }, { "epoch": 4.670129140932061, "grad_norm": 0.29892659187316895, "learning_rate": 1.3098406081085036e-07, "loss": 0.37, "step": 5545 }, { "epoch": 4.670971364402021, "grad_norm": 0.29382914304733276, "learning_rate": 1.303161382875029e-07, "loss": 0.3086, "step": 5546 }, { "epoch": 4.671813587871982, "grad_norm": 0.30443695187568665, "learning_rate": 1.2964990059127546e-07, "loss": 0.3662, "step": 5547 }, { "epoch": 4.672655811341943, "grad_norm": 0.31445929408073425, "learning_rate": 1.2898534795267704e-07, "loss": 0.3554, "step": 5548 }, { "epoch": 4.673498034811903, "grad_norm": 0.2892036736011505, "learning_rate": 1.283224806016292e-07, "loss": 0.3402, "step": 5549 }, { "epoch": 4.674340258281864, "grad_norm": 0.31022942066192627, "learning_rate": 1.2766129876747412e-07, "loss": 0.3472, "step": 5550 }, { "epoch": 4.675182481751825, "grad_norm": 0.28944846987724304, "learning_rate": 1.2700180267896878e-07, "loss": 0.3208, "step": 5551 }, { "epoch": 4.676024705221786, "grad_norm": 0.307036817073822, "learning_rate": 1.2634399256428788e-07, "loss": 0.3184, "step": 5552 }, { "epoch": 4.676866928691746, "grad_norm": 0.29626375436782837, "learning_rate": 1.2568786865102156e-07, "loss": 0.3386, "step": 5553 }, { "epoch": 4.677709152161707, "grad_norm": 0.31852808594703674, "learning_rate": 1.2503343116617882e-07, "loss": 0.3644, "step": 5554 }, { "epoch": 4.678551375631668, "grad_norm": 0.3015611171722412, "learning_rate": 1.2438068033618244e-07, "loss": 0.3238, "step": 5555 }, { "epoch": 4.679393599101628, "grad_norm": 0.3288446366786957, "learning_rate": 1.2372961638687454e-07, "loss": 0.3649, "step": 5556 }, { "epoch": 4.680235822571589, "grad_norm": 0.2957417368888855, "learning_rate": 1.2308023954351045e-07, "loss": 0.3094, "step": 5557 }, { "epoch": 4.6810780460415495, "grad_norm": 0.28084447979927063, "learning_rate": 1.2243255003076437e-07, "loss": 0.3353, "step": 5558 }, { "epoch": 4.681920269511511, "grad_norm": 0.3204539716243744, "learning_rate": 1.2178654807272538e-07, "loss": 0.3647, "step": 5559 }, { "epoch": 4.682762492981471, "grad_norm": 0.30956992506980896, "learning_rate": 1.211422338928997e-07, "loss": 0.3411, "step": 5560 }, { "epoch": 4.683604716451431, "grad_norm": 0.2968314588069916, "learning_rate": 1.2049960771420844e-07, "loss": 0.3363, "step": 5561 }, { "epoch": 4.684446939921393, "grad_norm": 0.30981895327568054, "learning_rate": 1.1985866975898875e-07, "loss": 0.3579, "step": 5562 }, { "epoch": 4.685289163391353, "grad_norm": 0.30675390362739563, "learning_rate": 1.1921942024899546e-07, "loss": 0.3398, "step": 5563 }, { "epoch": 4.686131386861314, "grad_norm": 0.30251365900039673, "learning_rate": 1.1858185940539779e-07, "loss": 0.312, "step": 5564 }, { "epoch": 4.6869736103312745, "grad_norm": 0.2876482307910919, "learning_rate": 1.1794598744877928e-07, "loss": 0.3542, "step": 5565 }, { "epoch": 4.687815833801235, "grad_norm": 0.29327523708343506, "learning_rate": 1.173118045991417e-07, "loss": 0.3823, "step": 5566 }, { "epoch": 4.688658057271196, "grad_norm": 0.2782207429409027, "learning_rate": 1.1667931107590236e-07, "loss": 0.3263, "step": 5567 }, { "epoch": 4.689500280741156, "grad_norm": 0.2813691794872284, "learning_rate": 1.1604850709789228e-07, "loss": 0.3649, "step": 5568 }, { "epoch": 4.6903425042111175, "grad_norm": 0.28045913577079773, "learning_rate": 1.1541939288335857e-07, "loss": 0.3442, "step": 5569 }, { "epoch": 4.691184727681078, "grad_norm": 0.29696571826934814, "learning_rate": 1.1479196864996379e-07, "loss": 0.344, "step": 5570 }, { "epoch": 4.692026951151039, "grad_norm": 0.30355557799339294, "learning_rate": 1.1416623461478704e-07, "loss": 0.3517, "step": 5571 }, { "epoch": 4.692869174620999, "grad_norm": 0.302247554063797, "learning_rate": 1.1354219099432128e-07, "loss": 0.3292, "step": 5572 }, { "epoch": 4.69371139809096, "grad_norm": 0.30586758255958557, "learning_rate": 1.1291983800447382e-07, "loss": 0.3481, "step": 5573 }, { "epoch": 4.694553621560921, "grad_norm": 0.3308839499950409, "learning_rate": 1.1229917586056904e-07, "loss": 0.4013, "step": 5574 }, { "epoch": 4.695395845030881, "grad_norm": 0.2834588289260864, "learning_rate": 1.1168020477734576e-07, "loss": 0.315, "step": 5575 }, { "epoch": 4.6962380685008425, "grad_norm": 0.28000709414482117, "learning_rate": 1.110629249689571e-07, "loss": 0.3171, "step": 5576 }, { "epoch": 4.697080291970803, "grad_norm": 0.302685409784317, "learning_rate": 1.1044733664897e-07, "loss": 0.3385, "step": 5577 }, { "epoch": 4.697922515440764, "grad_norm": 0.3143957853317261, "learning_rate": 1.098334400303691e-07, "loss": 0.3472, "step": 5578 }, { "epoch": 4.698764738910724, "grad_norm": 0.32867446541786194, "learning_rate": 1.0922123532555173e-07, "loss": 0.3623, "step": 5579 }, { "epoch": 4.699606962380685, "grad_norm": 0.30518513917922974, "learning_rate": 1.0861072274633012e-07, "loss": 0.3648, "step": 5580 }, { "epoch": 4.700449185850646, "grad_norm": 0.3042427599430084, "learning_rate": 1.0800190250393028e-07, "loss": 0.3554, "step": 5581 }, { "epoch": 4.701291409320606, "grad_norm": 0.30883073806762695, "learning_rate": 1.0739477480899485e-07, "loss": 0.3383, "step": 5582 }, { "epoch": 4.702133632790567, "grad_norm": 0.29946985840797424, "learning_rate": 1.0678933987157913e-07, "loss": 0.3068, "step": 5583 }, { "epoch": 4.702975856260528, "grad_norm": 0.3099057674407959, "learning_rate": 1.0618559790115335e-07, "loss": 0.3494, "step": 5584 }, { "epoch": 4.703818079730489, "grad_norm": 0.3459226191043854, "learning_rate": 1.055835491066004e-07, "loss": 0.3651, "step": 5585 }, { "epoch": 4.704660303200449, "grad_norm": 0.3094930946826935, "learning_rate": 1.0498319369622145e-07, "loss": 0.3342, "step": 5586 }, { "epoch": 4.70550252667041, "grad_norm": 0.27598321437835693, "learning_rate": 1.0438453187772812e-07, "loss": 0.3201, "step": 5587 }, { "epoch": 4.706344750140371, "grad_norm": 0.3204381465911865, "learning_rate": 1.037875638582464e-07, "loss": 0.3899, "step": 5588 }, { "epoch": 4.707186973610331, "grad_norm": 0.2835240364074707, "learning_rate": 1.0319228984431718e-07, "loss": 0.3138, "step": 5589 }, { "epoch": 4.708029197080292, "grad_norm": 0.3196372985839844, "learning_rate": 1.025987100418957e-07, "loss": 0.361, "step": 5590 }, { "epoch": 4.708871420550253, "grad_norm": 0.3441949486732483, "learning_rate": 1.0200682465635048e-07, "loss": 0.3315, "step": 5591 }, { "epoch": 4.709713644020213, "grad_norm": 0.29912835359573364, "learning_rate": 1.014166338924627e-07, "loss": 0.3338, "step": 5592 }, { "epoch": 4.710555867490174, "grad_norm": 0.30664896965026855, "learning_rate": 1.0082813795442902e-07, "loss": 0.365, "step": 5593 }, { "epoch": 4.7113980909601345, "grad_norm": 0.30490735173225403, "learning_rate": 1.0024133704585881e-07, "loss": 0.3223, "step": 5594 }, { "epoch": 4.712240314430096, "grad_norm": 0.3324238061904907, "learning_rate": 9.96562313697752e-08, "loss": 0.3606, "step": 5595 }, { "epoch": 4.713082537900056, "grad_norm": 0.3252851963043213, "learning_rate": 9.907282112861461e-08, "loss": 0.3165, "step": 5596 }, { "epoch": 4.713924761370016, "grad_norm": 0.2994556128978729, "learning_rate": 9.849110652422666e-08, "loss": 0.3701, "step": 5597 }, { "epoch": 4.714766984839978, "grad_norm": 0.287494957447052, "learning_rate": 9.791108775787539e-08, "loss": 0.3267, "step": 5598 }, { "epoch": 4.715609208309938, "grad_norm": 0.28999319672584534, "learning_rate": 9.733276503023692e-08, "loss": 0.3332, "step": 5599 }, { "epoch": 4.716451431779899, "grad_norm": 0.2910189628601074, "learning_rate": 9.675613854140065e-08, "loss": 0.3417, "step": 5600 }, { "epoch": 4.717293655249859, "grad_norm": 0.2823447287082672, "learning_rate": 9.618120849086976e-08, "loss": 0.3102, "step": 5601 }, { "epoch": 4.718135878719821, "grad_norm": 0.30297183990478516, "learning_rate": 9.560797507756014e-08, "loss": 0.3392, "step": 5602 }, { "epoch": 4.718978102189781, "grad_norm": 0.3124760091304779, "learning_rate": 9.503643849980148e-08, "loss": 0.3522, "step": 5603 }, { "epoch": 4.719820325659741, "grad_norm": 0.28454962372779846, "learning_rate": 9.446659895533394e-08, "loss": 0.3353, "step": 5604 }, { "epoch": 4.7206625491297025, "grad_norm": 0.2971835732460022, "learning_rate": 9.38984566413137e-08, "loss": 0.3066, "step": 5605 }, { "epoch": 4.721504772599663, "grad_norm": 0.30358248949050903, "learning_rate": 9.333201175430851e-08, "loss": 0.3503, "step": 5606 }, { "epoch": 4.722346996069624, "grad_norm": 0.29188260436058044, "learning_rate": 9.276726449029716e-08, "loss": 0.3102, "step": 5607 }, { "epoch": 4.723189219539584, "grad_norm": 0.3273991048336029, "learning_rate": 9.22042150446728e-08, "loss": 0.3861, "step": 5608 }, { "epoch": 4.724031443009546, "grad_norm": 0.2715218663215637, "learning_rate": 9.164286361224239e-08, "loss": 0.2911, "step": 5609 }, { "epoch": 4.724873666479506, "grad_norm": 0.2905413508415222, "learning_rate": 9.108321038722278e-08, "loss": 0.3459, "step": 5610 }, { "epoch": 4.725715889949466, "grad_norm": 0.2820955216884613, "learning_rate": 9.052525556324354e-08, "loss": 0.3486, "step": 5611 }, { "epoch": 4.726558113419427, "grad_norm": 0.3158682584762573, "learning_rate": 8.996899933334913e-08, "loss": 0.3902, "step": 5612 }, { "epoch": 4.727400336889388, "grad_norm": 0.3170660436153412, "learning_rate": 8.941444188999394e-08, "loss": 0.3465, "step": 5613 }, { "epoch": 4.728242560359349, "grad_norm": 0.31234076619148254, "learning_rate": 8.886158342504503e-08, "loss": 0.3578, "step": 5614 }, { "epoch": 4.729084783829309, "grad_norm": 0.2867434024810791, "learning_rate": 8.83104241297822e-08, "loss": 0.3215, "step": 5615 }, { "epoch": 4.7299270072992705, "grad_norm": 0.317394882440567, "learning_rate": 8.776096419489732e-08, "loss": 0.3681, "step": 5616 }, { "epoch": 4.730769230769231, "grad_norm": 0.31549790501594543, "learning_rate": 8.721320381049391e-08, "loss": 0.3227, "step": 5617 }, { "epoch": 4.731611454239191, "grad_norm": 0.2986947298049927, "learning_rate": 8.666714316608815e-08, "loss": 0.2964, "step": 5618 }, { "epoch": 4.732453677709152, "grad_norm": 0.3180459439754486, "learning_rate": 8.612278245060668e-08, "loss": 0.3569, "step": 5619 }, { "epoch": 4.733295901179113, "grad_norm": 0.30311572551727295, "learning_rate": 8.55801218523894e-08, "loss": 0.3213, "step": 5620 }, { "epoch": 4.734138124649074, "grad_norm": 0.30118653178215027, "learning_rate": 8.503916155918779e-08, "loss": 0.3275, "step": 5621 }, { "epoch": 4.734980348119034, "grad_norm": 0.335983008146286, "learning_rate": 8.449990175816492e-08, "loss": 0.3459, "step": 5622 }, { "epoch": 4.7358225715889954, "grad_norm": 0.28725576400756836, "learning_rate": 8.396234263589542e-08, "loss": 0.3206, "step": 5623 }, { "epoch": 4.736664795058956, "grad_norm": 0.3218049108982086, "learning_rate": 8.342648437836498e-08, "loss": 0.3803, "step": 5624 }, { "epoch": 4.737507018528916, "grad_norm": 0.30028271675109863, "learning_rate": 8.289232717097195e-08, "loss": 0.3272, "step": 5625 }, { "epoch": 4.738349241998877, "grad_norm": 0.31726744771003723, "learning_rate": 8.235987119852517e-08, "loss": 0.3672, "step": 5626 }, { "epoch": 4.739191465468838, "grad_norm": 0.30510202050209045, "learning_rate": 8.182911664524562e-08, "loss": 0.3488, "step": 5627 }, { "epoch": 4.740033688938798, "grad_norm": 0.32230767607688904, "learning_rate": 8.130006369476473e-08, "loss": 0.3741, "step": 5628 }, { "epoch": 4.740875912408759, "grad_norm": 0.2858891487121582, "learning_rate": 8.077271253012664e-08, "loss": 0.3066, "step": 5629 }, { "epoch": 4.7417181358787195, "grad_norm": 0.3059341311454773, "learning_rate": 8.02470633337854e-08, "loss": 0.3642, "step": 5630 }, { "epoch": 4.742560359348681, "grad_norm": 0.30419448018074036, "learning_rate": 7.97231162876061e-08, "loss": 0.349, "step": 5631 }, { "epoch": 4.743402582818641, "grad_norm": 0.31899645924568176, "learning_rate": 7.920087157286594e-08, "loss": 0.3369, "step": 5632 }, { "epoch": 4.744244806288602, "grad_norm": 0.3207145631313324, "learning_rate": 7.868032937025317e-08, "loss": 0.3409, "step": 5633 }, { "epoch": 4.745087029758563, "grad_norm": 0.305549293756485, "learning_rate": 7.816148985986483e-08, "loss": 0.3464, "step": 5634 }, { "epoch": 4.745929253228523, "grad_norm": 0.30695635080337524, "learning_rate": 7.764435322121233e-08, "loss": 0.3496, "step": 5635 }, { "epoch": 4.746771476698484, "grad_norm": 0.3046221435070038, "learning_rate": 7.712891963321478e-08, "loss": 0.3682, "step": 5636 }, { "epoch": 4.747613700168444, "grad_norm": 0.307830810546875, "learning_rate": 7.661518927420452e-08, "loss": 0.3559, "step": 5637 }, { "epoch": 4.748455923638406, "grad_norm": 0.31099459528923035, "learning_rate": 7.610316232192216e-08, "loss": 0.3451, "step": 5638 }, { "epoch": 4.749298147108366, "grad_norm": 0.3037114143371582, "learning_rate": 7.559283895352099e-08, "loss": 0.3629, "step": 5639 }, { "epoch": 4.750140370578327, "grad_norm": 0.3054758310317993, "learning_rate": 7.50842193455642e-08, "loss": 0.3487, "step": 5640 }, { "epoch": 4.7509825940482875, "grad_norm": 0.31322112679481506, "learning_rate": 7.45773036740255e-08, "loss": 0.3453, "step": 5641 }, { "epoch": 4.751824817518248, "grad_norm": 0.30124083161354065, "learning_rate": 7.407209211428845e-08, "loss": 0.3573, "step": 5642 }, { "epoch": 4.752667040988209, "grad_norm": 0.3049775958061218, "learning_rate": 7.356858484114771e-08, "loss": 0.3356, "step": 5643 }, { "epoch": 4.753509264458169, "grad_norm": 0.2900490462779999, "learning_rate": 7.30667820288089e-08, "loss": 0.3319, "step": 5644 }, { "epoch": 4.754351487928131, "grad_norm": 0.3235769271850586, "learning_rate": 7.25666838508865e-08, "loss": 0.3491, "step": 5645 }, { "epoch": 4.755193711398091, "grad_norm": 0.3127809464931488, "learning_rate": 7.206829048040597e-08, "loss": 0.3274, "step": 5646 }, { "epoch": 4.756035934868052, "grad_norm": 0.31862369179725647, "learning_rate": 7.157160208980274e-08, "loss": 0.3502, "step": 5647 }, { "epoch": 4.756878158338012, "grad_norm": 0.2942161560058594, "learning_rate": 7.107661885092321e-08, "loss": 0.3104, "step": 5648 }, { "epoch": 4.757720381807973, "grad_norm": 0.3117566704750061, "learning_rate": 7.058334093502262e-08, "loss": 0.3494, "step": 5649 }, { "epoch": 4.758562605277934, "grad_norm": 0.3393673598766327, "learning_rate": 7.009176851276666e-08, "loss": 0.3806, "step": 5650 }, { "epoch": 4.759404828747894, "grad_norm": 0.29275092482566833, "learning_rate": 6.960190175423043e-08, "loss": 0.2998, "step": 5651 }, { "epoch": 4.7602470522178555, "grad_norm": 0.29389050602912903, "learning_rate": 6.911374082890054e-08, "loss": 0.3446, "step": 5652 }, { "epoch": 4.761089275687816, "grad_norm": 0.3141312301158905, "learning_rate": 6.862728590567136e-08, "loss": 0.3549, "step": 5653 }, { "epoch": 4.761931499157777, "grad_norm": 0.30751723051071167, "learning_rate": 6.814253715284824e-08, "loss": 0.313, "step": 5654 }, { "epoch": 4.762773722627737, "grad_norm": 0.3150999844074249, "learning_rate": 6.765949473814648e-08, "loss": 0.3443, "step": 5655 }, { "epoch": 4.763615946097698, "grad_norm": 0.2988455295562744, "learning_rate": 6.717815882868961e-08, "loss": 0.3132, "step": 5656 }, { "epoch": 4.764458169567659, "grad_norm": 0.29774540662765503, "learning_rate": 6.669852959101219e-08, "loss": 0.3342, "step": 5657 }, { "epoch": 4.765300393037619, "grad_norm": 0.2958545982837677, "learning_rate": 6.622060719105761e-08, "loss": 0.3647, "step": 5658 }, { "epoch": 4.7661426165075795, "grad_norm": 0.29149940609931946, "learning_rate": 6.574439179417802e-08, "loss": 0.3261, "step": 5659 }, { "epoch": 4.766984839977541, "grad_norm": 0.3194414973258972, "learning_rate": 6.526988356513719e-08, "loss": 0.4037, "step": 5660 }, { "epoch": 4.767827063447501, "grad_norm": 0.27769002318382263, "learning_rate": 6.479708266810603e-08, "loss": 0.3188, "step": 5661 }, { "epoch": 4.768669286917462, "grad_norm": 0.29778826236724854, "learning_rate": 6.432598926666589e-08, "loss": 0.3733, "step": 5662 }, { "epoch": 4.769511510387423, "grad_norm": 0.3126421868801117, "learning_rate": 6.385660352380585e-08, "loss": 0.3732, "step": 5663 }, { "epoch": 4.770353733857384, "grad_norm": 0.31895026564598083, "learning_rate": 6.338892560192711e-08, "loss": 0.3266, "step": 5664 }, { "epoch": 4.771195957327344, "grad_norm": 0.29969674348831177, "learning_rate": 6.292295566283691e-08, "loss": 0.309, "step": 5665 }, { "epoch": 4.7720381807973045, "grad_norm": 0.32613930106163025, "learning_rate": 6.245869386775405e-08, "loss": 0.3556, "step": 5666 }, { "epoch": 4.772880404267266, "grad_norm": 0.32746392488479614, "learning_rate": 6.199614037730339e-08, "loss": 0.3748, "step": 5667 }, { "epoch": 4.773722627737226, "grad_norm": 0.2806459963321686, "learning_rate": 6.153529535152247e-08, "loss": 0.318, "step": 5668 }, { "epoch": 4.774564851207187, "grad_norm": 0.2849215269088745, "learning_rate": 6.107615894985375e-08, "loss": 0.3491, "step": 5669 }, { "epoch": 4.7754070746771475, "grad_norm": 0.28077155351638794, "learning_rate": 6.061873133115237e-08, "loss": 0.3205, "step": 5670 }, { "epoch": 4.776249298147109, "grad_norm": 0.30299854278564453, "learning_rate": 6.016301265367952e-08, "loss": 0.3483, "step": 5671 }, { "epoch": 4.777091521617069, "grad_norm": 0.2764627933502197, "learning_rate": 5.970900307510574e-08, "loss": 0.2955, "step": 5672 }, { "epoch": 4.777933745087029, "grad_norm": 0.3304811120033264, "learning_rate": 5.9256702752512054e-08, "loss": 0.3654, "step": 5673 }, { "epoch": 4.778775968556991, "grad_norm": 0.31006479263305664, "learning_rate": 5.8806111842384936e-08, "loss": 0.3447, "step": 5674 }, { "epoch": 4.779618192026951, "grad_norm": 0.2951917052268982, "learning_rate": 5.835723050062192e-08, "loss": 0.3594, "step": 5675 }, { "epoch": 4.780460415496912, "grad_norm": 0.291292279958725, "learning_rate": 5.791005888252765e-08, "loss": 0.3387, "step": 5676 }, { "epoch": 4.7813026389668725, "grad_norm": 0.2950809597969055, "learning_rate": 5.746459714281727e-08, "loss": 0.3388, "step": 5677 }, { "epoch": 4.782144862436834, "grad_norm": 0.3020569980144501, "learning_rate": 5.7020845435611375e-08, "loss": 0.3116, "step": 5678 }, { "epoch": 4.782987085906794, "grad_norm": 0.31103676557540894, "learning_rate": 5.657880391444104e-08, "loss": 0.3719, "step": 5679 }, { "epoch": 4.783829309376754, "grad_norm": 0.3137308955192566, "learning_rate": 5.6138472732244486e-08, "loss": 0.3349, "step": 5680 }, { "epoch": 4.7846715328467155, "grad_norm": 0.29804766178131104, "learning_rate": 5.569985204137041e-08, "loss": 0.3079, "step": 5681 }, { "epoch": 4.785513756316676, "grad_norm": 0.3087077736854553, "learning_rate": 5.526294199357241e-08, "loss": 0.3714, "step": 5682 }, { "epoch": 4.786355979786637, "grad_norm": 0.3085900843143463, "learning_rate": 5.482774274001401e-08, "loss": 0.3264, "step": 5683 }, { "epoch": 4.787198203256597, "grad_norm": 0.32058441638946533, "learning_rate": 5.439425443126756e-08, "loss": 0.3629, "step": 5684 }, { "epoch": 4.788040426726559, "grad_norm": 0.2987420856952667, "learning_rate": 5.396247721731196e-08, "loss": 0.3119, "step": 5685 }, { "epoch": 4.788882650196519, "grad_norm": 0.3095020651817322, "learning_rate": 5.353241124753439e-08, "loss": 0.3686, "step": 5686 }, { "epoch": 4.789724873666479, "grad_norm": 0.2913981080055237, "learning_rate": 5.310405667073137e-08, "loss": 0.3093, "step": 5687 }, { "epoch": 4.7905670971364405, "grad_norm": 0.31319451332092285, "learning_rate": 5.267741363510548e-08, "loss": 0.3641, "step": 5688 }, { "epoch": 4.791409320606401, "grad_norm": 0.3048374354839325, "learning_rate": 5.225248228826807e-08, "loss": 0.3229, "step": 5689 }, { "epoch": 4.792251544076361, "grad_norm": 0.29327091574668884, "learning_rate": 5.182926277723821e-08, "loss": 0.3178, "step": 5690 }, { "epoch": 4.793093767546322, "grad_norm": 0.30400529503822327, "learning_rate": 5.1407755248442106e-08, "loss": 0.3232, "step": 5691 }, { "epoch": 4.793935991016283, "grad_norm": 0.3013661205768585, "learning_rate": 5.0987959847714766e-08, "loss": 0.3434, "step": 5692 }, { "epoch": 4.794778214486244, "grad_norm": 0.29228347539901733, "learning_rate": 5.056987672029778e-08, "loss": 0.3511, "step": 5693 }, { "epoch": 4.795620437956204, "grad_norm": 0.28762441873550415, "learning_rate": 5.015350601084101e-08, "loss": 0.333, "step": 5694 }, { "epoch": 4.796462661426165, "grad_norm": 0.3245258033275604, "learning_rate": 4.9738847863400887e-08, "loss": 0.3765, "step": 5695 }, { "epoch": 4.797304884896126, "grad_norm": 0.3245284855365753, "learning_rate": 4.932590242144375e-08, "loss": 0.3687, "step": 5696 }, { "epoch": 4.798147108366086, "grad_norm": 0.3027959167957306, "learning_rate": 4.8914669827839765e-08, "loss": 0.3029, "step": 5697 }, { "epoch": 4.798989331836047, "grad_norm": 0.3278782069683075, "learning_rate": 4.8505150224869015e-08, "loss": 0.3399, "step": 5698 }, { "epoch": 4.799831555306008, "grad_norm": 0.2896909713745117, "learning_rate": 4.809734375421815e-08, "loss": 0.3041, "step": 5699 }, { "epoch": 4.800673778775969, "grad_norm": 0.29354509711265564, "learning_rate": 4.769125055698209e-08, "loss": 0.3202, "step": 5700 }, { "epoch": 4.801516002245929, "grad_norm": 0.3354615867137909, "learning_rate": 4.728687077366123e-08, "loss": 0.3998, "step": 5701 }, { "epoch": 4.80235822571589, "grad_norm": 0.2853464186191559, "learning_rate": 4.6884204544164204e-08, "loss": 0.3034, "step": 5702 }, { "epoch": 4.803200449185851, "grad_norm": 0.28177782893180847, "learning_rate": 4.648325200780624e-08, "loss": 0.3038, "step": 5703 }, { "epoch": 4.804042672655811, "grad_norm": 0.3015603721141815, "learning_rate": 4.6084013303310804e-08, "loss": 0.3676, "step": 5704 }, { "epoch": 4.804884896125772, "grad_norm": 0.30443432927131653, "learning_rate": 4.568648856880742e-08, "loss": 0.3138, "step": 5705 }, { "epoch": 4.8057271195957325, "grad_norm": 0.30778244137763977, "learning_rate": 4.529067794183217e-08, "loss": 0.3571, "step": 5706 }, { "epoch": 4.806569343065694, "grad_norm": 0.31804847717285156, "learning_rate": 4.489658155932941e-08, "loss": 0.3428, "step": 5707 }, { "epoch": 4.807411566535654, "grad_norm": 0.29810309410095215, "learning_rate": 4.4504199557650063e-08, "loss": 0.3407, "step": 5708 }, { "epoch": 4.808253790005615, "grad_norm": 0.2829437255859375, "learning_rate": 4.411353207255109e-08, "loss": 0.3275, "step": 5709 }, { "epoch": 4.809096013475576, "grad_norm": 0.3120591640472412, "learning_rate": 4.372457923919604e-08, "loss": 0.3567, "step": 5710 }, { "epoch": 4.809938236945536, "grad_norm": 0.3094587028026581, "learning_rate": 4.3337341192157265e-08, "loss": 0.3249, "step": 5711 }, { "epoch": 4.810780460415497, "grad_norm": 0.3080337345600128, "learning_rate": 4.295181806541204e-08, "loss": 0.3361, "step": 5712 }, { "epoch": 4.811622683885457, "grad_norm": 0.30753934383392334, "learning_rate": 4.256800999234423e-08, "loss": 0.3077, "step": 5713 }, { "epoch": 4.812464907355419, "grad_norm": 0.3032006323337555, "learning_rate": 4.2185917105744825e-08, "loss": 0.3716, "step": 5714 }, { "epoch": 4.813307130825379, "grad_norm": 0.3190382421016693, "learning_rate": 4.1805539537812525e-08, "loss": 0.3573, "step": 5715 }, { "epoch": 4.81414935429534, "grad_norm": 0.306581050157547, "learning_rate": 4.142687742015039e-08, "loss": 0.3504, "step": 5716 }, { "epoch": 4.8149915777653005, "grad_norm": 0.29406115412712097, "learning_rate": 4.104993088376974e-08, "loss": 0.3396, "step": 5717 }, { "epoch": 4.815833801235261, "grad_norm": 0.286020427942276, "learning_rate": 4.067470005908625e-08, "loss": 0.3537, "step": 5718 }, { "epoch": 4.816676024705222, "grad_norm": 0.2976120114326477, "learning_rate": 4.0301185075925506e-08, "loss": 0.3449, "step": 5719 }, { "epoch": 4.817518248175182, "grad_norm": 0.31784486770629883, "learning_rate": 3.9929386063515266e-08, "loss": 0.327, "step": 5720 }, { "epoch": 4.818360471645143, "grad_norm": 0.3107951283454895, "learning_rate": 3.955930315049261e-08, "loss": 0.3384, "step": 5721 }, { "epoch": 4.819202695115104, "grad_norm": 0.2975400686264038, "learning_rate": 3.919093646490013e-08, "loss": 0.3258, "step": 5722 }, { "epoch": 4.820044918585064, "grad_norm": 0.2970634698867798, "learning_rate": 3.882428613418532e-08, "loss": 0.3204, "step": 5723 }, { "epoch": 4.820887142055025, "grad_norm": 0.2977302074432373, "learning_rate": 3.845935228520448e-08, "loss": 0.3257, "step": 5724 }, { "epoch": 4.821729365524986, "grad_norm": 0.2997322380542755, "learning_rate": 3.809613504421661e-08, "loss": 0.3488, "step": 5725 }, { "epoch": 4.822571588994947, "grad_norm": 0.3161165118217468, "learning_rate": 3.773463453689008e-08, "loss": 0.3303, "step": 5726 }, { "epoch": 4.823413812464907, "grad_norm": 0.30900323390960693, "learning_rate": 3.7374850888297065e-08, "loss": 0.3279, "step": 5727 }, { "epoch": 4.824256035934868, "grad_norm": 0.29855576157569885, "learning_rate": 3.7016784222917436e-08, "loss": 0.3644, "step": 5728 }, { "epoch": 4.825098259404829, "grad_norm": 0.30122220516204834, "learning_rate": 3.666043466463487e-08, "loss": 0.338, "step": 5729 }, { "epoch": 4.825940482874789, "grad_norm": 0.3037762939929962, "learning_rate": 3.6305802336740745e-08, "loss": 0.3318, "step": 5730 }, { "epoch": 4.82678270634475, "grad_norm": 0.304608553647995, "learning_rate": 3.595288736193248e-08, "loss": 0.3174, "step": 5731 }, { "epoch": 4.827624929814711, "grad_norm": 0.3072862923145294, "learning_rate": 3.560168986231183e-08, "loss": 0.3491, "step": 5732 }, { "epoch": 4.828467153284672, "grad_norm": 0.2946315407752991, "learning_rate": 3.5252209959387163e-08, "loss": 0.3721, "step": 5733 }, { "epoch": 4.829309376754632, "grad_norm": 0.2890043258666992, "learning_rate": 3.490444777407287e-08, "loss": 0.3131, "step": 5734 }, { "epoch": 4.830151600224593, "grad_norm": 0.3176303207874298, "learning_rate": 3.455840342668826e-08, "loss": 0.3395, "step": 5735 }, { "epoch": 4.830993823694554, "grad_norm": 0.3161151707172394, "learning_rate": 3.421407703695922e-08, "loss": 0.3579, "step": 5736 }, { "epoch": 4.831836047164514, "grad_norm": 0.2946292459964752, "learning_rate": 3.387146872401603e-08, "loss": 0.3637, "step": 5737 }, { "epoch": 4.832678270634475, "grad_norm": 0.324479341506958, "learning_rate": 3.353057860639608e-08, "loss": 0.3705, "step": 5738 }, { "epoch": 4.833520494104436, "grad_norm": 0.3087499737739563, "learning_rate": 3.3191406802041693e-08, "loss": 0.3309, "step": 5739 }, { "epoch": 4.834362717574397, "grad_norm": 0.2990916073322296, "learning_rate": 3.2853953428299555e-08, "loss": 0.3227, "step": 5740 }, { "epoch": 4.835204941044357, "grad_norm": 0.3037185072898865, "learning_rate": 3.2518218601922944e-08, "loss": 0.3331, "step": 5741 }, { "epoch": 4.8360471645143175, "grad_norm": 0.3140299618244171, "learning_rate": 3.2184202439071165e-08, "loss": 0.3398, "step": 5742 }, { "epoch": 4.836889387984279, "grad_norm": 0.3145023584365845, "learning_rate": 3.185190505530733e-08, "loss": 0.3376, "step": 5743 }, { "epoch": 4.837731611454239, "grad_norm": 0.3167359530925751, "learning_rate": 3.152132656560114e-08, "loss": 0.3512, "step": 5744 }, { "epoch": 4.8385738349242, "grad_norm": 0.28793075680732727, "learning_rate": 3.1192467084326106e-08, "loss": 0.3501, "step": 5745 }, { "epoch": 4.839416058394161, "grad_norm": 0.3013622760772705, "learning_rate": 3.0865326725263435e-08, "loss": 0.3344, "step": 5746 }, { "epoch": 4.840258281864122, "grad_norm": 0.3026391565799713, "learning_rate": 3.053990560159703e-08, "loss": 0.3559, "step": 5747 }, { "epoch": 4.841100505334082, "grad_norm": 0.30241531133651733, "learning_rate": 3.021620382591683e-08, "loss": 0.327, "step": 5748 }, { "epoch": 4.841942728804042, "grad_norm": 0.3067898452281952, "learning_rate": 2.989422151021881e-08, "loss": 0.3747, "step": 5749 }, { "epoch": 4.842784952274004, "grad_norm": 0.2729976773262024, "learning_rate": 2.9573958765903854e-08, "loss": 0.3237, "step": 5750 }, { "epoch": 4.843627175743964, "grad_norm": 0.31082040071487427, "learning_rate": 2.92554157037761e-08, "loss": 0.374, "step": 5751 }, { "epoch": 4.844469399213924, "grad_norm": 0.30378031730651855, "learning_rate": 2.8938592434046285e-08, "loss": 0.3445, "step": 5752 }, { "epoch": 4.8453116226838855, "grad_norm": 0.3234860301017761, "learning_rate": 2.86234890663295e-08, "loss": 0.3549, "step": 5753 }, { "epoch": 4.846153846153846, "grad_norm": 0.3207099735736847, "learning_rate": 2.831010570964743e-08, "loss": 0.3392, "step": 5754 }, { "epoch": 4.846996069623807, "grad_norm": 0.3113251030445099, "learning_rate": 2.7998442472424446e-08, "loss": 0.3421, "step": 5755 }, { "epoch": 4.847838293093767, "grad_norm": 0.29584014415740967, "learning_rate": 2.768849946249097e-08, "loss": 0.3468, "step": 5756 }, { "epoch": 4.848680516563729, "grad_norm": 0.30705830454826355, "learning_rate": 2.738027678708177e-08, "loss": 0.3288, "step": 5757 }, { "epoch": 4.849522740033689, "grad_norm": 0.3223762810230255, "learning_rate": 2.7073774552836553e-08, "loss": 0.3628, "step": 5758 }, { "epoch": 4.850364963503649, "grad_norm": 0.3205292820930481, "learning_rate": 2.676899286579937e-08, "loss": 0.3474, "step": 5759 }, { "epoch": 4.85120718697361, "grad_norm": 0.3206430673599243, "learning_rate": 2.6465931831420877e-08, "loss": 0.3289, "step": 5760 }, { "epoch": 4.852049410443571, "grad_norm": 0.30151861906051636, "learning_rate": 2.6164591554553865e-08, "loss": 0.3681, "step": 5761 }, { "epoch": 4.852891633913532, "grad_norm": 0.31265881657600403, "learning_rate": 2.5864972139457157e-08, "loss": 0.307, "step": 5762 }, { "epoch": 4.853733857383492, "grad_norm": 0.30619922280311584, "learning_rate": 2.5567073689793387e-08, "loss": 0.3162, "step": 5763 }, { "epoch": 4.8545760808534535, "grad_norm": 0.29296785593032837, "learning_rate": 2.5270896308631775e-08, "loss": 0.3292, "step": 5764 }, { "epoch": 4.855418304323414, "grad_norm": 0.3121234178543091, "learning_rate": 2.4976440098443135e-08, "loss": 0.3392, "step": 5765 }, { "epoch": 4.856260527793374, "grad_norm": 0.3216954469680786, "learning_rate": 2.4683705161104852e-08, "loss": 0.335, "step": 5766 }, { "epoch": 4.857102751263335, "grad_norm": 0.2805759012699127, "learning_rate": 2.4392691597898144e-08, "loss": 0.3139, "step": 5767 }, { "epoch": 4.857944974733296, "grad_norm": 0.3292483687400818, "learning_rate": 2.410339950950913e-08, "loss": 0.372, "step": 5768 }, { "epoch": 4.858787198203257, "grad_norm": 0.3173235058784485, "learning_rate": 2.3815828996027192e-08, "loss": 0.3458, "step": 5769 }, { "epoch": 4.859629421673217, "grad_norm": 0.29461249709129333, "learning_rate": 2.3529980156947185e-08, "loss": 0.3394, "step": 5770 }, { "epoch": 4.860471645143178, "grad_norm": 0.30249837040901184, "learning_rate": 2.3245853091167782e-08, "loss": 0.3337, "step": 5771 }, { "epoch": 4.861313868613139, "grad_norm": 0.3085329234600067, "learning_rate": 2.2963447896992564e-08, "loss": 0.3551, "step": 5772 }, { "epoch": 4.862156092083099, "grad_norm": 0.2963785231113434, "learning_rate": 2.2682764672127823e-08, "loss": 0.3448, "step": 5773 }, { "epoch": 4.86299831555306, "grad_norm": 0.3008818030357361, "learning_rate": 2.240380351368643e-08, "loss": 0.3605, "step": 5774 }, { "epoch": 4.863840539023021, "grad_norm": 0.30624938011169434, "learning_rate": 2.21265645181834e-08, "loss": 0.3229, "step": 5775 }, { "epoch": 4.864682762492982, "grad_norm": 0.3013368546962738, "learning_rate": 2.1851047781538682e-08, "loss": 0.3386, "step": 5776 }, { "epoch": 4.865524985962942, "grad_norm": 0.3056620657444, "learning_rate": 2.157725339907657e-08, "loss": 0.3501, "step": 5777 }, { "epoch": 4.866367209432903, "grad_norm": 0.30760157108306885, "learning_rate": 2.130518146552463e-08, "loss": 0.3305, "step": 5778 }, { "epoch": 4.867209432902864, "grad_norm": 0.30289140343666077, "learning_rate": 2.1034832075016443e-08, "loss": 0.332, "step": 5779 }, { "epoch": 4.868051656372824, "grad_norm": 0.3024524450302124, "learning_rate": 2.076620532108664e-08, "loss": 0.3729, "step": 5780 }, { "epoch": 4.868893879842785, "grad_norm": 0.3065566420555115, "learning_rate": 2.0499301296676434e-08, "loss": 0.3356, "step": 5781 }, { "epoch": 4.8697361033127455, "grad_norm": 0.2961845397949219, "learning_rate": 2.023412009413028e-08, "loss": 0.2916, "step": 5782 }, { "epoch": 4.870578326782706, "grad_norm": 0.3265790045261383, "learning_rate": 1.9970661805195358e-08, "loss": 0.3827, "step": 5783 }, { "epoch": 4.871420550252667, "grad_norm": 0.28887301683425903, "learning_rate": 1.9708926521024297e-08, "loss": 0.3563, "step": 5784 }, { "epoch": 4.872262773722627, "grad_norm": 0.29949790239334106, "learning_rate": 1.9448914332172996e-08, "loss": 0.3502, "step": 5785 }, { "epoch": 4.873104997192589, "grad_norm": 0.31299740076065063, "learning_rate": 1.9190625328601164e-08, "loss": 0.3612, "step": 5786 }, { "epoch": 4.873947220662549, "grad_norm": 0.2861705720424652, "learning_rate": 1.8934059599672872e-08, "loss": 0.3191, "step": 5787 }, { "epoch": 4.87478944413251, "grad_norm": 0.3150510787963867, "learning_rate": 1.8679217234154335e-08, "loss": 0.3364, "step": 5788 }, { "epoch": 4.8756316676024705, "grad_norm": 0.3012487590312958, "learning_rate": 1.84260983202178e-08, "loss": 0.3366, "step": 5789 }, { "epoch": 4.876473891072431, "grad_norm": 0.30428972840309143, "learning_rate": 1.8174702945437106e-08, "loss": 0.3481, "step": 5790 }, { "epoch": 4.877316114542392, "grad_norm": 0.3319021761417389, "learning_rate": 1.7925031196791565e-08, "loss": 0.3745, "step": 5791 }, { "epoch": 4.878158338012352, "grad_norm": 0.2938607335090637, "learning_rate": 1.7677083160662632e-08, "loss": 0.3531, "step": 5792 }, { "epoch": 4.8790005614823135, "grad_norm": 0.29848748445510864, "learning_rate": 1.7430858922836692e-08, "loss": 0.3477, "step": 5793 }, { "epoch": 4.879842784952274, "grad_norm": 0.2871935963630676, "learning_rate": 1.718635856850226e-08, "loss": 0.2973, "step": 5794 }, { "epoch": 4.880685008422235, "grad_norm": 0.3235379755496979, "learning_rate": 1.6943582182253338e-08, "loss": 0.3674, "step": 5795 }, { "epoch": 4.881527231892195, "grad_norm": 0.30829304456710815, "learning_rate": 1.6702529848085514e-08, "loss": 0.3098, "step": 5796 }, { "epoch": 4.882369455362156, "grad_norm": 0.29342740774154663, "learning_rate": 1.6463201649399297e-08, "loss": 0.2812, "step": 5797 }, { "epoch": 4.883211678832117, "grad_norm": 0.30406466126441956, "learning_rate": 1.6225597668997894e-08, "loss": 0.3449, "step": 5798 }, { "epoch": 4.884053902302077, "grad_norm": 0.2997269928455353, "learning_rate": 1.598971798908888e-08, "loss": 0.3276, "step": 5799 }, { "epoch": 4.8848961257720385, "grad_norm": 0.31418734788894653, "learning_rate": 1.5755562691281424e-08, "loss": 0.3394, "step": 5800 }, { "epoch": 4.885738349241999, "grad_norm": 0.3088889420032501, "learning_rate": 1.55231318565896e-08, "loss": 0.3323, "step": 5801 }, { "epoch": 4.88658057271196, "grad_norm": 0.2890072464942932, "learning_rate": 1.5292425565430758e-08, "loss": 0.314, "step": 5802 }, { "epoch": 4.88742279618192, "grad_norm": 0.3144424557685852, "learning_rate": 1.5063443897625484e-08, "loss": 0.3668, "step": 5803 }, { "epoch": 4.888265019651881, "grad_norm": 0.29707667231559753, "learning_rate": 1.483618693239708e-08, "loss": 0.3264, "step": 5804 }, { "epoch": 4.889107243121842, "grad_norm": 0.3093074560165405, "learning_rate": 1.4610654748373198e-08, "loss": 0.381, "step": 5805 }, { "epoch": 4.889949466591802, "grad_norm": 0.28467121720314026, "learning_rate": 1.4386847423583094e-08, "loss": 0.3366, "step": 5806 }, { "epoch": 4.890791690061763, "grad_norm": 0.28801870346069336, "learning_rate": 1.4164765035461492e-08, "loss": 0.3504, "step": 5807 }, { "epoch": 4.891633913531724, "grad_norm": 0.2775857746601105, "learning_rate": 1.3944407660843596e-08, "loss": 0.3258, "step": 5808 }, { "epoch": 4.892476137001685, "grad_norm": 0.3047701418399811, "learning_rate": 1.3725775375970641e-08, "loss": 0.3212, "step": 5809 }, { "epoch": 4.893318360471645, "grad_norm": 0.3002820312976837, "learning_rate": 1.3508868256484896e-08, "loss": 0.337, "step": 5810 }, { "epoch": 4.894160583941606, "grad_norm": 0.31145861744880676, "learning_rate": 1.3293686377433002e-08, "loss": 0.377, "step": 5811 }, { "epoch": 4.895002807411567, "grad_norm": 0.306947261095047, "learning_rate": 1.3080229813263734e-08, "loss": 0.3561, "step": 5812 }, { "epoch": 4.895845030881527, "grad_norm": 0.30101701617240906, "learning_rate": 1.2868498637829686e-08, "loss": 0.3556, "step": 5813 }, { "epoch": 4.896687254351487, "grad_norm": 0.291780948638916, "learning_rate": 1.2658492924386145e-08, "loss": 0.3238, "step": 5814 }, { "epoch": 4.897529477821449, "grad_norm": 0.29887568950653076, "learning_rate": 1.2450212745591661e-08, "loss": 0.3473, "step": 5815 }, { "epoch": 4.898371701291409, "grad_norm": 0.321196973323822, "learning_rate": 1.2243658173506923e-08, "loss": 0.3774, "step": 5816 }, { "epoch": 4.89921392476137, "grad_norm": 0.3074091970920563, "learning_rate": 1.2038829279596986e-08, "loss": 0.3337, "step": 5817 }, { "epoch": 4.9000561482313305, "grad_norm": 0.3146440386772156, "learning_rate": 1.1835726134729054e-08, "loss": 0.3739, "step": 5818 }, { "epoch": 4.900898371701292, "grad_norm": 0.2965931296348572, "learning_rate": 1.1634348809173023e-08, "loss": 0.3508, "step": 5819 }, { "epoch": 4.901740595171252, "grad_norm": 0.30223792791366577, "learning_rate": 1.1434697372602055e-08, "loss": 0.3495, "step": 5820 }, { "epoch": 4.902582818641212, "grad_norm": 0.30042195320129395, "learning_rate": 1.1236771894092557e-08, "loss": 0.323, "step": 5821 }, { "epoch": 4.903425042111174, "grad_norm": 0.30732473731040955, "learning_rate": 1.1040572442122532e-08, "loss": 0.3665, "step": 5822 }, { "epoch": 4.904267265581134, "grad_norm": 0.29967236518859863, "learning_rate": 1.0846099084574346e-08, "loss": 0.3249, "step": 5823 }, { "epoch": 4.905109489051095, "grad_norm": 0.2879403531551361, "learning_rate": 1.065335188873251e-08, "loss": 0.3445, "step": 5824 }, { "epoch": 4.905951712521055, "grad_norm": 0.32563352584838867, "learning_rate": 1.046233092128368e-08, "loss": 0.3607, "step": 5825 }, { "epoch": 4.906793935991017, "grad_norm": 0.3144632577896118, "learning_rate": 1.0273036248318325e-08, "loss": 0.3245, "step": 5826 }, { "epoch": 4.907636159460977, "grad_norm": 0.31059345602989197, "learning_rate": 1.0085467935329052e-08, "loss": 0.3231, "step": 5827 }, { "epoch": 4.908478382930937, "grad_norm": 0.2827950716018677, "learning_rate": 9.89962604721062e-09, "loss": 0.3382, "step": 5828 }, { "epoch": 4.9093206064008985, "grad_norm": 0.30618390440940857, "learning_rate": 9.71551064826215e-09, "loss": 0.3533, "step": 5829 }, { "epoch": 4.910162829870859, "grad_norm": 0.307717502117157, "learning_rate": 9.533121802183797e-09, "loss": 0.3235, "step": 5830 }, { "epoch": 4.91100505334082, "grad_norm": 0.2938036322593689, "learning_rate": 9.352459572078976e-09, "loss": 0.3318, "step": 5831 }, { "epoch": 4.91184727681078, "grad_norm": 0.28673163056373596, "learning_rate": 9.173524020453795e-09, "loss": 0.3084, "step": 5832 }, { "epoch": 4.912689500280742, "grad_norm": 0.320830762386322, "learning_rate": 8.99631520921762e-09, "loss": 0.3995, "step": 5833 }, { "epoch": 4.913531723750702, "grad_norm": 0.288427472114563, "learning_rate": 8.820833199680301e-09, "loss": 0.3066, "step": 5834 }, { "epoch": 4.914373947220662, "grad_norm": 0.312163770198822, "learning_rate": 8.6470780525566e-09, "loss": 0.3906, "step": 5835 }, { "epoch": 4.915216170690623, "grad_norm": 0.2598129212856293, "learning_rate": 8.475049827962878e-09, "loss": 0.289, "step": 5836 }, { "epoch": 4.916058394160584, "grad_norm": 0.32006192207336426, "learning_rate": 8.304748585417077e-09, "loss": 0.3937, "step": 5837 }, { "epoch": 4.916900617630545, "grad_norm": 0.3169005811214447, "learning_rate": 8.13617438384151e-09, "loss": 0.3348, "step": 5838 }, { "epoch": 4.917742841100505, "grad_norm": 0.29699981212615967, "learning_rate": 7.96932728155897e-09, "loss": 0.3554, "step": 5839 }, { "epoch": 4.9185850645704665, "grad_norm": 0.3367904722690582, "learning_rate": 7.804207336296609e-09, "loss": 0.3295, "step": 5840 }, { "epoch": 4.919427288040427, "grad_norm": 0.3184203803539276, "learning_rate": 7.640814605182067e-09, "loss": 0.3422, "step": 5841 }, { "epoch": 4.920269511510387, "grad_norm": 0.2835085093975067, "learning_rate": 7.479149144747898e-09, "loss": 0.2869, "step": 5842 }, { "epoch": 4.921111734980348, "grad_norm": 0.31508246064186096, "learning_rate": 7.3192110109260305e-09, "loss": 0.4141, "step": 5843 }, { "epoch": 4.921953958450309, "grad_norm": 0.3096730411052704, "learning_rate": 7.161000259053308e-09, "loss": 0.2972, "step": 5844 }, { "epoch": 4.922796181920269, "grad_norm": 0.30005931854248047, "learning_rate": 7.004516943868167e-09, "loss": 0.3559, "step": 5845 }, { "epoch": 4.92363840539023, "grad_norm": 0.2919882833957672, "learning_rate": 6.849761119510079e-09, "loss": 0.3337, "step": 5846 }, { "epoch": 4.924480628860191, "grad_norm": 0.2974596619606018, "learning_rate": 6.6967328395223244e-09, "loss": 0.3567, "step": 5847 }, { "epoch": 4.925322852330152, "grad_norm": 0.33140358328819275, "learning_rate": 6.545432156850329e-09, "loss": 0.3584, "step": 5848 }, { "epoch": 4.926165075800112, "grad_norm": 0.3154292404651642, "learning_rate": 6.3958591238416634e-09, "loss": 0.331, "step": 5849 }, { "epoch": 4.927007299270073, "grad_norm": 0.30250173807144165, "learning_rate": 6.248013792245489e-09, "loss": 0.3427, "step": 5850 }, { "epoch": 4.927849522740034, "grad_norm": 0.27558937668800354, "learning_rate": 6.10189621321422e-09, "loss": 0.3292, "step": 5851 }, { "epoch": 4.928691746209994, "grad_norm": 0.29001471400260925, "learning_rate": 5.957506437301863e-09, "loss": 0.3333, "step": 5852 }, { "epoch": 4.929533969679955, "grad_norm": 0.29568207263946533, "learning_rate": 5.8148445144645686e-09, "loss": 0.3463, "step": 5853 }, { "epoch": 4.9303761931499155, "grad_norm": 0.30256226658821106, "learning_rate": 5.67391049406174e-09, "loss": 0.3434, "step": 5854 }, { "epoch": 4.931218416619877, "grad_norm": 0.3083930015563965, "learning_rate": 5.534704424852711e-09, "loss": 0.352, "step": 5855 }, { "epoch": 4.932060640089837, "grad_norm": 0.31448403000831604, "learning_rate": 5.3972263550017275e-09, "loss": 0.3713, "step": 5856 }, { "epoch": 4.932902863559798, "grad_norm": 0.3070605993270874, "learning_rate": 5.261476332073523e-09, "loss": 0.3323, "step": 5857 }, { "epoch": 4.933745087029759, "grad_norm": 0.3064126670360565, "learning_rate": 5.1274544030344156e-09, "loss": 0.3736, "step": 5858 }, { "epoch": 4.934587310499719, "grad_norm": 0.27625536918640137, "learning_rate": 4.99516061425509e-09, "loss": 0.3165, "step": 5859 }, { "epoch": 4.93542953396968, "grad_norm": 0.28161418437957764, "learning_rate": 4.864595011506157e-09, "loss": 0.3036, "step": 5860 }, { "epoch": 4.93627175743964, "grad_norm": 0.29392266273498535, "learning_rate": 4.735757639960925e-09, "loss": 0.3342, "step": 5861 }, { "epoch": 4.937113980909602, "grad_norm": 0.3131870925426483, "learning_rate": 4.6086485441948495e-09, "loss": 0.3743, "step": 5862 }, { "epoch": 4.937956204379562, "grad_norm": 0.29208725690841675, "learning_rate": 4.483267768186084e-09, "loss": 0.3158, "step": 5863 }, { "epoch": 4.938798427849523, "grad_norm": 0.29143208265304565, "learning_rate": 4.359615355313818e-09, "loss": 0.358, "step": 5864 }, { "epoch": 4.9396406513194835, "grad_norm": 0.28705736994743347, "learning_rate": 4.23769134835994e-09, "loss": 0.3114, "step": 5865 }, { "epoch": 4.940482874789444, "grad_norm": 0.29745376110076904, "learning_rate": 4.117495789507375e-09, "loss": 0.3553, "step": 5866 }, { "epoch": 4.941325098259405, "grad_norm": 0.2844071686267853, "learning_rate": 3.999028720342857e-09, "loss": 0.3544, "step": 5867 }, { "epoch": 4.942167321729365, "grad_norm": 0.31317925453186035, "learning_rate": 3.882290181853043e-09, "loss": 0.3465, "step": 5868 }, { "epoch": 4.943009545199327, "grad_norm": 0.31521984934806824, "learning_rate": 3.767280214427849e-09, "loss": 0.3226, "step": 5869 }, { "epoch": 4.943851768669287, "grad_norm": 0.30577656626701355, "learning_rate": 3.653998857858776e-09, "loss": 0.3271, "step": 5870 }, { "epoch": 4.944693992139248, "grad_norm": 0.27558159828186035, "learning_rate": 3.5424461513389187e-09, "loss": 0.3247, "step": 5871 }, { "epoch": 4.945536215609208, "grad_norm": 0.2928052842617035, "learning_rate": 3.4326221334640697e-09, "loss": 0.3134, "step": 5872 }, { "epoch": 4.946378439079169, "grad_norm": 0.3117440342903137, "learning_rate": 3.3245268422305023e-09, "loss": 0.3635, "step": 5873 }, { "epoch": 4.94722066254913, "grad_norm": 0.2798506021499634, "learning_rate": 3.218160315038854e-09, "loss": 0.3396, "step": 5874 }, { "epoch": 4.94806288601909, "grad_norm": 0.301107257604599, "learning_rate": 3.113522588689133e-09, "loss": 0.3738, "step": 5875 }, { "epoch": 4.948905109489051, "grad_norm": 0.27446967363357544, "learning_rate": 3.0106136993840463e-09, "loss": 0.329, "step": 5876 }, { "epoch": 4.949747332959012, "grad_norm": 0.30317890644073486, "learning_rate": 2.9094336827284464e-09, "loss": 0.3208, "step": 5877 }, { "epoch": 4.950589556428972, "grad_norm": 0.28571903705596924, "learning_rate": 2.809982573729886e-09, "loss": 0.3362, "step": 5878 }, { "epoch": 4.951431779898933, "grad_norm": 0.30620279908180237, "learning_rate": 2.7122604067952863e-09, "loss": 0.3736, "step": 5879 }, { "epoch": 4.952274003368894, "grad_norm": 0.2981039583683014, "learning_rate": 2.6162672157353795e-09, "loss": 0.3698, "step": 5880 }, { "epoch": 4.953116226838855, "grad_norm": 0.294215589761734, "learning_rate": 2.522003033762488e-09, "loss": 0.3071, "step": 5881 }, { "epoch": 4.953958450308815, "grad_norm": 0.3157990276813507, "learning_rate": 2.4294678934899674e-09, "loss": 0.3652, "step": 5882 }, { "epoch": 4.9548006737787755, "grad_norm": 0.3067362606525421, "learning_rate": 2.3386618269338747e-09, "loss": 0.3567, "step": 5883 }, { "epoch": 4.955642897248737, "grad_norm": 0.2873389422893524, "learning_rate": 2.2495848655113007e-09, "loss": 0.3417, "step": 5884 }, { "epoch": 4.956485120718697, "grad_norm": 0.3125622570514679, "learning_rate": 2.1622370400409266e-09, "loss": 0.3501, "step": 5885 }, { "epoch": 4.957327344188658, "grad_norm": 0.2997153103351593, "learning_rate": 2.076618380744133e-09, "loss": 0.3277, "step": 5886 }, { "epoch": 4.958169567658619, "grad_norm": 0.30470502376556396, "learning_rate": 1.992728917243336e-09, "loss": 0.3637, "step": 5887 }, { "epoch": 4.95901179112858, "grad_norm": 0.31840991973876953, "learning_rate": 1.910568678563096e-09, "loss": 0.3527, "step": 5888 }, { "epoch": 4.95985401459854, "grad_norm": 0.3225303888320923, "learning_rate": 1.8301376931290083e-09, "loss": 0.3714, "step": 5889 }, { "epoch": 4.9606962380685005, "grad_norm": 0.2950524687767029, "learning_rate": 1.7514359887693677e-09, "loss": 0.2867, "step": 5890 }, { "epoch": 4.961538461538462, "grad_norm": 0.3320896625518799, "learning_rate": 1.6744635927129494e-09, "loss": 0.3664, "step": 5891 }, { "epoch": 4.962380685008422, "grad_norm": 0.30792298913002014, "learning_rate": 1.599220531591228e-09, "loss": 0.3615, "step": 5892 }, { "epoch": 4.963222908478383, "grad_norm": 0.2998102903366089, "learning_rate": 1.5257068314372681e-09, "loss": 0.3208, "step": 5893 }, { "epoch": 4.9640651319483435, "grad_norm": 0.32168063521385193, "learning_rate": 1.4539225176851691e-09, "loss": 0.3409, "step": 5894 }, { "epoch": 4.964907355418305, "grad_norm": 0.3111848831176758, "learning_rate": 1.3838676151706198e-09, "loss": 0.3199, "step": 5895 }, { "epoch": 4.965749578888265, "grad_norm": 0.3162185847759247, "learning_rate": 1.3155421481325648e-09, "loss": 0.3742, "step": 5896 }, { "epoch": 4.966591802358225, "grad_norm": 0.2839168310165405, "learning_rate": 1.2489461402098724e-09, "loss": 0.316, "step": 5897 }, { "epoch": 4.967434025828187, "grad_norm": 0.3019958734512329, "learning_rate": 1.1840796144435562e-09, "loss": 0.3203, "step": 5898 }, { "epoch": 4.968276249298147, "grad_norm": 0.309857577085495, "learning_rate": 1.1209425932762196e-09, "loss": 0.3475, "step": 5899 }, { "epoch": 4.969118472768108, "grad_norm": 0.3164571225643158, "learning_rate": 1.0595350985526109e-09, "loss": 0.3758, "step": 5900 }, { "epoch": 4.9699606962380685, "grad_norm": 0.313961386680603, "learning_rate": 9.998571515179577e-10, "loss": 0.3614, "step": 5901 }, { "epoch": 4.97080291970803, "grad_norm": 0.27438125014305115, "learning_rate": 9.419087728207432e-10, "loss": 0.3308, "step": 5902 }, { "epoch": 4.97164514317799, "grad_norm": 0.3052203357219696, "learning_rate": 8.856899825093746e-10, "loss": 0.3357, "step": 5903 }, { "epoch": 4.97248736664795, "grad_norm": 0.32409974932670593, "learning_rate": 8.312008000349592e-10, "loss": 0.3173, "step": 5904 }, { "epoch": 4.9733295901179115, "grad_norm": 0.30670174956321716, "learning_rate": 7.784412442490841e-10, "loss": 0.355, "step": 5905 }, { "epoch": 4.974171813587872, "grad_norm": 0.2874554395675659, "learning_rate": 7.274113334071464e-10, "loss": 0.2966, "step": 5906 }, { "epoch": 4.975014037057832, "grad_norm": 0.3098822236061096, "learning_rate": 6.781110851633576e-10, "loss": 0.3529, "step": 5907 }, { "epoch": 4.975856260527793, "grad_norm": 0.31920844316482544, "learning_rate": 6.305405165751843e-10, "loss": 0.3953, "step": 5908 }, { "epoch": 4.976698483997754, "grad_norm": 0.2614160478115082, "learning_rate": 5.846996441011277e-10, "loss": 0.2944, "step": 5909 }, { "epoch": 4.977540707467715, "grad_norm": 0.2738765776157379, "learning_rate": 5.405884836012787e-10, "loss": 0.3148, "step": 5910 }, { "epoch": 4.978382930937675, "grad_norm": 0.2968199849128723, "learning_rate": 4.982070503373182e-10, "loss": 0.3988, "step": 5911 }, { "epoch": 4.9792251544076365, "grad_norm": 0.27969294786453247, "learning_rate": 4.575553589730719e-10, "loss": 0.3004, "step": 5912 }, { "epoch": 4.980067377877597, "grad_norm": 0.30071577429771423, "learning_rate": 4.186334235728451e-10, "loss": 0.3424, "step": 5913 }, { "epoch": 4.980909601347557, "grad_norm": 0.3126188814640045, "learning_rate": 3.814412576025328e-10, "loss": 0.3396, "step": 5914 }, { "epoch": 4.981751824817518, "grad_norm": 0.2982785403728485, "learning_rate": 3.459788739307302e-10, "loss": 0.3016, "step": 5915 }, { "epoch": 4.982594048287479, "grad_norm": 0.3112945556640625, "learning_rate": 3.1224628482651177e-10, "loss": 0.304, "step": 5916 }, { "epoch": 4.98343627175744, "grad_norm": 0.305965781211853, "learning_rate": 2.8024350196109715e-10, "loss": 0.3725, "step": 5917 }, { "epoch": 4.9842784952274, "grad_norm": 0.30515193939208984, "learning_rate": 2.499705364061855e-10, "loss": 0.3396, "step": 5918 }, { "epoch": 4.985120718697361, "grad_norm": 0.29072463512420654, "learning_rate": 2.2142739863673103e-10, "loss": 0.3117, "step": 5919 }, { "epoch": 4.985962942167322, "grad_norm": 0.31736496090888977, "learning_rate": 1.9461409852705727e-10, "loss": 0.3289, "step": 5920 }, { "epoch": 4.986805165637282, "grad_norm": 0.2957375943660736, "learning_rate": 1.6953064535474296e-10, "loss": 0.3608, "step": 5921 }, { "epoch": 4.987647389107243, "grad_norm": 0.3218628764152527, "learning_rate": 1.4617704779840146e-10, "loss": 0.378, "step": 5922 }, { "epoch": 4.988489612577204, "grad_norm": 0.3062000274658203, "learning_rate": 1.2455331393712578e-10, "loss": 0.3666, "step": 5923 }, { "epoch": 4.989331836047165, "grad_norm": 0.30687278509140015, "learning_rate": 1.0465945125326393e-10, "loss": 0.362, "step": 5924 }, { "epoch": 4.990174059517125, "grad_norm": 0.27235859632492065, "learning_rate": 8.649546662908848e-11, "loss": 0.324, "step": 5925 }, { "epoch": 4.991016282987086, "grad_norm": 0.30044376850128174, "learning_rate": 7.006136634957194e-11, "loss": 0.362, "step": 5926 }, { "epoch": 4.991858506457047, "grad_norm": 0.32089921832084656, "learning_rate": 5.5357156100721566e-11, "loss": 0.37, "step": 5927 }, { "epoch": 4.992700729927007, "grad_norm": 0.32539135217666626, "learning_rate": 4.238284096902412e-11, "loss": 0.3685, "step": 5928 }, { "epoch": 4.993542953396968, "grad_norm": 0.2885964512825012, "learning_rate": 3.113842544422152e-11, "loss": 0.3124, "step": 5929 }, { "epoch": 4.9943851768669285, "grad_norm": 0.2918040454387665, "learning_rate": 2.1623913416535246e-11, "loss": 0.3421, "step": 5930 }, { "epoch": 4.99522740033689, "grad_norm": 0.28548091650009155, "learning_rate": 1.3839308177776567e-11, "loss": 0.335, "step": 5931 }, { "epoch": 4.99606962380685, "grad_norm": 0.2919279932975769, "learning_rate": 7.784612421346538e-12, "loss": 0.3493, "step": 5932 }, { "epoch": 4.996911847276811, "grad_norm": 0.2852227985858917, "learning_rate": 3.459828241680896e-12, "loss": 0.3305, "step": 5933 }, { "epoch": 4.997754070746772, "grad_norm": 0.2956923544406891, "learning_rate": 8.649571353602782e-13, "loss": 0.3364, "step": 5934 }, { "epoch": 4.998596294216732, "grad_norm": 0.29733940958976746, "learning_rate": 0.0, "loss": 0.3353, "step": 5935 }, { "epoch": 4.998596294216732, "step": 5935, "total_flos": 7698482478710784.0, "train_loss": 0.4043946771081396, "train_runtime": 201556.4659, "train_samples_per_second": 2.827, "train_steps_per_second": 0.029 } ], "logging_steps": 1.0, "max_steps": 5935, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7698482478710784.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }