diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31104 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 4434, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006765899864682003, + "grad_norm": 1294.7799740470043, + "learning_rate": 2.2522522522522524e-08, + "loss": 11.9131, + "step": 1 + }, + { + "epoch": 0.0013531799729364006, + "grad_norm": 1298.4331558214524, + "learning_rate": 4.504504504504505e-08, + "loss": 11.9163, + "step": 2 + }, + { + "epoch": 0.0020297699594046007, + "grad_norm": 1325.2434867763561, + "learning_rate": 6.756756756756757e-08, + "loss": 11.9102, + "step": 3 + }, + { + "epoch": 0.0027063599458728013, + "grad_norm": 1366.1141393954351, + "learning_rate": 9.00900900900901e-08, + "loss": 11.9533, + "step": 4 + }, + { + "epoch": 0.0033829499323410014, + "grad_norm": 1425.1438127122265, + "learning_rate": 1.1261261261261262e-07, + "loss": 11.928, + "step": 5 + }, + { + "epoch": 0.0040595399188092015, + "grad_norm": 1395.5312633979067, + "learning_rate": 1.3513513513513515e-07, + "loss": 11.9208, + "step": 6 + }, + { + "epoch": 0.004736129905277402, + "grad_norm": 1438.0298310330422, + "learning_rate": 1.5765765765765766e-07, + "loss": 11.8879, + "step": 7 + }, + { + "epoch": 0.005412719891745603, + "grad_norm": 1636.1424356180714, + "learning_rate": 1.801801801801802e-07, + "loss": 11.8713, + "step": 8 + }, + { + "epoch": 0.006089309878213802, + "grad_norm": 1638.3207575482402, + "learning_rate": 2.0270270270270273e-07, + "loss": 11.7031, + "step": 9 + }, + { + "epoch": 0.006765899864682003, + "grad_norm": 1796.5344880486534, + "learning_rate": 2.2522522522522524e-07, + "loss": 11.6355, + "step": 10 + }, + { + "epoch": 0.007442489851150203, + "grad_norm": 1140.5545815120106, + "learning_rate": 2.477477477477478e-07, + "loss": 11.1436, + "step": 11 + }, + { + "epoch": 0.008119079837618403, + "grad_norm": 1054.7170118427175, + "learning_rate": 2.702702702702703e-07, + "loss": 11.0597, + "step": 12 + }, + { + "epoch": 0.008795669824086604, + "grad_norm": 1026.8961411097375, + "learning_rate": 2.927927927927928e-07, + "loss": 10.9774, + "step": 13 + }, + { + "epoch": 0.009472259810554804, + "grad_norm": 1032.8878691346852, + "learning_rate": 3.153153153153153e-07, + "loss": 10.8926, + "step": 14 + }, + { + "epoch": 0.010148849797023005, + "grad_norm": 982.9642790210778, + "learning_rate": 3.378378378378379e-07, + "loss": 10.0553, + "step": 15 + }, + { + "epoch": 0.010825439783491205, + "grad_norm": 1205.4064168126627, + "learning_rate": 3.603603603603604e-07, + "loss": 9.9305, + "step": 16 + }, + { + "epoch": 0.011502029769959404, + "grad_norm": 889.7008770631086, + "learning_rate": 3.828828828828829e-07, + "loss": 9.8475, + "step": 17 + }, + { + "epoch": 0.012178619756427604, + "grad_norm": 797.7351269703299, + "learning_rate": 4.0540540540540546e-07, + "loss": 9.6404, + "step": 18 + }, + { + "epoch": 0.012855209742895805, + "grad_norm": 588.7198873727792, + "learning_rate": 4.27927927927928e-07, + "loss": 9.5483, + "step": 19 + }, + { + "epoch": 0.013531799729364006, + "grad_norm": 312.9505155993464, + "learning_rate": 4.504504504504505e-07, + "loss": 8.9424, + "step": 20 + }, + { + "epoch": 0.014208389715832206, + "grad_norm": 365.06295714206897, + "learning_rate": 4.7297297297297305e-07, + "loss": 8.8538, + "step": 21 + }, + { + "epoch": 0.014884979702300407, + "grad_norm": 278.7861480433846, + "learning_rate": 4.954954954954956e-07, + "loss": 8.7556, + "step": 22 + }, + { + "epoch": 0.015561569688768605, + "grad_norm": 224.60328456473022, + "learning_rate": 5.180180180180181e-07, + "loss": 8.7774, + "step": 23 + }, + { + "epoch": 0.016238159675236806, + "grad_norm": 194.31032323395746, + "learning_rate": 5.405405405405406e-07, + "loss": 8.6965, + "step": 24 + }, + { + "epoch": 0.016914749661705007, + "grad_norm": 171.4227734694513, + "learning_rate": 5.630630630630631e-07, + "loss": 8.6245, + "step": 25 + }, + { + "epoch": 0.017591339648173207, + "grad_norm": 181.2827273623685, + "learning_rate": 5.855855855855856e-07, + "loss": 8.5409, + "step": 26 + }, + { + "epoch": 0.018267929634641408, + "grad_norm": 316.2362978808448, + "learning_rate": 6.081081081081082e-07, + "loss": 8.3035, + "step": 27 + }, + { + "epoch": 0.018944519621109608, + "grad_norm": 199.10162323615126, + "learning_rate": 6.306306306306306e-07, + "loss": 8.1246, + "step": 28 + }, + { + "epoch": 0.01962110960757781, + "grad_norm": 158.8747267496112, + "learning_rate": 6.531531531531532e-07, + "loss": 7.9727, + "step": 29 + }, + { + "epoch": 0.02029769959404601, + "grad_norm": 142.90489272104213, + "learning_rate": 6.756756756756758e-07, + "loss": 7.9089, + "step": 30 + }, + { + "epoch": 0.02097428958051421, + "grad_norm": 112.95667777019665, + "learning_rate": 6.981981981981982e-07, + "loss": 7.8067, + "step": 31 + }, + { + "epoch": 0.02165087956698241, + "grad_norm": 99.97524285645189, + "learning_rate": 7.207207207207208e-07, + "loss": 7.7026, + "step": 32 + }, + { + "epoch": 0.022327469553450607, + "grad_norm": 130.4578778756448, + "learning_rate": 7.432432432432434e-07, + "loss": 7.6579, + "step": 33 + }, + { + "epoch": 0.023004059539918808, + "grad_norm": 168.55159538067102, + "learning_rate": 7.657657657657658e-07, + "loss": 7.5495, + "step": 34 + }, + { + "epoch": 0.02368064952638701, + "grad_norm": 82.56038654646147, + "learning_rate": 7.882882882882883e-07, + "loss": 7.4503, + "step": 35 + }, + { + "epoch": 0.02435723951285521, + "grad_norm": 153.7208230043944, + "learning_rate": 8.108108108108109e-07, + "loss": 7.3843, + "step": 36 + }, + { + "epoch": 0.02503382949932341, + "grad_norm": 91.97910168568909, + "learning_rate": 8.333333333333333e-07, + "loss": 7.2279, + "step": 37 + }, + { + "epoch": 0.02571041948579161, + "grad_norm": 69.76268667391646, + "learning_rate": 8.55855855855856e-07, + "loss": 7.2079, + "step": 38 + }, + { + "epoch": 0.02638700947225981, + "grad_norm": 77.57527495553569, + "learning_rate": 8.783783783783785e-07, + "loss": 7.0677, + "step": 39 + }, + { + "epoch": 0.02706359945872801, + "grad_norm": 52.26613705638054, + "learning_rate": 9.00900900900901e-07, + "loss": 7.0129, + "step": 40 + }, + { + "epoch": 0.02774018944519621, + "grad_norm": 53.80168795569409, + "learning_rate": 9.234234234234235e-07, + "loss": 6.9802, + "step": 41 + }, + { + "epoch": 0.028416779431664412, + "grad_norm": 59.68642115374619, + "learning_rate": 9.459459459459461e-07, + "loss": 6.8825, + "step": 42 + }, + { + "epoch": 0.029093369418132613, + "grad_norm": 91.42597837015056, + "learning_rate": 9.684684684684686e-07, + "loss": 6.8503, + "step": 43 + }, + { + "epoch": 0.029769959404600813, + "grad_norm": 101.20418159876743, + "learning_rate": 9.909909909909911e-07, + "loss": 6.8065, + "step": 44 + }, + { + "epoch": 0.030446549391069014, + "grad_norm": 43.005131526460474, + "learning_rate": 1.0135135135135136e-06, + "loss": 6.7757, + "step": 45 + }, + { + "epoch": 0.03112313937753721, + "grad_norm": 42.64828727931448, + "learning_rate": 1.0360360360360361e-06, + "loss": 6.6841, + "step": 46 + }, + { + "epoch": 0.031799729364005415, + "grad_norm": 97.23527419575188, + "learning_rate": 1.0585585585585587e-06, + "loss": 6.6109, + "step": 47 + }, + { + "epoch": 0.03247631935047361, + "grad_norm": 169.8678960637773, + "learning_rate": 1.0810810810810812e-06, + "loss": 6.5767, + "step": 48 + }, + { + "epoch": 0.033152909336941816, + "grad_norm": 62.92191915861204, + "learning_rate": 1.1036036036036037e-06, + "loss": 6.5223, + "step": 49 + }, + { + "epoch": 0.03382949932341001, + "grad_norm": 136.66699482372732, + "learning_rate": 1.1261261261261262e-06, + "loss": 6.493, + "step": 50 + }, + { + "epoch": 0.03450608930987822, + "grad_norm": 81.89062026113558, + "learning_rate": 1.148648648648649e-06, + "loss": 6.3889, + "step": 51 + }, + { + "epoch": 0.035182679296346414, + "grad_norm": 118.4391312567866, + "learning_rate": 1.1711711711711712e-06, + "loss": 6.4154, + "step": 52 + }, + { + "epoch": 0.03585926928281461, + "grad_norm": 73.87596492845805, + "learning_rate": 1.1936936936936937e-06, + "loss": 6.361, + "step": 53 + }, + { + "epoch": 0.036535859269282815, + "grad_norm": 89.03923620297563, + "learning_rate": 1.2162162162162164e-06, + "loss": 6.2851, + "step": 54 + }, + { + "epoch": 0.03721244925575101, + "grad_norm": 73.1383569774123, + "learning_rate": 1.2387387387387387e-06, + "loss": 6.2606, + "step": 55 + }, + { + "epoch": 0.037889039242219216, + "grad_norm": 44.97513849432748, + "learning_rate": 1.2612612612612613e-06, + "loss": 6.2387, + "step": 56 + }, + { + "epoch": 0.03856562922868741, + "grad_norm": 79.41788212317785, + "learning_rate": 1.2837837837837838e-06, + "loss": 6.181, + "step": 57 + }, + { + "epoch": 0.03924221921515562, + "grad_norm": 58.612598455402114, + "learning_rate": 1.3063063063063065e-06, + "loss": 6.1738, + "step": 58 + }, + { + "epoch": 0.039918809201623814, + "grad_norm": 63.80154957894249, + "learning_rate": 1.328828828828829e-06, + "loss": 6.0666, + "step": 59 + }, + { + "epoch": 0.04059539918809202, + "grad_norm": 31.593281308336145, + "learning_rate": 1.3513513513513515e-06, + "loss": 6.0429, + "step": 60 + }, + { + "epoch": 0.041271989174560215, + "grad_norm": 74.39180683057303, + "learning_rate": 1.373873873873874e-06, + "loss": 6.0485, + "step": 61 + }, + { + "epoch": 0.04194857916102842, + "grad_norm": 74.28723111978216, + "learning_rate": 1.3963963963963963e-06, + "loss": 5.9679, + "step": 62 + }, + { + "epoch": 0.04262516914749662, + "grad_norm": 71.70645602731112, + "learning_rate": 1.418918918918919e-06, + "loss": 5.9602, + "step": 63 + }, + { + "epoch": 0.04330175913396482, + "grad_norm": 61.89579228047879, + "learning_rate": 1.4414414414414416e-06, + "loss": 5.8796, + "step": 64 + }, + { + "epoch": 0.04397834912043302, + "grad_norm": 90.73272911379827, + "learning_rate": 1.463963963963964e-06, + "loss": 5.8541, + "step": 65 + }, + { + "epoch": 0.044654939106901215, + "grad_norm": 45.77250463669355, + "learning_rate": 1.4864864864864868e-06, + "loss": 5.7697, + "step": 66 + }, + { + "epoch": 0.04533152909336942, + "grad_norm": 91.74739934484515, + "learning_rate": 1.5090090090090093e-06, + "loss": 5.7921, + "step": 67 + }, + { + "epoch": 0.046008119079837616, + "grad_norm": 93.61184773344448, + "learning_rate": 1.5315315315315316e-06, + "loss": 5.7767, + "step": 68 + }, + { + "epoch": 0.04668470906630582, + "grad_norm": 39.84854834313949, + "learning_rate": 1.5540540540540541e-06, + "loss": 5.5705, + "step": 69 + }, + { + "epoch": 0.04736129905277402, + "grad_norm": 61.450451753613216, + "learning_rate": 1.5765765765765766e-06, + "loss": 5.5836, + "step": 70 + }, + { + "epoch": 0.04803788903924222, + "grad_norm": 134.76740317296654, + "learning_rate": 1.5990990990990993e-06, + "loss": 5.5517, + "step": 71 + }, + { + "epoch": 0.04871447902571042, + "grad_norm": 54.398999052693796, + "learning_rate": 1.6216216216216219e-06, + "loss": 5.4925, + "step": 72 + }, + { + "epoch": 0.04939106901217862, + "grad_norm": 68.95008079459163, + "learning_rate": 1.6441441441441444e-06, + "loss": 5.4054, + "step": 73 + }, + { + "epoch": 0.05006765899864682, + "grad_norm": 152.54862412840902, + "learning_rate": 1.6666666666666667e-06, + "loss": 5.3895, + "step": 74 + }, + { + "epoch": 0.05074424898511502, + "grad_norm": 59.01945826720156, + "learning_rate": 1.6891891891891894e-06, + "loss": 5.3621, + "step": 75 + }, + { + "epoch": 0.05142083897158322, + "grad_norm": 64.87355979260256, + "learning_rate": 1.711711711711712e-06, + "loss": 5.2075, + "step": 76 + }, + { + "epoch": 0.052097428958051424, + "grad_norm": 84.64730212336376, + "learning_rate": 1.7342342342342344e-06, + "loss": 5.1949, + "step": 77 + }, + { + "epoch": 0.05277401894451962, + "grad_norm": 55.40660875297934, + "learning_rate": 1.756756756756757e-06, + "loss": 5.1719, + "step": 78 + }, + { + "epoch": 0.05345060893098782, + "grad_norm": 149.80942207452873, + "learning_rate": 1.7792792792792792e-06, + "loss": 5.1801, + "step": 79 + }, + { + "epoch": 0.05412719891745602, + "grad_norm": 86.22324212364583, + "learning_rate": 1.801801801801802e-06, + "loss": 5.0994, + "step": 80 + }, + { + "epoch": 0.05480378890392422, + "grad_norm": 99.81963307964912, + "learning_rate": 1.8243243243243245e-06, + "loss": 5.0621, + "step": 81 + }, + { + "epoch": 0.05548037889039242, + "grad_norm": 80.3364116036936, + "learning_rate": 1.846846846846847e-06, + "loss": 4.9882, + "step": 82 + }, + { + "epoch": 0.05615696887686062, + "grad_norm": 88.89892195412031, + "learning_rate": 1.8693693693693697e-06, + "loss": 4.9389, + "step": 83 + }, + { + "epoch": 0.056833558863328824, + "grad_norm": 113.17311584350547, + "learning_rate": 1.8918918918918922e-06, + "loss": 4.9489, + "step": 84 + }, + { + "epoch": 0.05751014884979702, + "grad_norm": 136.83618643424379, + "learning_rate": 1.9144144144144145e-06, + "loss": 4.8699, + "step": 85 + }, + { + "epoch": 0.058186738836265225, + "grad_norm": 75.51256109039457, + "learning_rate": 1.9369369369369372e-06, + "loss": 4.7703, + "step": 86 + }, + { + "epoch": 0.05886332882273342, + "grad_norm": 96.23041293661558, + "learning_rate": 1.9594594594594595e-06, + "loss": 4.7873, + "step": 87 + }, + { + "epoch": 0.05953991880920163, + "grad_norm": 161.37019408814405, + "learning_rate": 1.9819819819819822e-06, + "loss": 4.7921, + "step": 88 + }, + { + "epoch": 0.060216508795669824, + "grad_norm": 71.3560661853614, + "learning_rate": 2.0045045045045045e-06, + "loss": 4.7185, + "step": 89 + }, + { + "epoch": 0.06089309878213803, + "grad_norm": 69.99168283497767, + "learning_rate": 2.0270270270270273e-06, + "loss": 4.5539, + "step": 90 + }, + { + "epoch": 0.061569688768606225, + "grad_norm": 114.51118147162934, + "learning_rate": 2.0495495495495496e-06, + "loss": 4.5638, + "step": 91 + }, + { + "epoch": 0.06224627875507442, + "grad_norm": 178.4518511158092, + "learning_rate": 2.0720720720720723e-06, + "loss": 4.6301, + "step": 92 + }, + { + "epoch": 0.06292286874154263, + "grad_norm": 68.22787157402642, + "learning_rate": 2.0945945945945946e-06, + "loss": 4.4462, + "step": 93 + }, + { + "epoch": 0.06359945872801083, + "grad_norm": 160.54020098575876, + "learning_rate": 2.1171171171171173e-06, + "loss": 4.5235, + "step": 94 + }, + { + "epoch": 0.06427604871447902, + "grad_norm": 132.99722363643008, + "learning_rate": 2.13963963963964e-06, + "loss": 4.5069, + "step": 95 + }, + { + "epoch": 0.06495263870094722, + "grad_norm": 90.41930171832493, + "learning_rate": 2.1621621621621623e-06, + "loss": 4.4033, + "step": 96 + }, + { + "epoch": 0.06562922868741543, + "grad_norm": 155.6113006075671, + "learning_rate": 2.1846846846846846e-06, + "loss": 4.3758, + "step": 97 + }, + { + "epoch": 0.06630581867388363, + "grad_norm": 122.2472173078203, + "learning_rate": 2.2072072072072073e-06, + "loss": 4.2974, + "step": 98 + }, + { + "epoch": 0.06698240866035182, + "grad_norm": 111.61521534772868, + "learning_rate": 2.22972972972973e-06, + "loss": 4.3209, + "step": 99 + }, + { + "epoch": 0.06765899864682003, + "grad_norm": 122.18880976249555, + "learning_rate": 2.2522522522522524e-06, + "loss": 4.26, + "step": 100 + }, + { + "epoch": 0.06833558863328823, + "grad_norm": 80.45807204609973, + "learning_rate": 2.274774774774775e-06, + "loss": 4.1875, + "step": 101 + }, + { + "epoch": 0.06901217861975643, + "grad_norm": 109.02033184349753, + "learning_rate": 2.297297297297298e-06, + "loss": 4.1745, + "step": 102 + }, + { + "epoch": 0.06968876860622462, + "grad_norm": 110.10640416298837, + "learning_rate": 2.31981981981982e-06, + "loss": 4.1548, + "step": 103 + }, + { + "epoch": 0.07036535859269283, + "grad_norm": 86.96841790122264, + "learning_rate": 2.3423423423423424e-06, + "loss": 4.1614, + "step": 104 + }, + { + "epoch": 0.07104194857916103, + "grad_norm": 97.15417103395869, + "learning_rate": 2.364864864864865e-06, + "loss": 4.0024, + "step": 105 + }, + { + "epoch": 0.07171853856562922, + "grad_norm": 136.2349346627184, + "learning_rate": 2.3873873873873874e-06, + "loss": 4.1771, + "step": 106 + }, + { + "epoch": 0.07239512855209743, + "grad_norm": 70.38037906411675, + "learning_rate": 2.40990990990991e-06, + "loss": 4.0541, + "step": 107 + }, + { + "epoch": 0.07307171853856563, + "grad_norm": 90.75425332026454, + "learning_rate": 2.432432432432433e-06, + "loss": 4.0113, + "step": 108 + }, + { + "epoch": 0.07374830852503383, + "grad_norm": 78.1950389368104, + "learning_rate": 2.454954954954955e-06, + "loss": 4.0502, + "step": 109 + }, + { + "epoch": 0.07442489851150202, + "grad_norm": 70.23321966900136, + "learning_rate": 2.4774774774774775e-06, + "loss": 3.9199, + "step": 110 + }, + { + "epoch": 0.07510148849797023, + "grad_norm": 92.15274678991364, + "learning_rate": 2.5e-06, + "loss": 3.9936, + "step": 111 + }, + { + "epoch": 0.07577807848443843, + "grad_norm": 147.6498031425845, + "learning_rate": 2.5225225225225225e-06, + "loss": 3.9897, + "step": 112 + }, + { + "epoch": 0.07645466847090664, + "grad_norm": 71.75018072076564, + "learning_rate": 2.5450450450450452e-06, + "loss": 3.8258, + "step": 113 + }, + { + "epoch": 0.07713125845737483, + "grad_norm": 144.25923235185607, + "learning_rate": 2.5675675675675675e-06, + "loss": 4.0061, + "step": 114 + }, + { + "epoch": 0.07780784844384303, + "grad_norm": 90.10313216179375, + "learning_rate": 2.5900900900900907e-06, + "loss": 3.8232, + "step": 115 + }, + { + "epoch": 0.07848443843031123, + "grad_norm": 62.702162623266574, + "learning_rate": 2.612612612612613e-06, + "loss": 3.7921, + "step": 116 + }, + { + "epoch": 0.07916102841677942, + "grad_norm": 128.51448612306217, + "learning_rate": 2.6351351351351353e-06, + "loss": 3.787, + "step": 117 + }, + { + "epoch": 0.07983761840324763, + "grad_norm": 106.85834399979501, + "learning_rate": 2.657657657657658e-06, + "loss": 3.8209, + "step": 118 + }, + { + "epoch": 0.08051420838971583, + "grad_norm": 74.34385902839736, + "learning_rate": 2.6801801801801803e-06, + "loss": 3.6591, + "step": 119 + }, + { + "epoch": 0.08119079837618404, + "grad_norm": 74.135577478339, + "learning_rate": 2.702702702702703e-06, + "loss": 3.6711, + "step": 120 + }, + { + "epoch": 0.08186738836265223, + "grad_norm": 85.95726780507401, + "learning_rate": 2.7252252252252253e-06, + "loss": 3.7639, + "step": 121 + }, + { + "epoch": 0.08254397834912043, + "grad_norm": 102.2908279358922, + "learning_rate": 2.747747747747748e-06, + "loss": 3.7085, + "step": 122 + }, + { + "epoch": 0.08322056833558863, + "grad_norm": 74.03973608536255, + "learning_rate": 2.7702702702702703e-06, + "loss": 3.7246, + "step": 123 + }, + { + "epoch": 0.08389715832205684, + "grad_norm": 85.12172994853982, + "learning_rate": 2.7927927927927926e-06, + "loss": 3.7063, + "step": 124 + }, + { + "epoch": 0.08457374830852503, + "grad_norm": 54.05878326063868, + "learning_rate": 2.8153153153153158e-06, + "loss": 3.5608, + "step": 125 + }, + { + "epoch": 0.08525033829499323, + "grad_norm": 65.2074038079349, + "learning_rate": 2.837837837837838e-06, + "loss": 3.5081, + "step": 126 + }, + { + "epoch": 0.08592692828146144, + "grad_norm": 44.86678222702895, + "learning_rate": 2.860360360360361e-06, + "loss": 3.5391, + "step": 127 + }, + { + "epoch": 0.08660351826792964, + "grad_norm": 71.466489787615, + "learning_rate": 2.882882882882883e-06, + "loss": 3.4926, + "step": 128 + }, + { + "epoch": 0.08728010825439783, + "grad_norm": 39.11904869837415, + "learning_rate": 2.9054054054054054e-06, + "loss": 3.4806, + "step": 129 + }, + { + "epoch": 0.08795669824086604, + "grad_norm": 124.47175823597068, + "learning_rate": 2.927927927927928e-06, + "loss": 3.6277, + "step": 130 + }, + { + "epoch": 0.08863328822733424, + "grad_norm": 67.66619348885503, + "learning_rate": 2.9504504504504504e-06, + "loss": 3.5235, + "step": 131 + }, + { + "epoch": 0.08930987821380243, + "grad_norm": 67.72976849710265, + "learning_rate": 2.9729729729729736e-06, + "loss": 3.4433, + "step": 132 + }, + { + "epoch": 0.08998646820027063, + "grad_norm": 59.48885329779129, + "learning_rate": 2.995495495495496e-06, + "loss": 3.4593, + "step": 133 + }, + { + "epoch": 0.09066305818673884, + "grad_norm": 41.831980152297795, + "learning_rate": 3.0180180180180186e-06, + "loss": 3.3989, + "step": 134 + }, + { + "epoch": 0.09133964817320704, + "grad_norm": 78.06010995786896, + "learning_rate": 3.040540540540541e-06, + "loss": 3.4448, + "step": 135 + }, + { + "epoch": 0.09201623815967523, + "grad_norm": 51.0196629836936, + "learning_rate": 3.063063063063063e-06, + "loss": 3.3393, + "step": 136 + }, + { + "epoch": 0.09269282814614344, + "grad_norm": 52.123543615948236, + "learning_rate": 3.085585585585586e-06, + "loss": 3.3557, + "step": 137 + }, + { + "epoch": 0.09336941813261164, + "grad_norm": 50.51804959908098, + "learning_rate": 3.1081081081081082e-06, + "loss": 3.3341, + "step": 138 + }, + { + "epoch": 0.09404600811907984, + "grad_norm": 59.51105112522959, + "learning_rate": 3.130630630630631e-06, + "loss": 3.2939, + "step": 139 + }, + { + "epoch": 0.09472259810554803, + "grad_norm": 56.287180220643776, + "learning_rate": 3.1531531531531532e-06, + "loss": 3.341, + "step": 140 + }, + { + "epoch": 0.09539918809201624, + "grad_norm": 75.52518208966724, + "learning_rate": 3.1756756756756755e-06, + "loss": 3.3611, + "step": 141 + }, + { + "epoch": 0.09607577807848444, + "grad_norm": 48.910828651220605, + "learning_rate": 3.1981981981981987e-06, + "loss": 3.2481, + "step": 142 + }, + { + "epoch": 0.09675236806495263, + "grad_norm": 47.218392810236764, + "learning_rate": 3.220720720720721e-06, + "loss": 3.2752, + "step": 143 + }, + { + "epoch": 0.09742895805142084, + "grad_norm": 63.94134891175671, + "learning_rate": 3.2432432432432437e-06, + "loss": 3.3645, + "step": 144 + }, + { + "epoch": 0.09810554803788904, + "grad_norm": 54.666789257562534, + "learning_rate": 3.265765765765766e-06, + "loss": 3.2376, + "step": 145 + }, + { + "epoch": 0.09878213802435724, + "grad_norm": 36.94989947017251, + "learning_rate": 3.2882882882882887e-06, + "loss": 3.1951, + "step": 146 + }, + { + "epoch": 0.09945872801082543, + "grad_norm": 48.33680686944214, + "learning_rate": 3.310810810810811e-06, + "loss": 3.1377, + "step": 147 + }, + { + "epoch": 0.10013531799729364, + "grad_norm": 35.74504153701644, + "learning_rate": 3.3333333333333333e-06, + "loss": 3.2506, + "step": 148 + }, + { + "epoch": 0.10081190798376184, + "grad_norm": 49.239566709083704, + "learning_rate": 3.3558558558558565e-06, + "loss": 3.1739, + "step": 149 + }, + { + "epoch": 0.10148849797023005, + "grad_norm": 49.98074107148794, + "learning_rate": 3.3783783783783788e-06, + "loss": 3.2584, + "step": 150 + }, + { + "epoch": 0.10216508795669824, + "grad_norm": 57.12006421383496, + "learning_rate": 3.4009009009009015e-06, + "loss": 3.1443, + "step": 151 + }, + { + "epoch": 0.10284167794316644, + "grad_norm": 51.83187954914922, + "learning_rate": 3.423423423423424e-06, + "loss": 3.2512, + "step": 152 + }, + { + "epoch": 0.10351826792963464, + "grad_norm": 41.928571108063004, + "learning_rate": 3.445945945945946e-06, + "loss": 3.0448, + "step": 153 + }, + { + "epoch": 0.10419485791610285, + "grad_norm": 44.516486827766585, + "learning_rate": 3.468468468468469e-06, + "loss": 3.1117, + "step": 154 + }, + { + "epoch": 0.10487144790257104, + "grad_norm": 42.27354179773472, + "learning_rate": 3.490990990990991e-06, + "loss": 3.129, + "step": 155 + }, + { + "epoch": 0.10554803788903924, + "grad_norm": 48.86935574613096, + "learning_rate": 3.513513513513514e-06, + "loss": 3.0613, + "step": 156 + }, + { + "epoch": 0.10622462787550745, + "grad_norm": 37.05452600718779, + "learning_rate": 3.536036036036036e-06, + "loss": 2.9485, + "step": 157 + }, + { + "epoch": 0.10690121786197564, + "grad_norm": 42.975006587033576, + "learning_rate": 3.5585585585585584e-06, + "loss": 3.1076, + "step": 158 + }, + { + "epoch": 0.10757780784844384, + "grad_norm": 36.19174264324649, + "learning_rate": 3.5810810810810816e-06, + "loss": 3.1296, + "step": 159 + }, + { + "epoch": 0.10825439783491204, + "grad_norm": 51.57163331423397, + "learning_rate": 3.603603603603604e-06, + "loss": 2.9326, + "step": 160 + }, + { + "epoch": 0.10893098782138025, + "grad_norm": 28.926129651651678, + "learning_rate": 3.6261261261261266e-06, + "loss": 2.8705, + "step": 161 + }, + { + "epoch": 0.10960757780784844, + "grad_norm": 37.471976631295426, + "learning_rate": 3.648648648648649e-06, + "loss": 3.1038, + "step": 162 + }, + { + "epoch": 0.11028416779431664, + "grad_norm": 40.95517313070317, + "learning_rate": 3.6711711711711716e-06, + "loss": 2.8534, + "step": 163 + }, + { + "epoch": 0.11096075778078485, + "grad_norm": 30.52496431382125, + "learning_rate": 3.693693693693694e-06, + "loss": 2.9543, + "step": 164 + }, + { + "epoch": 0.11163734776725305, + "grad_norm": 25.34559414829517, + "learning_rate": 3.7162162162162162e-06, + "loss": 2.9391, + "step": 165 + }, + { + "epoch": 0.11231393775372124, + "grad_norm": 41.00047666306257, + "learning_rate": 3.7387387387387394e-06, + "loss": 2.9402, + "step": 166 + }, + { + "epoch": 0.11299052774018944, + "grad_norm": 38.40358833951516, + "learning_rate": 3.7612612612612612e-06, + "loss": 2.8399, + "step": 167 + }, + { + "epoch": 0.11366711772665765, + "grad_norm": 25.778136989342435, + "learning_rate": 3.7837837837837844e-06, + "loss": 2.8529, + "step": 168 + }, + { + "epoch": 0.11434370771312584, + "grad_norm": 43.89518290972547, + "learning_rate": 3.8063063063063067e-06, + "loss": 2.8869, + "step": 169 + }, + { + "epoch": 0.11502029769959404, + "grad_norm": 35.78579942027662, + "learning_rate": 3.828828828828829e-06, + "loss": 2.8886, + "step": 170 + }, + { + "epoch": 0.11569688768606225, + "grad_norm": 38.32163511272516, + "learning_rate": 3.851351351351352e-06, + "loss": 2.8628, + "step": 171 + }, + { + "epoch": 0.11637347767253045, + "grad_norm": 24.390093584623898, + "learning_rate": 3.8738738738738744e-06, + "loss": 2.8215, + "step": 172 + }, + { + "epoch": 0.11705006765899864, + "grad_norm": 33.2833424715733, + "learning_rate": 3.896396396396397e-06, + "loss": 2.7963, + "step": 173 + }, + { + "epoch": 0.11772665764546685, + "grad_norm": 33.3311496772107, + "learning_rate": 3.918918918918919e-06, + "loss": 2.7788, + "step": 174 + }, + { + "epoch": 0.11840324763193505, + "grad_norm": 30.752650997097287, + "learning_rate": 3.941441441441442e-06, + "loss": 2.8082, + "step": 175 + }, + { + "epoch": 0.11907983761840325, + "grad_norm": 36.63901906403704, + "learning_rate": 3.9639639639639645e-06, + "loss": 2.8628, + "step": 176 + }, + { + "epoch": 0.11975642760487144, + "grad_norm": 27.97375841977239, + "learning_rate": 3.986486486486487e-06, + "loss": 2.6965, + "step": 177 + }, + { + "epoch": 0.12043301759133965, + "grad_norm": 21.067527031604392, + "learning_rate": 4.009009009009009e-06, + "loss": 2.7417, + "step": 178 + }, + { + "epoch": 0.12110960757780785, + "grad_norm": 27.504039817885566, + "learning_rate": 4.031531531531531e-06, + "loss": 2.6387, + "step": 179 + }, + { + "epoch": 0.12178619756427606, + "grad_norm": 40.89949692772889, + "learning_rate": 4.0540540540540545e-06, + "loss": 2.7225, + "step": 180 + }, + { + "epoch": 0.12246278755074425, + "grad_norm": 29.287860242320992, + "learning_rate": 4.076576576576577e-06, + "loss": 2.6841, + "step": 181 + }, + { + "epoch": 0.12313937753721245, + "grad_norm": 23.16703424289219, + "learning_rate": 4.099099099099099e-06, + "loss": 2.5594, + "step": 182 + }, + { + "epoch": 0.12381596752368065, + "grad_norm": 29.543799764885705, + "learning_rate": 4.121621621621622e-06, + "loss": 2.6742, + "step": 183 + }, + { + "epoch": 0.12449255751014884, + "grad_norm": 34.95327011699371, + "learning_rate": 4.1441441441441446e-06, + "loss": 2.6456, + "step": 184 + }, + { + "epoch": 0.12516914749661706, + "grad_norm": 29.66519736577202, + "learning_rate": 4.166666666666667e-06, + "loss": 2.6835, + "step": 185 + }, + { + "epoch": 0.12584573748308525, + "grad_norm": 24.40015340294758, + "learning_rate": 4.189189189189189e-06, + "loss": 2.5426, + "step": 186 + }, + { + "epoch": 0.12652232746955344, + "grad_norm": 35.7957191425465, + "learning_rate": 4.2117117117117115e-06, + "loss": 2.609, + "step": 187 + }, + { + "epoch": 0.12719891745602166, + "grad_norm": 32.11830023640179, + "learning_rate": 4.234234234234235e-06, + "loss": 2.6174, + "step": 188 + }, + { + "epoch": 0.12787550744248985, + "grad_norm": 27.713420304792045, + "learning_rate": 4.256756756756757e-06, + "loss": 2.6129, + "step": 189 + }, + { + "epoch": 0.12855209742895804, + "grad_norm": 25.13598174696013, + "learning_rate": 4.27927927927928e-06, + "loss": 2.4842, + "step": 190 + }, + { + "epoch": 0.12922868741542626, + "grad_norm": 20.315219130144893, + "learning_rate": 4.301801801801802e-06, + "loss": 2.5572, + "step": 191 + }, + { + "epoch": 0.12990527740189445, + "grad_norm": 22.887320533614375, + "learning_rate": 4.324324324324325e-06, + "loss": 2.5483, + "step": 192 + }, + { + "epoch": 0.13058186738836267, + "grad_norm": 20.914247782101484, + "learning_rate": 4.346846846846847e-06, + "loss": 2.5147, + "step": 193 + }, + { + "epoch": 0.13125845737483086, + "grad_norm": 20.697866625432855, + "learning_rate": 4.369369369369369e-06, + "loss": 2.5196, + "step": 194 + }, + { + "epoch": 0.13193504736129905, + "grad_norm": 23.49583918293928, + "learning_rate": 4.391891891891892e-06, + "loss": 2.4808, + "step": 195 + }, + { + "epoch": 0.13261163734776726, + "grad_norm": 32.74575243265221, + "learning_rate": 4.414414414414415e-06, + "loss": 2.4672, + "step": 196 + }, + { + "epoch": 0.13328822733423545, + "grad_norm": 22.23921840689608, + "learning_rate": 4.436936936936938e-06, + "loss": 2.452, + "step": 197 + }, + { + "epoch": 0.13396481732070364, + "grad_norm": 21.013548498079665, + "learning_rate": 4.45945945945946e-06, + "loss": 2.3407, + "step": 198 + }, + { + "epoch": 0.13464140730717186, + "grad_norm": 31.878428050622567, + "learning_rate": 4.4819819819819824e-06, + "loss": 2.469, + "step": 199 + }, + { + "epoch": 0.13531799729364005, + "grad_norm": 22.974412989264575, + "learning_rate": 4.504504504504505e-06, + "loss": 2.4717, + "step": 200 + }, + { + "epoch": 0.13599458728010824, + "grad_norm": 17.882363303347567, + "learning_rate": 4.527027027027027e-06, + "loss": 2.4163, + "step": 201 + }, + { + "epoch": 0.13667117726657646, + "grad_norm": 22.493322907839186, + "learning_rate": 4.54954954954955e-06, + "loss": 2.4467, + "step": 202 + }, + { + "epoch": 0.13734776725304465, + "grad_norm": 19.548539758212335, + "learning_rate": 4.5720720720720725e-06, + "loss": 2.3677, + "step": 203 + }, + { + "epoch": 0.13802435723951287, + "grad_norm": 22.72777213575744, + "learning_rate": 4.594594594594596e-06, + "loss": 2.4086, + "step": 204 + }, + { + "epoch": 0.13870094722598106, + "grad_norm": 22.499189227034087, + "learning_rate": 4.617117117117118e-06, + "loss": 2.3678, + "step": 205 + }, + { + "epoch": 0.13937753721244925, + "grad_norm": 26.867693885650482, + "learning_rate": 4.63963963963964e-06, + "loss": 2.2966, + "step": 206 + }, + { + "epoch": 0.14005412719891747, + "grad_norm": 18.98888773696992, + "learning_rate": 4.6621621621621625e-06, + "loss": 2.2594, + "step": 207 + }, + { + "epoch": 0.14073071718538566, + "grad_norm": 20.923260812895332, + "learning_rate": 4.684684684684685e-06, + "loss": 2.3373, + "step": 208 + }, + { + "epoch": 0.14140730717185385, + "grad_norm": 22.43708159337281, + "learning_rate": 4.707207207207208e-06, + "loss": 2.349, + "step": 209 + }, + { + "epoch": 0.14208389715832206, + "grad_norm": 15.791501622620217, + "learning_rate": 4.72972972972973e-06, + "loss": 2.2408, + "step": 210 + }, + { + "epoch": 0.14276048714479025, + "grad_norm": 27.9477620074452, + "learning_rate": 4.7522522522522526e-06, + "loss": 2.3142, + "step": 211 + }, + { + "epoch": 0.14343707713125844, + "grad_norm": 20.28378726114416, + "learning_rate": 4.774774774774775e-06, + "loss": 2.2812, + "step": 212 + }, + { + "epoch": 0.14411366711772666, + "grad_norm": 23.301416792731526, + "learning_rate": 4.797297297297297e-06, + "loss": 2.2495, + "step": 213 + }, + { + "epoch": 0.14479025710419485, + "grad_norm": 20.477578746933535, + "learning_rate": 4.81981981981982e-06, + "loss": 2.2285, + "step": 214 + }, + { + "epoch": 0.14546684709066307, + "grad_norm": 15.941850024131293, + "learning_rate": 4.842342342342343e-06, + "loss": 2.2056, + "step": 215 + }, + { + "epoch": 0.14614343707713126, + "grad_norm": 32.42025261478585, + "learning_rate": 4.864864864864866e-06, + "loss": 2.3471, + "step": 216 + }, + { + "epoch": 0.14682002706359945, + "grad_norm": 18.338599158407256, + "learning_rate": 4.887387387387388e-06, + "loss": 2.2433, + "step": 217 + }, + { + "epoch": 0.14749661705006767, + "grad_norm": 20.454382479996063, + "learning_rate": 4.90990990990991e-06, + "loss": 2.2717, + "step": 218 + }, + { + "epoch": 0.14817320703653586, + "grad_norm": 16.14586789867164, + "learning_rate": 4.932432432432433e-06, + "loss": 2.2331, + "step": 219 + }, + { + "epoch": 0.14884979702300405, + "grad_norm": 22.113495495267212, + "learning_rate": 4.954954954954955e-06, + "loss": 2.1726, + "step": 220 + }, + { + "epoch": 0.14952638700947227, + "grad_norm": 15.95883139356251, + "learning_rate": 4.977477477477478e-06, + "loss": 2.1805, + "step": 221 + }, + { + "epoch": 0.15020297699594046, + "grad_norm": 18.79118628674278, + "learning_rate": 5e-06, + "loss": 2.148, + "step": 222 + }, + { + "epoch": 0.15087956698240865, + "grad_norm": 14.403072472289919, + "learning_rate": 5.022522522522523e-06, + "loss": 2.1814, + "step": 223 + }, + { + "epoch": 0.15155615696887687, + "grad_norm": 16.137035847145082, + "learning_rate": 5.045045045045045e-06, + "loss": 2.1728, + "step": 224 + }, + { + "epoch": 0.15223274695534506, + "grad_norm": 18.356637411169537, + "learning_rate": 5.067567567567568e-06, + "loss": 2.1512, + "step": 225 + }, + { + "epoch": 0.15290933694181327, + "grad_norm": 16.504673195570593, + "learning_rate": 5.0900900900900905e-06, + "loss": 2.1588, + "step": 226 + }, + { + "epoch": 0.15358592692828146, + "grad_norm": 13.899118228878194, + "learning_rate": 5.112612612612613e-06, + "loss": 2.0877, + "step": 227 + }, + { + "epoch": 0.15426251691474965, + "grad_norm": 17.912245295524656, + "learning_rate": 5.135135135135135e-06, + "loss": 2.1466, + "step": 228 + }, + { + "epoch": 0.15493910690121787, + "grad_norm": 13.534422946138394, + "learning_rate": 5.157657657657657e-06, + "loss": 2.0077, + "step": 229 + }, + { + "epoch": 0.15561569688768606, + "grad_norm": 22.62031309389824, + "learning_rate": 5.180180180180181e-06, + "loss": 2.1199, + "step": 230 + }, + { + "epoch": 0.15629228687415425, + "grad_norm": 15.743397945734683, + "learning_rate": 5.202702702702704e-06, + "loss": 2.0383, + "step": 231 + }, + { + "epoch": 0.15696887686062247, + "grad_norm": 19.395021397972336, + "learning_rate": 5.225225225225226e-06, + "loss": 1.985, + "step": 232 + }, + { + "epoch": 0.15764546684709066, + "grad_norm": 13.009112032469817, + "learning_rate": 5.247747747747748e-06, + "loss": 2.0359, + "step": 233 + }, + { + "epoch": 0.15832205683355885, + "grad_norm": 20.359644663731412, + "learning_rate": 5.2702702702702705e-06, + "loss": 2.0488, + "step": 234 + }, + { + "epoch": 0.15899864682002707, + "grad_norm": 15.53405031855752, + "learning_rate": 5.292792792792794e-06, + "loss": 2.0, + "step": 235 + }, + { + "epoch": 0.15967523680649526, + "grad_norm": 14.809102908639488, + "learning_rate": 5.315315315315316e-06, + "loss": 2.0319, + "step": 236 + }, + { + "epoch": 0.16035182679296348, + "grad_norm": 19.88444381670935, + "learning_rate": 5.337837837837838e-06, + "loss": 1.9792, + "step": 237 + }, + { + "epoch": 0.16102841677943167, + "grad_norm": 13.118123019183166, + "learning_rate": 5.360360360360361e-06, + "loss": 1.9924, + "step": 238 + }, + { + "epoch": 0.16170500676589986, + "grad_norm": 18.482436970789802, + "learning_rate": 5.382882882882884e-06, + "loss": 2.0255, + "step": 239 + }, + { + "epoch": 0.16238159675236807, + "grad_norm": 17.198808199707006, + "learning_rate": 5.405405405405406e-06, + "loss": 1.9914, + "step": 240 + }, + { + "epoch": 0.16305818673883626, + "grad_norm": 18.95392851638832, + "learning_rate": 5.427927927927928e-06, + "loss": 2.0717, + "step": 241 + }, + { + "epoch": 0.16373477672530445, + "grad_norm": 13.5701694050597, + "learning_rate": 5.450450450450451e-06, + "loss": 2.0124, + "step": 242 + }, + { + "epoch": 0.16441136671177267, + "grad_norm": 15.414340952802318, + "learning_rate": 5.472972972972973e-06, + "loss": 1.9557, + "step": 243 + }, + { + "epoch": 0.16508795669824086, + "grad_norm": 17.440870830742558, + "learning_rate": 5.495495495495496e-06, + "loss": 1.9832, + "step": 244 + }, + { + "epoch": 0.16576454668470908, + "grad_norm": 14.239022528198992, + "learning_rate": 5.518018018018018e-06, + "loss": 1.9448, + "step": 245 + }, + { + "epoch": 0.16644113667117727, + "grad_norm": 17.431495370774222, + "learning_rate": 5.540540540540541e-06, + "loss": 1.9717, + "step": 246 + }, + { + "epoch": 0.16711772665764546, + "grad_norm": 14.79637144533563, + "learning_rate": 5.563063063063063e-06, + "loss": 1.9688, + "step": 247 + }, + { + "epoch": 0.16779431664411368, + "grad_norm": 18.333738504558415, + "learning_rate": 5.585585585585585e-06, + "loss": 2.0003, + "step": 248 + }, + { + "epoch": 0.16847090663058187, + "grad_norm": 13.005075190265611, + "learning_rate": 5.608108108108109e-06, + "loss": 1.96, + "step": 249 + }, + { + "epoch": 0.16914749661705006, + "grad_norm": 12.997802259515195, + "learning_rate": 5.6306306306306316e-06, + "loss": 1.8585, + "step": 250 + }, + { + "epoch": 0.16982408660351828, + "grad_norm": 11.014175768405828, + "learning_rate": 5.653153153153154e-06, + "loss": 1.8642, + "step": 251 + }, + { + "epoch": 0.17050067658998647, + "grad_norm": 18.048619090207655, + "learning_rate": 5.675675675675676e-06, + "loss": 1.9065, + "step": 252 + }, + { + "epoch": 0.17117726657645466, + "grad_norm": 15.060560783435594, + "learning_rate": 5.6981981981981985e-06, + "loss": 1.9563, + "step": 253 + }, + { + "epoch": 0.17185385656292287, + "grad_norm": 10.891591875795623, + "learning_rate": 5.720720720720722e-06, + "loss": 1.8022, + "step": 254 + }, + { + "epoch": 0.17253044654939106, + "grad_norm": 12.582318972666503, + "learning_rate": 5.743243243243244e-06, + "loss": 1.9277, + "step": 255 + }, + { + "epoch": 0.17320703653585928, + "grad_norm": 17.020939171544274, + "learning_rate": 5.765765765765766e-06, + "loss": 1.8127, + "step": 256 + }, + { + "epoch": 0.17388362652232747, + "grad_norm": 10.908213123889828, + "learning_rate": 5.7882882882882885e-06, + "loss": 1.7682, + "step": 257 + }, + { + "epoch": 0.17456021650879566, + "grad_norm": 12.609343066655516, + "learning_rate": 5.810810810810811e-06, + "loss": 1.8415, + "step": 258 + }, + { + "epoch": 0.17523680649526388, + "grad_norm": 12.051612217755917, + "learning_rate": 5.833333333333334e-06, + "loss": 1.782, + "step": 259 + }, + { + "epoch": 0.17591339648173207, + "grad_norm": 11.688029779834665, + "learning_rate": 5.855855855855856e-06, + "loss": 1.8429, + "step": 260 + }, + { + "epoch": 0.17658998646820026, + "grad_norm": 9.621847962601386, + "learning_rate": 5.8783783783783786e-06, + "loss": 1.7889, + "step": 261 + }, + { + "epoch": 0.17726657645466848, + "grad_norm": 11.637111117095575, + "learning_rate": 5.900900900900901e-06, + "loss": 1.7855, + "step": 262 + }, + { + "epoch": 0.17794316644113667, + "grad_norm": 15.340550756068295, + "learning_rate": 5.923423423423423e-06, + "loss": 1.8019, + "step": 263 + }, + { + "epoch": 0.17861975642760486, + "grad_norm": 11.88029892034545, + "learning_rate": 5.945945945945947e-06, + "loss": 1.7269, + "step": 264 + }, + { + "epoch": 0.17929634641407308, + "grad_norm": 10.274778570979059, + "learning_rate": 5.9684684684684694e-06, + "loss": 1.703, + "step": 265 + }, + { + "epoch": 0.17997293640054127, + "grad_norm": 13.750962892213895, + "learning_rate": 5.990990990990992e-06, + "loss": 1.7555, + "step": 266 + }, + { + "epoch": 0.18064952638700948, + "grad_norm": 11.485943554362244, + "learning_rate": 6.013513513513514e-06, + "loss": 1.734, + "step": 267 + }, + { + "epoch": 0.18132611637347767, + "grad_norm": 13.961022884592403, + "learning_rate": 6.036036036036037e-06, + "loss": 1.7312, + "step": 268 + }, + { + "epoch": 0.18200270635994586, + "grad_norm": 13.429683042761775, + "learning_rate": 6.0585585585585595e-06, + "loss": 1.6984, + "step": 269 + }, + { + "epoch": 0.18267929634641408, + "grad_norm": 10.691005666701471, + "learning_rate": 6.081081081081082e-06, + "loss": 1.7554, + "step": 270 + }, + { + "epoch": 0.18335588633288227, + "grad_norm": 9.791557551761352, + "learning_rate": 6.103603603603604e-06, + "loss": 1.7218, + "step": 271 + }, + { + "epoch": 0.18403247631935046, + "grad_norm": 12.162695738299966, + "learning_rate": 6.126126126126126e-06, + "loss": 1.6978, + "step": 272 + }, + { + "epoch": 0.18470906630581868, + "grad_norm": 12.52239274552492, + "learning_rate": 6.1486486486486495e-06, + "loss": 1.7204, + "step": 273 + }, + { + "epoch": 0.18538565629228687, + "grad_norm": 11.069520567295438, + "learning_rate": 6.171171171171172e-06, + "loss": 1.8051, + "step": 274 + }, + { + "epoch": 0.18606224627875506, + "grad_norm": 13.646838283946098, + "learning_rate": 6.193693693693694e-06, + "loss": 1.7197, + "step": 275 + }, + { + "epoch": 0.18673883626522328, + "grad_norm": 10.739838933303147, + "learning_rate": 6.2162162162162164e-06, + "loss": 1.6782, + "step": 276 + }, + { + "epoch": 0.18741542625169147, + "grad_norm": 11.402083709510906, + "learning_rate": 6.238738738738739e-06, + "loss": 1.6366, + "step": 277 + }, + { + "epoch": 0.1880920162381597, + "grad_norm": 12.6985139467155, + "learning_rate": 6.261261261261262e-06, + "loss": 1.7035, + "step": 278 + }, + { + "epoch": 0.18876860622462788, + "grad_norm": 11.95572588600681, + "learning_rate": 6.283783783783784e-06, + "loss": 1.7244, + "step": 279 + }, + { + "epoch": 0.18944519621109607, + "grad_norm": 13.151801727488719, + "learning_rate": 6.3063063063063065e-06, + "loss": 1.7128, + "step": 280 + }, + { + "epoch": 0.19012178619756429, + "grad_norm": 9.841504797900614, + "learning_rate": 6.328828828828829e-06, + "loss": 1.6895, + "step": 281 + }, + { + "epoch": 0.19079837618403248, + "grad_norm": 9.787646149578237, + "learning_rate": 6.351351351351351e-06, + "loss": 1.7138, + "step": 282 + }, + { + "epoch": 0.19147496617050067, + "grad_norm": 11.161339025880242, + "learning_rate": 6.373873873873875e-06, + "loss": 1.6576, + "step": 283 + }, + { + "epoch": 0.19215155615696888, + "grad_norm": 9.159618429348464, + "learning_rate": 6.396396396396397e-06, + "loss": 1.5906, + "step": 284 + }, + { + "epoch": 0.19282814614343707, + "grad_norm": 10.811401066646267, + "learning_rate": 6.41891891891892e-06, + "loss": 1.6524, + "step": 285 + }, + { + "epoch": 0.19350473612990526, + "grad_norm": 11.045079449022317, + "learning_rate": 6.441441441441442e-06, + "loss": 1.6468, + "step": 286 + }, + { + "epoch": 0.19418132611637348, + "grad_norm": 9.955353642039189, + "learning_rate": 6.463963963963964e-06, + "loss": 1.6601, + "step": 287 + }, + { + "epoch": 0.19485791610284167, + "grad_norm": 11.353233362482896, + "learning_rate": 6.486486486486487e-06, + "loss": 1.6481, + "step": 288 + }, + { + "epoch": 0.1955345060893099, + "grad_norm": 11.4616219039945, + "learning_rate": 6.50900900900901e-06, + "loss": 1.6293, + "step": 289 + }, + { + "epoch": 0.19621109607577808, + "grad_norm": 8.799864129115917, + "learning_rate": 6.531531531531532e-06, + "loss": 1.548, + "step": 290 + }, + { + "epoch": 0.19688768606224627, + "grad_norm": 12.497463518680908, + "learning_rate": 6.554054054054054e-06, + "loss": 1.6479, + "step": 291 + }, + { + "epoch": 0.1975642760487145, + "grad_norm": 11.300656550432288, + "learning_rate": 6.5765765765765775e-06, + "loss": 1.5232, + "step": 292 + }, + { + "epoch": 0.19824086603518268, + "grad_norm": 11.503528750517631, + "learning_rate": 6.5990990990991e-06, + "loss": 1.6006, + "step": 293 + }, + { + "epoch": 0.19891745602165087, + "grad_norm": 7.911223197398728, + "learning_rate": 6.621621621621622e-06, + "loss": 1.5595, + "step": 294 + }, + { + "epoch": 0.19959404600811909, + "grad_norm": 11.536759095047055, + "learning_rate": 6.644144144144144e-06, + "loss": 1.5718, + "step": 295 + }, + { + "epoch": 0.20027063599458728, + "grad_norm": 8.515018062684854, + "learning_rate": 6.666666666666667e-06, + "loss": 1.5974, + "step": 296 + }, + { + "epoch": 0.2009472259810555, + "grad_norm": 11.189846996969566, + "learning_rate": 6.689189189189191e-06, + "loss": 1.548, + "step": 297 + }, + { + "epoch": 0.20162381596752368, + "grad_norm": 8.714980297029278, + "learning_rate": 6.711711711711713e-06, + "loss": 1.5136, + "step": 298 + }, + { + "epoch": 0.20230040595399187, + "grad_norm": 12.00408954819733, + "learning_rate": 6.734234234234235e-06, + "loss": 1.6347, + "step": 299 + }, + { + "epoch": 0.2029769959404601, + "grad_norm": 10.798788556467306, + "learning_rate": 6.7567567567567575e-06, + "loss": 1.5315, + "step": 300 + }, + { + "epoch": 0.20365358592692828, + "grad_norm": 9.59027269733314, + "learning_rate": 6.77927927927928e-06, + "loss": 1.4653, + "step": 301 + }, + { + "epoch": 0.20433017591339647, + "grad_norm": 9.209203507811885, + "learning_rate": 6.801801801801803e-06, + "loss": 1.5597, + "step": 302 + }, + { + "epoch": 0.2050067658998647, + "grad_norm": 10.137126068903864, + "learning_rate": 6.824324324324325e-06, + "loss": 1.5163, + "step": 303 + }, + { + "epoch": 0.20568335588633288, + "grad_norm": 9.68307545199484, + "learning_rate": 6.846846846846848e-06, + "loss": 1.5265, + "step": 304 + }, + { + "epoch": 0.20635994587280107, + "grad_norm": 8.807889928010955, + "learning_rate": 6.86936936936937e-06, + "loss": 1.4709, + "step": 305 + }, + { + "epoch": 0.2070365358592693, + "grad_norm": 11.257125948904491, + "learning_rate": 6.891891891891892e-06, + "loss": 1.5649, + "step": 306 + }, + { + "epoch": 0.20771312584573748, + "grad_norm": 8.703225933763383, + "learning_rate": 6.914414414414415e-06, + "loss": 1.4575, + "step": 307 + }, + { + "epoch": 0.2083897158322057, + "grad_norm": 9.511676852333661, + "learning_rate": 6.936936936936938e-06, + "loss": 1.5125, + "step": 308 + }, + { + "epoch": 0.2090663058186739, + "grad_norm": 8.873193504576163, + "learning_rate": 6.95945945945946e-06, + "loss": 1.4193, + "step": 309 + }, + { + "epoch": 0.20974289580514208, + "grad_norm": 8.468313508182078, + "learning_rate": 6.981981981981982e-06, + "loss": 1.494, + "step": 310 + }, + { + "epoch": 0.2104194857916103, + "grad_norm": 8.614596249065656, + "learning_rate": 7.0045045045045045e-06, + "loss": 1.5012, + "step": 311 + }, + { + "epoch": 0.21109607577807848, + "grad_norm": 9.01193447897298, + "learning_rate": 7.027027027027028e-06, + "loss": 1.4495, + "step": 312 + }, + { + "epoch": 0.21177266576454667, + "grad_norm": 9.198271014590313, + "learning_rate": 7.04954954954955e-06, + "loss": 1.4238, + "step": 313 + }, + { + "epoch": 0.2124492557510149, + "grad_norm": 8.315800768848687, + "learning_rate": 7.072072072072072e-06, + "loss": 1.4382, + "step": 314 + }, + { + "epoch": 0.21312584573748308, + "grad_norm": 8.162579495233, + "learning_rate": 7.0945945945945946e-06, + "loss": 1.4358, + "step": 315 + }, + { + "epoch": 0.21380243572395127, + "grad_norm": 8.382615195195696, + "learning_rate": 7.117117117117117e-06, + "loss": 1.4738, + "step": 316 + }, + { + "epoch": 0.2144790257104195, + "grad_norm": 6.873539297867009, + "learning_rate": 7.139639639639641e-06, + "loss": 1.4068, + "step": 317 + }, + { + "epoch": 0.21515561569688768, + "grad_norm": 8.057956476417067, + "learning_rate": 7.162162162162163e-06, + "loss": 1.4231, + "step": 318 + }, + { + "epoch": 0.2158322056833559, + "grad_norm": 8.760194161062197, + "learning_rate": 7.1846846846846855e-06, + "loss": 1.432, + "step": 319 + }, + { + "epoch": 0.2165087956698241, + "grad_norm": 8.088862479265627, + "learning_rate": 7.207207207207208e-06, + "loss": 1.317, + "step": 320 + }, + { + "epoch": 0.21718538565629228, + "grad_norm": 7.349788195182965, + "learning_rate": 7.229729729729731e-06, + "loss": 1.3908, + "step": 321 + }, + { + "epoch": 0.2178619756427605, + "grad_norm": 7.161188967637041, + "learning_rate": 7.252252252252253e-06, + "loss": 1.3914, + "step": 322 + }, + { + "epoch": 0.2185385656292287, + "grad_norm": 7.435624966331327, + "learning_rate": 7.2747747747747755e-06, + "loss": 1.4447, + "step": 323 + }, + { + "epoch": 0.21921515561569688, + "grad_norm": 6.450117600056855, + "learning_rate": 7.297297297297298e-06, + "loss": 1.353, + "step": 324 + }, + { + "epoch": 0.2198917456021651, + "grad_norm": 9.342882898605941, + "learning_rate": 7.31981981981982e-06, + "loss": 1.3382, + "step": 325 + }, + { + "epoch": 0.22056833558863329, + "grad_norm": 9.279944488980599, + "learning_rate": 7.342342342342343e-06, + "loss": 1.414, + "step": 326 + }, + { + "epoch": 0.22124492557510148, + "grad_norm": 7.099845515132716, + "learning_rate": 7.3648648648648655e-06, + "loss": 1.3692, + "step": 327 + }, + { + "epoch": 0.2219215155615697, + "grad_norm": 9.031425541989027, + "learning_rate": 7.387387387387388e-06, + "loss": 1.4173, + "step": 328 + }, + { + "epoch": 0.22259810554803788, + "grad_norm": 7.423615320613807, + "learning_rate": 7.40990990990991e-06, + "loss": 1.3511, + "step": 329 + }, + { + "epoch": 0.2232746955345061, + "grad_norm": 8.339528431625467, + "learning_rate": 7.4324324324324324e-06, + "loss": 1.3445, + "step": 330 + }, + { + "epoch": 0.2239512855209743, + "grad_norm": 8.991790302489601, + "learning_rate": 7.4549549549549564e-06, + "loss": 1.3122, + "step": 331 + }, + { + "epoch": 0.22462787550744248, + "grad_norm": 6.199790631454467, + "learning_rate": 7.477477477477479e-06, + "loss": 1.3809, + "step": 332 + }, + { + "epoch": 0.2253044654939107, + "grad_norm": 8.099748400756368, + "learning_rate": 7.500000000000001e-06, + "loss": 1.3352, + "step": 333 + }, + { + "epoch": 0.2259810554803789, + "grad_norm": 8.056852169659802, + "learning_rate": 7.5225225225225225e-06, + "loss": 1.3156, + "step": 334 + }, + { + "epoch": 0.22665764546684708, + "grad_norm": 6.686240008496426, + "learning_rate": 7.545045045045045e-06, + "loss": 1.3499, + "step": 335 + }, + { + "epoch": 0.2273342354533153, + "grad_norm": 7.9309282102043035, + "learning_rate": 7.567567567567569e-06, + "loss": 1.3324, + "step": 336 + }, + { + "epoch": 0.2280108254397835, + "grad_norm": 7.77658996644435, + "learning_rate": 7.590090090090091e-06, + "loss": 1.2565, + "step": 337 + }, + { + "epoch": 0.22868741542625168, + "grad_norm": 6.324303404965645, + "learning_rate": 7.612612612612613e-06, + "loss": 1.3374, + "step": 338 + }, + { + "epoch": 0.2293640054127199, + "grad_norm": 8.628015931255625, + "learning_rate": 7.635135135135135e-06, + "loss": 1.3013, + "step": 339 + }, + { + "epoch": 0.23004059539918809, + "grad_norm": 7.683595210549564, + "learning_rate": 7.657657657657658e-06, + "loss": 1.3019, + "step": 340 + }, + { + "epoch": 0.2307171853856563, + "grad_norm": 7.104540289089895, + "learning_rate": 7.680180180180181e-06, + "loss": 1.2834, + "step": 341 + }, + { + "epoch": 0.2313937753721245, + "grad_norm": 10.484418721383305, + "learning_rate": 7.702702702702704e-06, + "loss": 1.3192, + "step": 342 + }, + { + "epoch": 0.23207036535859268, + "grad_norm": 7.529555458611565, + "learning_rate": 7.725225225225226e-06, + "loss": 1.3345, + "step": 343 + }, + { + "epoch": 0.2327469553450609, + "grad_norm": 7.304557513705602, + "learning_rate": 7.747747747747749e-06, + "loss": 1.2718, + "step": 344 + }, + { + "epoch": 0.2334235453315291, + "grad_norm": 7.468281142178066, + "learning_rate": 7.77027027027027e-06, + "loss": 1.3459, + "step": 345 + }, + { + "epoch": 0.23410013531799728, + "grad_norm": 7.308215956657339, + "learning_rate": 7.792792792792793e-06, + "loss": 1.3549, + "step": 346 + }, + { + "epoch": 0.2347767253044655, + "grad_norm": 7.511453954818753, + "learning_rate": 7.815315315315317e-06, + "loss": 1.2729, + "step": 347 + }, + { + "epoch": 0.2354533152909337, + "grad_norm": 6.692314776874314, + "learning_rate": 7.837837837837838e-06, + "loss": 1.3146, + "step": 348 + }, + { + "epoch": 0.2361299052774019, + "grad_norm": 7.555862621620149, + "learning_rate": 7.860360360360361e-06, + "loss": 1.3208, + "step": 349 + }, + { + "epoch": 0.2368064952638701, + "grad_norm": 7.867176009962364, + "learning_rate": 7.882882882882884e-06, + "loss": 1.2837, + "step": 350 + }, + { + "epoch": 0.2374830852503383, + "grad_norm": 6.821961011231683, + "learning_rate": 7.905405405405406e-06, + "loss": 1.2689, + "step": 351 + }, + { + "epoch": 0.2381596752368065, + "grad_norm": 6.829813150156765, + "learning_rate": 7.927927927927929e-06, + "loss": 1.2085, + "step": 352 + }, + { + "epoch": 0.2388362652232747, + "grad_norm": 6.416445487360607, + "learning_rate": 7.95045045045045e-06, + "loss": 1.3142, + "step": 353 + }, + { + "epoch": 0.2395128552097429, + "grad_norm": 6.755985297167506, + "learning_rate": 7.972972972972974e-06, + "loss": 1.2437, + "step": 354 + }, + { + "epoch": 0.2401894451962111, + "grad_norm": 5.63750947182298, + "learning_rate": 7.995495495495497e-06, + "loss": 1.1827, + "step": 355 + }, + { + "epoch": 0.2408660351826793, + "grad_norm": 8.41215274775629, + "learning_rate": 8.018018018018018e-06, + "loss": 1.3291, + "step": 356 + }, + { + "epoch": 0.24154262516914748, + "grad_norm": 5.695299828620484, + "learning_rate": 8.040540540540541e-06, + "loss": 1.2339, + "step": 357 + }, + { + "epoch": 0.2422192151556157, + "grad_norm": 6.303783125664122, + "learning_rate": 8.063063063063063e-06, + "loss": 1.2393, + "step": 358 + }, + { + "epoch": 0.2428958051420839, + "grad_norm": 5.818013793977989, + "learning_rate": 8.085585585585586e-06, + "loss": 1.264, + "step": 359 + }, + { + "epoch": 0.2435723951285521, + "grad_norm": 6.00149757812294, + "learning_rate": 8.108108108108109e-06, + "loss": 1.25, + "step": 360 + }, + { + "epoch": 0.2442489851150203, + "grad_norm": 7.294950151985955, + "learning_rate": 8.130630630630632e-06, + "loss": 1.2442, + "step": 361 + }, + { + "epoch": 0.2449255751014885, + "grad_norm": 7.319385571720754, + "learning_rate": 8.153153153153154e-06, + "loss": 1.2517, + "step": 362 + }, + { + "epoch": 0.2456021650879567, + "grad_norm": 6.556333265442866, + "learning_rate": 8.175675675675677e-06, + "loss": 1.2393, + "step": 363 + }, + { + "epoch": 0.2462787550744249, + "grad_norm": 5.537101135836502, + "learning_rate": 8.198198198198198e-06, + "loss": 1.1824, + "step": 364 + }, + { + "epoch": 0.2469553450608931, + "grad_norm": 6.095023950167767, + "learning_rate": 8.220720720720721e-06, + "loss": 1.2078, + "step": 365 + }, + { + "epoch": 0.2476319350473613, + "grad_norm": 6.693002070569304, + "learning_rate": 8.243243243243245e-06, + "loss": 1.1672, + "step": 366 + }, + { + "epoch": 0.2483085250338295, + "grad_norm": 4.778897755508491, + "learning_rate": 8.265765765765766e-06, + "loss": 1.2018, + "step": 367 + }, + { + "epoch": 0.2489851150202977, + "grad_norm": 6.41741003265316, + "learning_rate": 8.288288288288289e-06, + "loss": 1.2732, + "step": 368 + }, + { + "epoch": 0.2496617050067659, + "grad_norm": 6.5817670328837545, + "learning_rate": 8.31081081081081e-06, + "loss": 1.138, + "step": 369 + }, + { + "epoch": 0.2503382949932341, + "grad_norm": 4.797250856682376, + "learning_rate": 8.333333333333334e-06, + "loss": 1.1449, + "step": 370 + }, + { + "epoch": 0.2510148849797023, + "grad_norm": 6.884122726840821, + "learning_rate": 8.355855855855857e-06, + "loss": 1.2155, + "step": 371 + }, + { + "epoch": 0.2516914749661705, + "grad_norm": 6.599119872104339, + "learning_rate": 8.378378378378378e-06, + "loss": 1.1852, + "step": 372 + }, + { + "epoch": 0.2523680649526387, + "grad_norm": 5.369629044197062, + "learning_rate": 8.400900900900901e-06, + "loss": 1.1606, + "step": 373 + }, + { + "epoch": 0.2530446549391069, + "grad_norm": 5.812549305075738, + "learning_rate": 8.423423423423423e-06, + "loss": 1.2094, + "step": 374 + }, + { + "epoch": 0.25372124492557513, + "grad_norm": 5.223599468834814, + "learning_rate": 8.445945945945948e-06, + "loss": 1.0854, + "step": 375 + }, + { + "epoch": 0.2543978349120433, + "grad_norm": 4.905683717210968, + "learning_rate": 8.46846846846847e-06, + "loss": 1.1488, + "step": 376 + }, + { + "epoch": 0.2550744248985115, + "grad_norm": 5.421234863754259, + "learning_rate": 8.490990990990992e-06, + "loss": 1.167, + "step": 377 + }, + { + "epoch": 0.2557510148849797, + "grad_norm": 6.8698555994818635, + "learning_rate": 8.513513513513514e-06, + "loss": 1.1931, + "step": 378 + }, + { + "epoch": 0.2564276048714479, + "grad_norm": 5.214335512005905, + "learning_rate": 8.536036036036037e-06, + "loss": 1.1036, + "step": 379 + }, + { + "epoch": 0.2571041948579161, + "grad_norm": 5.631060836540709, + "learning_rate": 8.55855855855856e-06, + "loss": 1.1909, + "step": 380 + }, + { + "epoch": 0.2577807848443843, + "grad_norm": 4.675672390186224, + "learning_rate": 8.581081081081082e-06, + "loss": 1.1228, + "step": 381 + }, + { + "epoch": 0.2584573748308525, + "grad_norm": 6.334523111748299, + "learning_rate": 8.603603603603605e-06, + "loss": 1.1194, + "step": 382 + }, + { + "epoch": 0.2591339648173207, + "grad_norm": 5.391808799329787, + "learning_rate": 8.626126126126126e-06, + "loss": 1.2031, + "step": 383 + }, + { + "epoch": 0.2598105548037889, + "grad_norm": 4.951535252198407, + "learning_rate": 8.64864864864865e-06, + "loss": 1.1924, + "step": 384 + }, + { + "epoch": 0.2604871447902571, + "grad_norm": 5.833225590221079, + "learning_rate": 8.671171171171172e-06, + "loss": 1.1505, + "step": 385 + }, + { + "epoch": 0.26116373477672533, + "grad_norm": 4.990875084577596, + "learning_rate": 8.693693693693694e-06, + "loss": 1.1134, + "step": 386 + }, + { + "epoch": 0.2618403247631935, + "grad_norm": 4.902817282647241, + "learning_rate": 8.716216216216217e-06, + "loss": 1.1128, + "step": 387 + }, + { + "epoch": 0.2625169147496617, + "grad_norm": 4.494411184327583, + "learning_rate": 8.738738738738739e-06, + "loss": 1.1129, + "step": 388 + }, + { + "epoch": 0.2631935047361299, + "grad_norm": 4.888423603959499, + "learning_rate": 8.761261261261262e-06, + "loss": 1.18, + "step": 389 + }, + { + "epoch": 0.2638700947225981, + "grad_norm": 5.832594841589209, + "learning_rate": 8.783783783783785e-06, + "loss": 1.1547, + "step": 390 + }, + { + "epoch": 0.2645466847090663, + "grad_norm": 5.235913407042474, + "learning_rate": 8.806306306306306e-06, + "loss": 1.1537, + "step": 391 + }, + { + "epoch": 0.2652232746955345, + "grad_norm": 4.785604512532903, + "learning_rate": 8.82882882882883e-06, + "loss": 1.1732, + "step": 392 + }, + { + "epoch": 0.2658998646820027, + "grad_norm": 5.313118558349346, + "learning_rate": 8.851351351351351e-06, + "loss": 1.0713, + "step": 393 + }, + { + "epoch": 0.2665764546684709, + "grad_norm": 4.827250818420794, + "learning_rate": 8.873873873873876e-06, + "loss": 1.0751, + "step": 394 + }, + { + "epoch": 0.2672530446549391, + "grad_norm": 4.874301936478751, + "learning_rate": 8.896396396396397e-06, + "loss": 1.0652, + "step": 395 + }, + { + "epoch": 0.2679296346414073, + "grad_norm": 4.8435780244924915, + "learning_rate": 8.91891891891892e-06, + "loss": 1.0936, + "step": 396 + }, + { + "epoch": 0.26860622462787553, + "grad_norm": 4.343161185327432, + "learning_rate": 8.941441441441442e-06, + "loss": 1.0229, + "step": 397 + }, + { + "epoch": 0.2692828146143437, + "grad_norm": 4.913175619810589, + "learning_rate": 8.963963963963965e-06, + "loss": 1.0129, + "step": 398 + }, + { + "epoch": 0.2699594046008119, + "grad_norm": 4.206729558415461, + "learning_rate": 8.986486486486488e-06, + "loss": 1.07, + "step": 399 + }, + { + "epoch": 0.2706359945872801, + "grad_norm": 4.5465248961906175, + "learning_rate": 9.00900900900901e-06, + "loss": 1.006, + "step": 400 + }, + { + "epoch": 0.2713125845737483, + "grad_norm": 4.64094117178924, + "learning_rate": 9.031531531531533e-06, + "loss": 1.0352, + "step": 401 + }, + { + "epoch": 0.2719891745602165, + "grad_norm": 4.941072707337412, + "learning_rate": 9.054054054054054e-06, + "loss": 1.1037, + "step": 402 + }, + { + "epoch": 0.27266576454668473, + "grad_norm": 5.532157984209573, + "learning_rate": 9.076576576576577e-06, + "loss": 1.0686, + "step": 403 + }, + { + "epoch": 0.2733423545331529, + "grad_norm": 4.324221301885782, + "learning_rate": 9.0990990990991e-06, + "loss": 1.0886, + "step": 404 + }, + { + "epoch": 0.2740189445196211, + "grad_norm": 4.672612441909186, + "learning_rate": 9.121621621621622e-06, + "loss": 1.0678, + "step": 405 + }, + { + "epoch": 0.2746955345060893, + "grad_norm": 3.8658548636324337, + "learning_rate": 9.144144144144145e-06, + "loss": 1.0214, + "step": 406 + }, + { + "epoch": 0.2753721244925575, + "grad_norm": 4.774743242339752, + "learning_rate": 9.166666666666666e-06, + "loss": 1.0256, + "step": 407 + }, + { + "epoch": 0.27604871447902574, + "grad_norm": 5.3993821745337565, + "learning_rate": 9.189189189189191e-06, + "loss": 1.0791, + "step": 408 + }, + { + "epoch": 0.2767253044654939, + "grad_norm": 5.057580982546297, + "learning_rate": 9.211711711711713e-06, + "loss": 1.0208, + "step": 409 + }, + { + "epoch": 0.2774018944519621, + "grad_norm": 4.924665267405325, + "learning_rate": 9.234234234234236e-06, + "loss": 1.0023, + "step": 410 + }, + { + "epoch": 0.2780784844384303, + "grad_norm": 4.232329566300859, + "learning_rate": 9.256756756756757e-06, + "loss": 1.0575, + "step": 411 + }, + { + "epoch": 0.2787550744248985, + "grad_norm": 5.239366528410246, + "learning_rate": 9.27927927927928e-06, + "loss": 1.0371, + "step": 412 + }, + { + "epoch": 0.2794316644113667, + "grad_norm": 4.783501098068623, + "learning_rate": 9.301801801801804e-06, + "loss": 1.0027, + "step": 413 + }, + { + "epoch": 0.28010825439783493, + "grad_norm": 3.9728448745576896, + "learning_rate": 9.324324324324325e-06, + "loss": 1.0467, + "step": 414 + }, + { + "epoch": 0.2807848443843031, + "grad_norm": 4.681528219035177, + "learning_rate": 9.346846846846848e-06, + "loss": 1.0132, + "step": 415 + }, + { + "epoch": 0.2814614343707713, + "grad_norm": 4.206918370997306, + "learning_rate": 9.36936936936937e-06, + "loss": 1.0553, + "step": 416 + }, + { + "epoch": 0.2821380243572395, + "grad_norm": 4.553415102778668, + "learning_rate": 9.391891891891893e-06, + "loss": 1.0007, + "step": 417 + }, + { + "epoch": 0.2828146143437077, + "grad_norm": 5.0528643415633905, + "learning_rate": 9.414414414414416e-06, + "loss": 1.0204, + "step": 418 + }, + { + "epoch": 0.28349120433017594, + "grad_norm": 4.3331179687879695, + "learning_rate": 9.436936936936937e-06, + "loss": 1.0374, + "step": 419 + }, + { + "epoch": 0.28416779431664413, + "grad_norm": 4.494054237607846, + "learning_rate": 9.45945945945946e-06, + "loss": 0.9342, + "step": 420 + }, + { + "epoch": 0.2848443843031123, + "grad_norm": 3.8300564753424964, + "learning_rate": 9.481981981981982e-06, + "loss": 1.0262, + "step": 421 + }, + { + "epoch": 0.2855209742895805, + "grad_norm": 3.9015045170124183, + "learning_rate": 9.504504504504505e-06, + "loss": 0.9976, + "step": 422 + }, + { + "epoch": 0.2861975642760487, + "grad_norm": 4.7139488491269725, + "learning_rate": 9.527027027027028e-06, + "loss": 0.9667, + "step": 423 + }, + { + "epoch": 0.2868741542625169, + "grad_norm": 3.9906815821303794, + "learning_rate": 9.54954954954955e-06, + "loss": 1.0557, + "step": 424 + }, + { + "epoch": 0.28755074424898514, + "grad_norm": 4.883974178298484, + "learning_rate": 9.572072072072073e-06, + "loss": 1.0379, + "step": 425 + }, + { + "epoch": 0.2882273342354533, + "grad_norm": 4.125238209478557, + "learning_rate": 9.594594594594594e-06, + "loss": 0.948, + "step": 426 + }, + { + "epoch": 0.2889039242219215, + "grad_norm": 4.067349144549682, + "learning_rate": 9.617117117117117e-06, + "loss": 0.9698, + "step": 427 + }, + { + "epoch": 0.2895805142083897, + "grad_norm": 4.479942419738399, + "learning_rate": 9.63963963963964e-06, + "loss": 0.9977, + "step": 428 + }, + { + "epoch": 0.2902571041948579, + "grad_norm": 3.849447458341483, + "learning_rate": 9.662162162162164e-06, + "loss": 0.989, + "step": 429 + }, + { + "epoch": 0.29093369418132614, + "grad_norm": 4.042946064561824, + "learning_rate": 9.684684684684685e-06, + "loss": 0.9692, + "step": 430 + }, + { + "epoch": 0.29161028416779433, + "grad_norm": 4.091640101512462, + "learning_rate": 9.707207207207208e-06, + "loss": 0.9442, + "step": 431 + }, + { + "epoch": 0.2922868741542625, + "grad_norm": 4.049936658082999, + "learning_rate": 9.729729729729732e-06, + "loss": 1.0213, + "step": 432 + }, + { + "epoch": 0.2929634641407307, + "grad_norm": 3.6882609568257134, + "learning_rate": 9.752252252252253e-06, + "loss": 0.9687, + "step": 433 + }, + { + "epoch": 0.2936400541271989, + "grad_norm": 3.9102149602955025, + "learning_rate": 9.774774774774776e-06, + "loss": 0.941, + "step": 434 + }, + { + "epoch": 0.2943166441136671, + "grad_norm": 4.223777918374561, + "learning_rate": 9.797297297297298e-06, + "loss": 0.9704, + "step": 435 + }, + { + "epoch": 0.29499323410013534, + "grad_norm": 3.8011374914119425, + "learning_rate": 9.81981981981982e-06, + "loss": 0.9702, + "step": 436 + }, + { + "epoch": 0.2956698240866035, + "grad_norm": 3.7303737377420894, + "learning_rate": 9.842342342342344e-06, + "loss": 0.9378, + "step": 437 + }, + { + "epoch": 0.2963464140730717, + "grad_norm": 3.521072752470763, + "learning_rate": 9.864864864864865e-06, + "loss": 0.9507, + "step": 438 + }, + { + "epoch": 0.2970230040595399, + "grad_norm": 4.316004502684595, + "learning_rate": 9.887387387387388e-06, + "loss": 0.9734, + "step": 439 + }, + { + "epoch": 0.2976995940460081, + "grad_norm": 3.8335521195044318, + "learning_rate": 9.90990990990991e-06, + "loss": 0.9253, + "step": 440 + }, + { + "epoch": 0.29837618403247634, + "grad_norm": 3.665705979258958, + "learning_rate": 9.932432432432433e-06, + "loss": 0.9713, + "step": 441 + }, + { + "epoch": 0.29905277401894453, + "grad_norm": 4.334698014272042, + "learning_rate": 9.954954954954956e-06, + "loss": 0.9083, + "step": 442 + }, + { + "epoch": 0.2997293640054127, + "grad_norm": 3.3447072281334513, + "learning_rate": 9.97747747747748e-06, + "loss": 0.9034, + "step": 443 + }, + { + "epoch": 0.3004059539918809, + "grad_norm": 4.1589594059536985, + "learning_rate": 1e-05, + "loss": 0.9321, + "step": 444 + }, + { + "epoch": 0.3010825439783491, + "grad_norm": 4.1209097446031935, + "learning_rate": 9.999998450134754e-06, + "loss": 0.9187, + "step": 445 + }, + { + "epoch": 0.3017591339648173, + "grad_norm": 3.6342562123201354, + "learning_rate": 9.999993800539971e-06, + "loss": 0.9433, + "step": 446 + }, + { + "epoch": 0.30243572395128554, + "grad_norm": 3.8838265923669137, + "learning_rate": 9.999986051218538e-06, + "loss": 0.9375, + "step": 447 + }, + { + "epoch": 0.30311231393775373, + "grad_norm": 3.8982724762364014, + "learning_rate": 9.999975202175256e-06, + "loss": 0.9194, + "step": 448 + }, + { + "epoch": 0.3037889039242219, + "grad_norm": 4.0073538315187225, + "learning_rate": 9.999961253416853e-06, + "loss": 0.9047, + "step": 449 + }, + { + "epoch": 0.3044654939106901, + "grad_norm": 3.83724752407595, + "learning_rate": 9.999944204951974e-06, + "loss": 0.9214, + "step": 450 + }, + { + "epoch": 0.3051420838971583, + "grad_norm": 3.5702265127516117, + "learning_rate": 9.999924056791192e-06, + "loss": 0.8795, + "step": 451 + }, + { + "epoch": 0.30581867388362655, + "grad_norm": 3.492179005663864, + "learning_rate": 9.999900808946996e-06, + "loss": 0.9131, + "step": 452 + }, + { + "epoch": 0.30649526387009474, + "grad_norm": 3.1745948209546837, + "learning_rate": 9.999874461433796e-06, + "loss": 0.9165, + "step": 453 + }, + { + "epoch": 0.3071718538565629, + "grad_norm": 3.552119904729672, + "learning_rate": 9.999845014267928e-06, + "loss": 0.9205, + "step": 454 + }, + { + "epoch": 0.3078484438430311, + "grad_norm": 3.095271394681658, + "learning_rate": 9.99981246746765e-06, + "loss": 0.8814, + "step": 455 + }, + { + "epoch": 0.3085250338294993, + "grad_norm": 3.5904697324098733, + "learning_rate": 9.999776821053134e-06, + "loss": 0.9115, + "step": 456 + }, + { + "epoch": 0.3092016238159675, + "grad_norm": 3.696586268891529, + "learning_rate": 9.999738075046483e-06, + "loss": 0.9036, + "step": 457 + }, + { + "epoch": 0.30987821380243574, + "grad_norm": 3.666514530641993, + "learning_rate": 9.999696229471716e-06, + "loss": 0.9171, + "step": 458 + }, + { + "epoch": 0.31055480378890393, + "grad_norm": 3.5284682583414426, + "learning_rate": 9.999651284354774e-06, + "loss": 0.8933, + "step": 459 + }, + { + "epoch": 0.3112313937753721, + "grad_norm": 3.55289773993609, + "learning_rate": 9.999603239723524e-06, + "loss": 0.872, + "step": 460 + }, + { + "epoch": 0.3119079837618403, + "grad_norm": 3.2368814708165794, + "learning_rate": 9.999552095607748e-06, + "loss": 0.8605, + "step": 461 + }, + { + "epoch": 0.3125845737483085, + "grad_norm": 3.0334660120605452, + "learning_rate": 9.999497852039152e-06, + "loss": 0.9141, + "step": 462 + }, + { + "epoch": 0.31326116373477675, + "grad_norm": 3.314329217403211, + "learning_rate": 9.999440509051367e-06, + "loss": 0.8968, + "step": 463 + }, + { + "epoch": 0.31393775372124494, + "grad_norm": 3.570487557645587, + "learning_rate": 9.999380066679943e-06, + "loss": 0.9137, + "step": 464 + }, + { + "epoch": 0.31461434370771313, + "grad_norm": 3.2744333929814706, + "learning_rate": 9.999316524962347e-06, + "loss": 0.9098, + "step": 465 + }, + { + "epoch": 0.3152909336941813, + "grad_norm": 3.003532560029087, + "learning_rate": 9.999249883937971e-06, + "loss": 0.8658, + "step": 466 + }, + { + "epoch": 0.3159675236806495, + "grad_norm": 3.1147658376753915, + "learning_rate": 9.999180143648136e-06, + "loss": 0.8807, + "step": 467 + }, + { + "epoch": 0.3166441136671177, + "grad_norm": 3.2904201115134244, + "learning_rate": 9.999107304136068e-06, + "loss": 0.918, + "step": 468 + }, + { + "epoch": 0.31732070365358594, + "grad_norm": 3.346893818987595, + "learning_rate": 9.999031365446932e-06, + "loss": 0.8351, + "step": 469 + }, + { + "epoch": 0.31799729364005414, + "grad_norm": 3.088776815713552, + "learning_rate": 9.9989523276278e-06, + "loss": 0.8829, + "step": 470 + }, + { + "epoch": 0.3186738836265223, + "grad_norm": 4.0936798977412625, + "learning_rate": 9.998870190727674e-06, + "loss": 0.8432, + "step": 471 + }, + { + "epoch": 0.3193504736129905, + "grad_norm": 3.1764697217902946, + "learning_rate": 9.998784954797474e-06, + "loss": 0.8613, + "step": 472 + }, + { + "epoch": 0.3200270635994587, + "grad_norm": 3.193647148522581, + "learning_rate": 9.99869661989004e-06, + "loss": 0.8693, + "step": 473 + }, + { + "epoch": 0.32070365358592695, + "grad_norm": 3.4120440580585303, + "learning_rate": 9.998605186060138e-06, + "loss": 0.8244, + "step": 474 + }, + { + "epoch": 0.32138024357239514, + "grad_norm": 3.0258544107304832, + "learning_rate": 9.998510653364449e-06, + "loss": 0.8517, + "step": 475 + }, + { + "epoch": 0.32205683355886333, + "grad_norm": 3.1034878568318263, + "learning_rate": 9.998413021861581e-06, + "loss": 0.8578, + "step": 476 + }, + { + "epoch": 0.3227334235453315, + "grad_norm": 3.250075725402367, + "learning_rate": 9.998312291612056e-06, + "loss": 0.877, + "step": 477 + }, + { + "epoch": 0.3234100135317997, + "grad_norm": 2.7733424457408034, + "learning_rate": 9.998208462678328e-06, + "loss": 0.8353, + "step": 478 + }, + { + "epoch": 0.32408660351826796, + "grad_norm": 3.0189955639643955, + "learning_rate": 9.998101535124758e-06, + "loss": 0.8852, + "step": 479 + }, + { + "epoch": 0.32476319350473615, + "grad_norm": 2.8949415050560456, + "learning_rate": 9.99799150901764e-06, + "loss": 0.8907, + "step": 480 + }, + { + "epoch": 0.32543978349120434, + "grad_norm": 3.0534310639406885, + "learning_rate": 9.997878384425183e-06, + "loss": 0.8456, + "step": 481 + }, + { + "epoch": 0.3261163734776725, + "grad_norm": 2.855996354930219, + "learning_rate": 9.997762161417517e-06, + "loss": 0.8338, + "step": 482 + }, + { + "epoch": 0.3267929634641407, + "grad_norm": 2.890677997136448, + "learning_rate": 9.997642840066696e-06, + "loss": 0.8582, + "step": 483 + }, + { + "epoch": 0.3274695534506089, + "grad_norm": 2.8955609614653155, + "learning_rate": 9.997520420446694e-06, + "loss": 0.8533, + "step": 484 + }, + { + "epoch": 0.32814614343707715, + "grad_norm": 3.094942114313454, + "learning_rate": 9.9973949026334e-06, + "loss": 0.824, + "step": 485 + }, + { + "epoch": 0.32882273342354534, + "grad_norm": 3.063619152732945, + "learning_rate": 9.99726628670463e-06, + "loss": 0.8555, + "step": 486 + }, + { + "epoch": 0.32949932341001353, + "grad_norm": 2.819488565448912, + "learning_rate": 9.997134572740122e-06, + "loss": 0.8464, + "step": 487 + }, + { + "epoch": 0.3301759133964817, + "grad_norm": 2.5548684405877973, + "learning_rate": 9.996999760821529e-06, + "loss": 0.7959, + "step": 488 + }, + { + "epoch": 0.3308525033829499, + "grad_norm": 2.8786002140608313, + "learning_rate": 9.996861851032426e-06, + "loss": 0.8573, + "step": 489 + }, + { + "epoch": 0.33152909336941816, + "grad_norm": 2.807515168968411, + "learning_rate": 9.996720843458312e-06, + "loss": 0.8447, + "step": 490 + }, + { + "epoch": 0.33220568335588635, + "grad_norm": 2.937361095247826, + "learning_rate": 9.996576738186602e-06, + "loss": 0.8275, + "step": 491 + }, + { + "epoch": 0.33288227334235454, + "grad_norm": 2.7581361587891218, + "learning_rate": 9.996429535306638e-06, + "loss": 0.8625, + "step": 492 + }, + { + "epoch": 0.33355886332882273, + "grad_norm": 3.060203502125078, + "learning_rate": 9.996279234909672e-06, + "loss": 0.8504, + "step": 493 + }, + { + "epoch": 0.3342354533152909, + "grad_norm": 2.9680097813210553, + "learning_rate": 9.996125837088883e-06, + "loss": 0.846, + "step": 494 + }, + { + "epoch": 0.3349120433017591, + "grad_norm": 2.6959945304869635, + "learning_rate": 9.995969341939373e-06, + "loss": 0.8065, + "step": 495 + }, + { + "epoch": 0.33558863328822736, + "grad_norm": 3.177115967659662, + "learning_rate": 9.995809749558159e-06, + "loss": 0.8275, + "step": 496 + }, + { + "epoch": 0.33626522327469555, + "grad_norm": 2.944430539362135, + "learning_rate": 9.995647060044178e-06, + "loss": 0.8373, + "step": 497 + }, + { + "epoch": 0.33694181326116374, + "grad_norm": 2.9290586110244305, + "learning_rate": 9.995481273498291e-06, + "loss": 0.7995, + "step": 498 + }, + { + "epoch": 0.3376184032476319, + "grad_norm": 3.206371405004454, + "learning_rate": 9.995312390023275e-06, + "loss": 0.8409, + "step": 499 + }, + { + "epoch": 0.3382949932341001, + "grad_norm": 2.7223624769282857, + "learning_rate": 9.995140409723831e-06, + "loss": 0.8337, + "step": 500 + }, + { + "epoch": 0.33897158322056836, + "grad_norm": 2.7354940352519708, + "learning_rate": 9.994965332706574e-06, + "loss": 0.8128, + "step": 501 + }, + { + "epoch": 0.33964817320703655, + "grad_norm": 2.742152403931936, + "learning_rate": 9.994787159080046e-06, + "loss": 0.8372, + "step": 502 + }, + { + "epoch": 0.34032476319350474, + "grad_norm": 2.407710662991995, + "learning_rate": 9.994605888954701e-06, + "loss": 0.7752, + "step": 503 + }, + { + "epoch": 0.34100135317997293, + "grad_norm": 2.648539541200312, + "learning_rate": 9.99442152244292e-06, + "loss": 0.8351, + "step": 504 + }, + { + "epoch": 0.3416779431664411, + "grad_norm": 2.7987393621847256, + "learning_rate": 9.994234059658998e-06, + "loss": 0.8253, + "step": 505 + }, + { + "epoch": 0.3423545331529093, + "grad_norm": 2.6611235079805544, + "learning_rate": 9.994043500719155e-06, + "loss": 0.7732, + "step": 506 + }, + { + "epoch": 0.34303112313937756, + "grad_norm": 2.824414736639999, + "learning_rate": 9.993849845741525e-06, + "loss": 0.8232, + "step": 507 + }, + { + "epoch": 0.34370771312584575, + "grad_norm": 3.278643791346545, + "learning_rate": 9.993653094846162e-06, + "loss": 0.833, + "step": 508 + }, + { + "epoch": 0.34438430311231394, + "grad_norm": 2.6755857541883263, + "learning_rate": 9.993453248155044e-06, + "loss": 0.8138, + "step": 509 + }, + { + "epoch": 0.34506089309878213, + "grad_norm": 3.534895888050554, + "learning_rate": 9.993250305792067e-06, + "loss": 0.8327, + "step": 510 + }, + { + "epoch": 0.3457374830852503, + "grad_norm": 2.544304824354249, + "learning_rate": 9.993044267883039e-06, + "loss": 0.7876, + "step": 511 + }, + { + "epoch": 0.34641407307171856, + "grad_norm": 2.9700680528467163, + "learning_rate": 9.992835134555694e-06, + "loss": 0.8245, + "step": 512 + }, + { + "epoch": 0.34709066305818675, + "grad_norm": 2.5626117802153, + "learning_rate": 9.992622905939686e-06, + "loss": 0.8026, + "step": 513 + }, + { + "epoch": 0.34776725304465494, + "grad_norm": 2.687982257062625, + "learning_rate": 9.992407582166582e-06, + "loss": 0.817, + "step": 514 + }, + { + "epoch": 0.34844384303112313, + "grad_norm": 2.682185575359109, + "learning_rate": 9.992189163369873e-06, + "loss": 0.8024, + "step": 515 + }, + { + "epoch": 0.3491204330175913, + "grad_norm": 2.621193747220266, + "learning_rate": 9.991967649684967e-06, + "loss": 0.7859, + "step": 516 + }, + { + "epoch": 0.3497970230040595, + "grad_norm": 2.6783740060221803, + "learning_rate": 9.99174304124919e-06, + "loss": 0.7966, + "step": 517 + }, + { + "epoch": 0.35047361299052776, + "grad_norm": 2.638660466868078, + "learning_rate": 9.991515338201787e-06, + "loss": 0.8155, + "step": 518 + }, + { + "epoch": 0.35115020297699595, + "grad_norm": 2.6796352812176827, + "learning_rate": 9.991284540683922e-06, + "loss": 0.8187, + "step": 519 + }, + { + "epoch": 0.35182679296346414, + "grad_norm": 2.4636478517253098, + "learning_rate": 9.991050648838676e-06, + "loss": 0.79, + "step": 520 + }, + { + "epoch": 0.35250338294993233, + "grad_norm": 2.5132501947626134, + "learning_rate": 9.990813662811052e-06, + "loss": 0.8107, + "step": 521 + }, + { + "epoch": 0.3531799729364005, + "grad_norm": 2.3102149207906657, + "learning_rate": 9.990573582747965e-06, + "loss": 0.7221, + "step": 522 + }, + { + "epoch": 0.35385656292286877, + "grad_norm": 2.8388086625182076, + "learning_rate": 9.990330408798255e-06, + "loss": 0.8176, + "step": 523 + }, + { + "epoch": 0.35453315290933696, + "grad_norm": 2.547424702471631, + "learning_rate": 9.990084141112674e-06, + "loss": 0.7757, + "step": 524 + }, + { + "epoch": 0.35520974289580515, + "grad_norm": 2.459038671782621, + "learning_rate": 9.989834779843895e-06, + "loss": 0.7571, + "step": 525 + }, + { + "epoch": 0.35588633288227334, + "grad_norm": 2.343072915349538, + "learning_rate": 9.989582325146511e-06, + "loss": 0.7588, + "step": 526 + }, + { + "epoch": 0.3565629228687415, + "grad_norm": 2.6318457433995643, + "learning_rate": 9.98932677717703e-06, + "loss": 0.7863, + "step": 527 + }, + { + "epoch": 0.3572395128552097, + "grad_norm": 2.841829010674145, + "learning_rate": 9.989068136093873e-06, + "loss": 0.8111, + "step": 528 + }, + { + "epoch": 0.35791610284167796, + "grad_norm": 2.798257035023864, + "learning_rate": 9.98880640205739e-06, + "loss": 0.8038, + "step": 529 + }, + { + "epoch": 0.35859269282814615, + "grad_norm": 2.7565105246706367, + "learning_rate": 9.988541575229837e-06, + "loss": 0.7761, + "step": 530 + }, + { + "epoch": 0.35926928281461434, + "grad_norm": 2.6356624492568357, + "learning_rate": 9.988273655775398e-06, + "loss": 0.8054, + "step": 531 + }, + { + "epoch": 0.35994587280108253, + "grad_norm": 2.6569910293091574, + "learning_rate": 9.988002643860162e-06, + "loss": 0.7642, + "step": 532 + }, + { + "epoch": 0.3606224627875507, + "grad_norm": 2.779551959212093, + "learning_rate": 9.987728539652145e-06, + "loss": 0.7719, + "step": 533 + }, + { + "epoch": 0.36129905277401897, + "grad_norm": 2.821073101690452, + "learning_rate": 9.98745134332128e-06, + "loss": 0.723, + "step": 534 + }, + { + "epoch": 0.36197564276048716, + "grad_norm": 2.665438624418873, + "learning_rate": 9.987171055039409e-06, + "loss": 0.8325, + "step": 535 + }, + { + "epoch": 0.36265223274695535, + "grad_norm": 2.602468827659332, + "learning_rate": 9.986887674980297e-06, + "loss": 0.7768, + "step": 536 + }, + { + "epoch": 0.36332882273342354, + "grad_norm": 2.5104464519095866, + "learning_rate": 9.986601203319623e-06, + "loss": 0.735, + "step": 537 + }, + { + "epoch": 0.36400541271989173, + "grad_norm": 2.4650931692682723, + "learning_rate": 9.986311640234988e-06, + "loss": 0.7359, + "step": 538 + }, + { + "epoch": 0.3646820027063599, + "grad_norm": 2.266836893273909, + "learning_rate": 9.986018985905901e-06, + "loss": 0.7646, + "step": 539 + }, + { + "epoch": 0.36535859269282817, + "grad_norm": 2.3159717821800627, + "learning_rate": 9.985723240513795e-06, + "loss": 0.7479, + "step": 540 + }, + { + "epoch": 0.36603518267929636, + "grad_norm": 2.3267215054412, + "learning_rate": 9.985424404242015e-06, + "loss": 0.7429, + "step": 541 + }, + { + "epoch": 0.36671177266576455, + "grad_norm": 2.445693725429499, + "learning_rate": 9.985122477275824e-06, + "loss": 0.7349, + "step": 542 + }, + { + "epoch": 0.36738836265223274, + "grad_norm": 2.4198599886280623, + "learning_rate": 9.9848174598024e-06, + "loss": 0.7323, + "step": 543 + }, + { + "epoch": 0.3680649526387009, + "grad_norm": 2.2812273337003917, + "learning_rate": 9.984509352010839e-06, + "loss": 0.6918, + "step": 544 + }, + { + "epoch": 0.36874154262516917, + "grad_norm": 2.2654499896491487, + "learning_rate": 9.984198154092147e-06, + "loss": 0.7169, + "step": 545 + }, + { + "epoch": 0.36941813261163736, + "grad_norm": 2.2119471183099235, + "learning_rate": 9.983883866239253e-06, + "loss": 0.7525, + "step": 546 + }, + { + "epoch": 0.37009472259810555, + "grad_norm": 2.239396967454031, + "learning_rate": 9.983566488647e-06, + "loss": 0.7633, + "step": 547 + }, + { + "epoch": 0.37077131258457374, + "grad_norm": 2.3671195766363646, + "learning_rate": 9.98324602151214e-06, + "loss": 0.7287, + "step": 548 + }, + { + "epoch": 0.37144790257104193, + "grad_norm": 2.3516713935587545, + "learning_rate": 9.98292246503335e-06, + "loss": 0.7845, + "step": 549 + }, + { + "epoch": 0.3721244925575101, + "grad_norm": 2.5614066335000887, + "learning_rate": 9.982595819411216e-06, + "loss": 0.772, + "step": 550 + }, + { + "epoch": 0.37280108254397837, + "grad_norm": 2.268730726420046, + "learning_rate": 9.98226608484824e-06, + "loss": 0.7265, + "step": 551 + }, + { + "epoch": 0.37347767253044656, + "grad_norm": 2.4673866865308485, + "learning_rate": 9.981933261548841e-06, + "loss": 0.7357, + "step": 552 + }, + { + "epoch": 0.37415426251691475, + "grad_norm": 2.296525880694683, + "learning_rate": 9.981597349719351e-06, + "loss": 0.7124, + "step": 553 + }, + { + "epoch": 0.37483085250338294, + "grad_norm": 2.265625643278614, + "learning_rate": 9.981258349568018e-06, + "loss": 0.7373, + "step": 554 + }, + { + "epoch": 0.37550744248985113, + "grad_norm": 2.2696703683486157, + "learning_rate": 9.980916261305002e-06, + "loss": 0.7247, + "step": 555 + }, + { + "epoch": 0.3761840324763194, + "grad_norm": 2.177261495035988, + "learning_rate": 9.980571085142381e-06, + "loss": 0.747, + "step": 556 + }, + { + "epoch": 0.37686062246278756, + "grad_norm": 2.362176160288204, + "learning_rate": 9.980222821294143e-06, + "loss": 0.7246, + "step": 557 + }, + { + "epoch": 0.37753721244925575, + "grad_norm": 2.1022685100770646, + "learning_rate": 9.979871469976197e-06, + "loss": 0.6982, + "step": 558 + }, + { + "epoch": 0.37821380243572394, + "grad_norm": 2.236688754096613, + "learning_rate": 9.979517031406357e-06, + "loss": 0.7417, + "step": 559 + }, + { + "epoch": 0.37889039242219213, + "grad_norm": 2.1466742469590834, + "learning_rate": 9.97915950580436e-06, + "loss": 0.7377, + "step": 560 + }, + { + "epoch": 0.3795669824086603, + "grad_norm": 2.182650476403489, + "learning_rate": 9.97879889339185e-06, + "loss": 0.7209, + "step": 561 + }, + { + "epoch": 0.38024357239512857, + "grad_norm": 2.1526694838589755, + "learning_rate": 9.97843519439239e-06, + "loss": 0.7522, + "step": 562 + }, + { + "epoch": 0.38092016238159676, + "grad_norm": 2.1371520397539943, + "learning_rate": 9.978068409031449e-06, + "loss": 0.7189, + "step": 563 + }, + { + "epoch": 0.38159675236806495, + "grad_norm": 2.102484057128259, + "learning_rate": 9.97769853753642e-06, + "loss": 0.7035, + "step": 564 + }, + { + "epoch": 0.38227334235453314, + "grad_norm": 2.1862049962798777, + "learning_rate": 9.977325580136598e-06, + "loss": 0.7131, + "step": 565 + }, + { + "epoch": 0.38294993234100133, + "grad_norm": 2.366449337575754, + "learning_rate": 9.9769495370632e-06, + "loss": 0.7405, + "step": 566 + }, + { + "epoch": 0.3836265223274696, + "grad_norm": 2.088294738417784, + "learning_rate": 9.97657040854935e-06, + "loss": 0.6992, + "step": 567 + }, + { + "epoch": 0.38430311231393777, + "grad_norm": 2.309847902881155, + "learning_rate": 9.976188194830092e-06, + "loss": 0.7349, + "step": 568 + }, + { + "epoch": 0.38497970230040596, + "grad_norm": 2.1445079732726624, + "learning_rate": 9.975802896142373e-06, + "loss": 0.7408, + "step": 569 + }, + { + "epoch": 0.38565629228687415, + "grad_norm": 2.239660655657111, + "learning_rate": 9.975414512725058e-06, + "loss": 0.7203, + "step": 570 + }, + { + "epoch": 0.38633288227334234, + "grad_norm": 2.1721580645053273, + "learning_rate": 9.975023044818925e-06, + "loss": 0.7365, + "step": 571 + }, + { + "epoch": 0.3870094722598105, + "grad_norm": 2.1101165945964455, + "learning_rate": 9.974628492666664e-06, + "loss": 0.6948, + "step": 572 + }, + { + "epoch": 0.3876860622462788, + "grad_norm": 2.412564136594261, + "learning_rate": 9.974230856512874e-06, + "loss": 0.7622, + "step": 573 + }, + { + "epoch": 0.38836265223274696, + "grad_norm": 2.2856492277962097, + "learning_rate": 9.973830136604068e-06, + "loss": 0.728, + "step": 574 + }, + { + "epoch": 0.38903924221921515, + "grad_norm": 2.180935064812689, + "learning_rate": 9.973426333188673e-06, + "loss": 0.7116, + "step": 575 + }, + { + "epoch": 0.38971583220568334, + "grad_norm": 2.276771558917313, + "learning_rate": 9.973019446517023e-06, + "loss": 0.7363, + "step": 576 + }, + { + "epoch": 0.39039242219215153, + "grad_norm": 2.0804479250377907, + "learning_rate": 9.972609476841368e-06, + "loss": 0.729, + "step": 577 + }, + { + "epoch": 0.3910690121786198, + "grad_norm": 2.293959982563185, + "learning_rate": 9.972196424415865e-06, + "loss": 0.7168, + "step": 578 + }, + { + "epoch": 0.39174560216508797, + "grad_norm": 2.019436821032936, + "learning_rate": 9.971780289496585e-06, + "loss": 0.715, + "step": 579 + }, + { + "epoch": 0.39242219215155616, + "grad_norm": 2.2784237326655625, + "learning_rate": 9.971361072341509e-06, + "loss": 0.7036, + "step": 580 + }, + { + "epoch": 0.39309878213802435, + "grad_norm": 2.059377297470316, + "learning_rate": 9.97093877321053e-06, + "loss": 0.721, + "step": 581 + }, + { + "epoch": 0.39377537212449254, + "grad_norm": 2.139362525141615, + "learning_rate": 9.970513392365449e-06, + "loss": 0.715, + "step": 582 + }, + { + "epoch": 0.3944519621109608, + "grad_norm": 2.0176705974240003, + "learning_rate": 9.970084930069982e-06, + "loss": 0.7114, + "step": 583 + }, + { + "epoch": 0.395128552097429, + "grad_norm": 2.0092807332993137, + "learning_rate": 9.969653386589749e-06, + "loss": 0.6947, + "step": 584 + }, + { + "epoch": 0.39580514208389717, + "grad_norm": 2.1129018989942683, + "learning_rate": 9.969218762192286e-06, + "loss": 0.7647, + "step": 585 + }, + { + "epoch": 0.39648173207036536, + "grad_norm": 2.1316606162902563, + "learning_rate": 9.968781057147036e-06, + "loss": 0.7023, + "step": 586 + }, + { + "epoch": 0.39715832205683355, + "grad_norm": 2.232107359256458, + "learning_rate": 9.968340271725352e-06, + "loss": 0.7147, + "step": 587 + }, + { + "epoch": 0.39783491204330174, + "grad_norm": 2.0234712969002477, + "learning_rate": 9.967896406200498e-06, + "loss": 0.7091, + "step": 588 + }, + { + "epoch": 0.39851150202977, + "grad_norm": 2.141612227613064, + "learning_rate": 9.967449460847648e-06, + "loss": 0.7107, + "step": 589 + }, + { + "epoch": 0.39918809201623817, + "grad_norm": 2.0552325338452735, + "learning_rate": 9.966999435943882e-06, + "loss": 0.682, + "step": 590 + }, + { + "epoch": 0.39986468200270636, + "grad_norm": 2.265846296289829, + "learning_rate": 9.966546331768192e-06, + "loss": 0.7215, + "step": 591 + }, + { + "epoch": 0.40054127198917455, + "grad_norm": 2.2436402593988833, + "learning_rate": 9.966090148601477e-06, + "loss": 0.6817, + "step": 592 + }, + { + "epoch": 0.40121786197564274, + "grad_norm": 2.179910678350655, + "learning_rate": 9.965630886726548e-06, + "loss": 0.7052, + "step": 593 + }, + { + "epoch": 0.401894451962111, + "grad_norm": 2.48734340383341, + "learning_rate": 9.965168546428122e-06, + "loss": 0.714, + "step": 594 + }, + { + "epoch": 0.4025710419485792, + "grad_norm": 2.163089894373313, + "learning_rate": 9.964703127992822e-06, + "loss": 0.69, + "step": 595 + }, + { + "epoch": 0.40324763193504737, + "grad_norm": 2.4952934205498045, + "learning_rate": 9.964234631709188e-06, + "loss": 0.7057, + "step": 596 + }, + { + "epoch": 0.40392422192151556, + "grad_norm": 2.515865519983522, + "learning_rate": 9.963763057867658e-06, + "loss": 0.7149, + "step": 597 + }, + { + "epoch": 0.40460081190798375, + "grad_norm": 2.0215440093353227, + "learning_rate": 9.963288406760584e-06, + "loss": 0.6926, + "step": 598 + }, + { + "epoch": 0.40527740189445194, + "grad_norm": 2.1935710081830577, + "learning_rate": 9.962810678682223e-06, + "loss": 0.7007, + "step": 599 + }, + { + "epoch": 0.4059539918809202, + "grad_norm": 2.027744782273937, + "learning_rate": 9.962329873928743e-06, + "loss": 0.7042, + "step": 600 + }, + { + "epoch": 0.4066305818673884, + "grad_norm": 2.034332358536131, + "learning_rate": 9.961845992798213e-06, + "loss": 0.7381, + "step": 601 + }, + { + "epoch": 0.40730717185385656, + "grad_norm": 2.0084286006292515, + "learning_rate": 9.961359035590619e-06, + "loss": 0.6834, + "step": 602 + }, + { + "epoch": 0.40798376184032475, + "grad_norm": 2.362532652474117, + "learning_rate": 9.960869002607843e-06, + "loss": 0.7133, + "step": 603 + }, + { + "epoch": 0.40866035182679294, + "grad_norm": 2.100407719983761, + "learning_rate": 9.960375894153682e-06, + "loss": 0.6868, + "step": 604 + }, + { + "epoch": 0.4093369418132612, + "grad_norm": 2.1368565789833265, + "learning_rate": 9.959879710533835e-06, + "loss": 0.6934, + "step": 605 + }, + { + "epoch": 0.4100135317997294, + "grad_norm": 2.0566631156288375, + "learning_rate": 9.959380452055909e-06, + "loss": 0.6786, + "step": 606 + }, + { + "epoch": 0.41069012178619757, + "grad_norm": 1.9593553918937765, + "learning_rate": 9.958878119029419e-06, + "loss": 0.7015, + "step": 607 + }, + { + "epoch": 0.41136671177266576, + "grad_norm": 2.018785216368903, + "learning_rate": 9.958372711765785e-06, + "loss": 0.6794, + "step": 608 + }, + { + "epoch": 0.41204330175913395, + "grad_norm": 1.9075054250954835, + "learning_rate": 9.95786423057833e-06, + "loss": 0.6691, + "step": 609 + }, + { + "epoch": 0.41271989174560214, + "grad_norm": 2.0871748298241766, + "learning_rate": 9.957352675782283e-06, + "loss": 0.6762, + "step": 610 + }, + { + "epoch": 0.4133964817320704, + "grad_norm": 1.9188715246488623, + "learning_rate": 9.956838047694785e-06, + "loss": 0.6627, + "step": 611 + }, + { + "epoch": 0.4140730717185386, + "grad_norm": 1.9847524523392805, + "learning_rate": 9.956320346634877e-06, + "loss": 0.6884, + "step": 612 + }, + { + "epoch": 0.41474966170500677, + "grad_norm": 2.043064857643346, + "learning_rate": 9.955799572923503e-06, + "loss": 0.6711, + "step": 613 + }, + { + "epoch": 0.41542625169147496, + "grad_norm": 2.0325247485054296, + "learning_rate": 9.955275726883517e-06, + "loss": 0.6838, + "step": 614 + }, + { + "epoch": 0.41610284167794315, + "grad_norm": 1.8907423347407817, + "learning_rate": 9.954748808839675e-06, + "loss": 0.6816, + "step": 615 + }, + { + "epoch": 0.4167794316644114, + "grad_norm": 2.1406953226662515, + "learning_rate": 9.954218819118636e-06, + "loss": 0.6949, + "step": 616 + }, + { + "epoch": 0.4174560216508796, + "grad_norm": 2.001786047271728, + "learning_rate": 9.953685758048968e-06, + "loss": 0.6815, + "step": 617 + }, + { + "epoch": 0.4181326116373478, + "grad_norm": 1.929934647977257, + "learning_rate": 9.953149625961136e-06, + "loss": 0.6852, + "step": 618 + }, + { + "epoch": 0.41880920162381596, + "grad_norm": 2.0589536480408883, + "learning_rate": 9.952610423187516e-06, + "loss": 0.6902, + "step": 619 + }, + { + "epoch": 0.41948579161028415, + "grad_norm": 1.9996210703499524, + "learning_rate": 9.952068150062386e-06, + "loss": 0.6707, + "step": 620 + }, + { + "epoch": 0.42016238159675234, + "grad_norm": 2.0134316880601206, + "learning_rate": 9.951522806921922e-06, + "loss": 0.683, + "step": 621 + }, + { + "epoch": 0.4208389715832206, + "grad_norm": 1.86633078703543, + "learning_rate": 9.95097439410421e-06, + "loss": 0.641, + "step": 622 + }, + { + "epoch": 0.4215155615696888, + "grad_norm": 2.2129596018316136, + "learning_rate": 9.950422911949238e-06, + "loss": 0.6932, + "step": 623 + }, + { + "epoch": 0.42219215155615697, + "grad_norm": 1.944116798384578, + "learning_rate": 9.949868360798893e-06, + "loss": 0.7012, + "step": 624 + }, + { + "epoch": 0.42286874154262516, + "grad_norm": 1.9308190608146374, + "learning_rate": 9.949310740996964e-06, + "loss": 0.6716, + "step": 625 + }, + { + "epoch": 0.42354533152909335, + "grad_norm": 1.782688098228263, + "learning_rate": 9.94875005288915e-06, + "loss": 0.674, + "step": 626 + }, + { + "epoch": 0.4242219215155616, + "grad_norm": 2.1653710361638945, + "learning_rate": 9.948186296823048e-06, + "loss": 0.6705, + "step": 627 + }, + { + "epoch": 0.4248985115020298, + "grad_norm": 1.9491818637213443, + "learning_rate": 9.947619473148152e-06, + "loss": 0.6792, + "step": 628 + }, + { + "epoch": 0.425575101488498, + "grad_norm": 1.995741143847466, + "learning_rate": 9.947049582215862e-06, + "loss": 0.7007, + "step": 629 + }, + { + "epoch": 0.42625169147496617, + "grad_norm": 1.8127064040211225, + "learning_rate": 9.946476624379485e-06, + "loss": 0.6555, + "step": 630 + }, + { + "epoch": 0.42692828146143436, + "grad_norm": 2.002480955111533, + "learning_rate": 9.945900599994219e-06, + "loss": 0.6943, + "step": 631 + }, + { + "epoch": 0.42760487144790255, + "grad_norm": 2.0318946970414564, + "learning_rate": 9.94532150941717e-06, + "loss": 0.6629, + "step": 632 + }, + { + "epoch": 0.4282814614343708, + "grad_norm": 1.9877226730218924, + "learning_rate": 9.944739353007344e-06, + "loss": 0.6803, + "step": 633 + }, + { + "epoch": 0.428958051420839, + "grad_norm": 1.809109992463502, + "learning_rate": 9.944154131125643e-06, + "loss": 0.6559, + "step": 634 + }, + { + "epoch": 0.42963464140730717, + "grad_norm": 1.997159047981803, + "learning_rate": 9.943565844134877e-06, + "loss": 0.6867, + "step": 635 + }, + { + "epoch": 0.43031123139377536, + "grad_norm": 1.810602576207129, + "learning_rate": 9.942974492399751e-06, + "loss": 0.6702, + "step": 636 + }, + { + "epoch": 0.43098782138024355, + "grad_norm": 2.0451887760889322, + "learning_rate": 9.94238007628687e-06, + "loss": 0.6728, + "step": 637 + }, + { + "epoch": 0.4316644113667118, + "grad_norm": 1.9773424679166982, + "learning_rate": 9.94178259616474e-06, + "loss": 0.6554, + "step": 638 + }, + { + "epoch": 0.43234100135318, + "grad_norm": 1.983323900570349, + "learning_rate": 9.941182052403768e-06, + "loss": 0.6753, + "step": 639 + }, + { + "epoch": 0.4330175913396482, + "grad_norm": 1.9549786355670833, + "learning_rate": 9.940578445376259e-06, + "loss": 0.6905, + "step": 640 + }, + { + "epoch": 0.43369418132611637, + "grad_norm": 1.997179151236007, + "learning_rate": 9.939971775456416e-06, + "loss": 0.6847, + "step": 641 + }, + { + "epoch": 0.43437077131258456, + "grad_norm": 1.8413656351114143, + "learning_rate": 9.93936204302034e-06, + "loss": 0.6888, + "step": 642 + }, + { + "epoch": 0.43504736129905275, + "grad_norm": 1.8738449595961248, + "learning_rate": 9.938749248446033e-06, + "loss": 0.6413, + "step": 643 + }, + { + "epoch": 0.435723951285521, + "grad_norm": 1.9154050527684428, + "learning_rate": 9.938133392113399e-06, + "loss": 0.6786, + "step": 644 + }, + { + "epoch": 0.4364005412719892, + "grad_norm": 1.9692984990498263, + "learning_rate": 9.937514474404229e-06, + "loss": 0.652, + "step": 645 + }, + { + "epoch": 0.4370771312584574, + "grad_norm": 1.8363218558877794, + "learning_rate": 9.936892495702222e-06, + "loss": 0.6318, + "step": 646 + }, + { + "epoch": 0.43775372124492556, + "grad_norm": 1.8058194378723227, + "learning_rate": 9.936267456392971e-06, + "loss": 0.6534, + "step": 647 + }, + { + "epoch": 0.43843031123139375, + "grad_norm": 1.9353376187771927, + "learning_rate": 9.935639356863966e-06, + "loss": 0.6969, + "step": 648 + }, + { + "epoch": 0.439106901217862, + "grad_norm": 1.8956154103192093, + "learning_rate": 9.935008197504596e-06, + "loss": 0.6521, + "step": 649 + }, + { + "epoch": 0.4397834912043302, + "grad_norm": 1.7924097574849651, + "learning_rate": 9.934373978706147e-06, + "loss": 0.604, + "step": 650 + }, + { + "epoch": 0.4404600811907984, + "grad_norm": 1.9110970217222203, + "learning_rate": 9.933736700861798e-06, + "loss": 0.6808, + "step": 651 + }, + { + "epoch": 0.44113667117726657, + "grad_norm": 1.7458422006902716, + "learning_rate": 9.933096364366625e-06, + "loss": 0.6512, + "step": 652 + }, + { + "epoch": 0.44181326116373476, + "grad_norm": 1.7984905753531888, + "learning_rate": 9.932452969617607e-06, + "loss": 0.6278, + "step": 653 + }, + { + "epoch": 0.44248985115020295, + "grad_norm": 1.8244577525823302, + "learning_rate": 9.931806517013612e-06, + "loss": 0.643, + "step": 654 + }, + { + "epoch": 0.4431664411366712, + "grad_norm": 1.8462614021324941, + "learning_rate": 9.931157006955406e-06, + "loss": 0.6822, + "step": 655 + }, + { + "epoch": 0.4438430311231394, + "grad_norm": 1.8237762592102016, + "learning_rate": 9.93050443984565e-06, + "loss": 0.6549, + "step": 656 + }, + { + "epoch": 0.4445196211096076, + "grad_norm": 2.0264274302521432, + "learning_rate": 9.929848816088898e-06, + "loss": 0.6764, + "step": 657 + }, + { + "epoch": 0.44519621109607577, + "grad_norm": 1.9028989377044923, + "learning_rate": 9.929190136091604e-06, + "loss": 0.6619, + "step": 658 + }, + { + "epoch": 0.44587280108254396, + "grad_norm": 1.9349232488713752, + "learning_rate": 9.928528400262116e-06, + "loss": 0.6592, + "step": 659 + }, + { + "epoch": 0.4465493910690122, + "grad_norm": 1.8586228478591957, + "learning_rate": 9.92786360901067e-06, + "loss": 0.6867, + "step": 660 + }, + { + "epoch": 0.4472259810554804, + "grad_norm": 1.7870259169553473, + "learning_rate": 9.927195762749405e-06, + "loss": 0.6487, + "step": 661 + }, + { + "epoch": 0.4479025710419486, + "grad_norm": 1.8126776620578224, + "learning_rate": 9.926524861892346e-06, + "loss": 0.6567, + "step": 662 + }, + { + "epoch": 0.4485791610284168, + "grad_norm": 1.8737868617314668, + "learning_rate": 9.925850906855419e-06, + "loss": 0.6614, + "step": 663 + }, + { + "epoch": 0.44925575101488496, + "grad_norm": 1.7713697769223755, + "learning_rate": 9.925173898056436e-06, + "loss": 0.6278, + "step": 664 + }, + { + "epoch": 0.44993234100135315, + "grad_norm": 1.9092961289094104, + "learning_rate": 9.924493835915108e-06, + "loss": 0.6668, + "step": 665 + }, + { + "epoch": 0.4506089309878214, + "grad_norm": 1.7496215142400025, + "learning_rate": 9.923810720853038e-06, + "loss": 0.6373, + "step": 666 + }, + { + "epoch": 0.4512855209742896, + "grad_norm": 1.883308707797926, + "learning_rate": 9.923124553293718e-06, + "loss": 0.6623, + "step": 667 + }, + { + "epoch": 0.4519621109607578, + "grad_norm": 1.8381070698311122, + "learning_rate": 9.922435333662537e-06, + "loss": 0.6577, + "step": 668 + }, + { + "epoch": 0.45263870094722597, + "grad_norm": 1.8134254950348916, + "learning_rate": 9.921743062386773e-06, + "loss": 0.6799, + "step": 669 + }, + { + "epoch": 0.45331529093369416, + "grad_norm": 1.8177625611343793, + "learning_rate": 9.921047739895596e-06, + "loss": 0.6206, + "step": 670 + }, + { + "epoch": 0.4539918809201624, + "grad_norm": 1.916375693161388, + "learning_rate": 9.92034936662007e-06, + "loss": 0.6466, + "step": 671 + }, + { + "epoch": 0.4546684709066306, + "grad_norm": 1.8666278331742292, + "learning_rate": 9.91964794299315e-06, + "loss": 0.6472, + "step": 672 + }, + { + "epoch": 0.4553450608930988, + "grad_norm": 1.841159739003564, + "learning_rate": 9.918943469449676e-06, + "loss": 0.6738, + "step": 673 + }, + { + "epoch": 0.456021650879567, + "grad_norm": 1.931039343691473, + "learning_rate": 9.918235946426389e-06, + "loss": 0.6464, + "step": 674 + }, + { + "epoch": 0.45669824086603517, + "grad_norm": 1.6913206369911358, + "learning_rate": 9.917525374361913e-06, + "loss": 0.6387, + "step": 675 + }, + { + "epoch": 0.45737483085250336, + "grad_norm": 1.9636634964730848, + "learning_rate": 9.916811753696764e-06, + "loss": 0.6377, + "step": 676 + }, + { + "epoch": 0.4580514208389716, + "grad_norm": 1.7872712718739638, + "learning_rate": 9.916095084873348e-06, + "loss": 0.646, + "step": 677 + }, + { + "epoch": 0.4587280108254398, + "grad_norm": 1.6642630676043548, + "learning_rate": 9.915375368335962e-06, + "loss": 0.5887, + "step": 678 + }, + { + "epoch": 0.459404600811908, + "grad_norm": 1.810919197529432, + "learning_rate": 9.91465260453079e-06, + "loss": 0.6268, + "step": 679 + }, + { + "epoch": 0.46008119079837617, + "grad_norm": 1.7866371130807348, + "learning_rate": 9.913926793905909e-06, + "loss": 0.6155, + "step": 680 + }, + { + "epoch": 0.46075778078484436, + "grad_norm": 1.738555445668848, + "learning_rate": 9.91319793691128e-06, + "loss": 0.6722, + "step": 681 + }, + { + "epoch": 0.4614343707713126, + "grad_norm": 1.8542733755959186, + "learning_rate": 9.912466033998758e-06, + "loss": 0.6604, + "step": 682 + }, + { + "epoch": 0.4621109607577808, + "grad_norm": 1.6243959213835935, + "learning_rate": 9.91173108562208e-06, + "loss": 0.6177, + "step": 683 + }, + { + "epoch": 0.462787550744249, + "grad_norm": 1.7088849631880303, + "learning_rate": 9.910993092236878e-06, + "loss": 0.6321, + "step": 684 + }, + { + "epoch": 0.4634641407307172, + "grad_norm": 1.744776998203165, + "learning_rate": 9.910252054300664e-06, + "loss": 0.6234, + "step": 685 + }, + { + "epoch": 0.46414073071718537, + "grad_norm": 1.713082320412285, + "learning_rate": 9.909507972272845e-06, + "loss": 0.6204, + "step": 686 + }, + { + "epoch": 0.4648173207036536, + "grad_norm": 1.7906364998078443, + "learning_rate": 9.90876084661471e-06, + "loss": 0.6279, + "step": 687 + }, + { + "epoch": 0.4654939106901218, + "grad_norm": 1.7059545171919712, + "learning_rate": 9.908010677789437e-06, + "loss": 0.6217, + "step": 688 + }, + { + "epoch": 0.46617050067659, + "grad_norm": 1.7506382698310816, + "learning_rate": 9.90725746626209e-06, + "loss": 0.6492, + "step": 689 + }, + { + "epoch": 0.4668470906630582, + "grad_norm": 1.7765082721148218, + "learning_rate": 9.90650121249962e-06, + "loss": 0.6602, + "step": 690 + }, + { + "epoch": 0.4675236806495264, + "grad_norm": 1.6752513183319067, + "learning_rate": 9.905741916970863e-06, + "loss": 0.6338, + "step": 691 + }, + { + "epoch": 0.46820027063599456, + "grad_norm": 1.7186093366528268, + "learning_rate": 9.904979580146544e-06, + "loss": 0.5968, + "step": 692 + }, + { + "epoch": 0.4688768606224628, + "grad_norm": 1.665313992607149, + "learning_rate": 9.904214202499266e-06, + "loss": 0.6499, + "step": 693 + }, + { + "epoch": 0.469553450608931, + "grad_norm": 1.716744851695661, + "learning_rate": 9.903445784503525e-06, + "loss": 0.6388, + "step": 694 + }, + { + "epoch": 0.4702300405953992, + "grad_norm": 1.7245973304097408, + "learning_rate": 9.902674326635698e-06, + "loss": 0.6164, + "step": 695 + }, + { + "epoch": 0.4709066305818674, + "grad_norm": 1.7252557740173053, + "learning_rate": 9.901899829374048e-06, + "loss": 0.6271, + "step": 696 + }, + { + "epoch": 0.47158322056833557, + "grad_norm": 1.7400065832286413, + "learning_rate": 9.90112229319872e-06, + "loss": 0.6115, + "step": 697 + }, + { + "epoch": 0.4722598105548038, + "grad_norm": 1.7879186578579442, + "learning_rate": 9.900341718591746e-06, + "loss": 0.6612, + "step": 698 + }, + { + "epoch": 0.472936400541272, + "grad_norm": 1.7582531777040444, + "learning_rate": 9.899558106037039e-06, + "loss": 0.6318, + "step": 699 + }, + { + "epoch": 0.4736129905277402, + "grad_norm": 1.7056978103836629, + "learning_rate": 9.898771456020397e-06, + "loss": 0.6496, + "step": 700 + }, + { + "epoch": 0.4742895805142084, + "grad_norm": 1.8130704404644866, + "learning_rate": 9.897981769029504e-06, + "loss": 0.6539, + "step": 701 + }, + { + "epoch": 0.4749661705006766, + "grad_norm": 1.6519286721992499, + "learning_rate": 9.897189045553917e-06, + "loss": 0.6403, + "step": 702 + }, + { + "epoch": 0.47564276048714477, + "grad_norm": 1.662796737313033, + "learning_rate": 9.896393286085085e-06, + "loss": 0.6284, + "step": 703 + }, + { + "epoch": 0.476319350473613, + "grad_norm": 1.6513492760188824, + "learning_rate": 9.895594491116336e-06, + "loss": 0.6105, + "step": 704 + }, + { + "epoch": 0.4769959404600812, + "grad_norm": 1.815000983504719, + "learning_rate": 9.89479266114288e-06, + "loss": 0.6721, + "step": 705 + }, + { + "epoch": 0.4776725304465494, + "grad_norm": 1.7292582267683765, + "learning_rate": 9.893987796661809e-06, + "loss": 0.6248, + "step": 706 + }, + { + "epoch": 0.4783491204330176, + "grad_norm": 1.7289472357290376, + "learning_rate": 9.893179898172095e-06, + "loss": 0.6493, + "step": 707 + }, + { + "epoch": 0.4790257104194858, + "grad_norm": 1.6593555289396826, + "learning_rate": 9.89236896617459e-06, + "loss": 0.5745, + "step": 708 + }, + { + "epoch": 0.479702300405954, + "grad_norm": 1.672847157894918, + "learning_rate": 9.891555001172032e-06, + "loss": 0.6026, + "step": 709 + }, + { + "epoch": 0.4803788903924222, + "grad_norm": 1.7502390676959785, + "learning_rate": 9.890738003669029e-06, + "loss": 0.6003, + "step": 710 + }, + { + "epoch": 0.4810554803788904, + "grad_norm": 1.7317795497035786, + "learning_rate": 9.88991797417208e-06, + "loss": 0.6252, + "step": 711 + }, + { + "epoch": 0.4817320703653586, + "grad_norm": 1.785190533669979, + "learning_rate": 9.889094913189561e-06, + "loss": 0.6374, + "step": 712 + }, + { + "epoch": 0.4824086603518268, + "grad_norm": 1.578833584389062, + "learning_rate": 9.888268821231721e-06, + "loss": 0.6063, + "step": 713 + }, + { + "epoch": 0.48308525033829497, + "grad_norm": 1.7596384787332495, + "learning_rate": 9.887439698810694e-06, + "loss": 0.62, + "step": 714 + }, + { + "epoch": 0.4837618403247632, + "grad_norm": 1.594461218310731, + "learning_rate": 9.886607546440492e-06, + "loss": 0.6045, + "step": 715 + }, + { + "epoch": 0.4844384303112314, + "grad_norm": 1.7861022819494534, + "learning_rate": 9.885772364637002e-06, + "loss": 0.6406, + "step": 716 + }, + { + "epoch": 0.4851150202976996, + "grad_norm": 1.7637826966498402, + "learning_rate": 9.884934153917998e-06, + "loss": 0.6699, + "step": 717 + }, + { + "epoch": 0.4857916102841678, + "grad_norm": 1.7715147918492709, + "learning_rate": 9.884092914803119e-06, + "loss": 0.6445, + "step": 718 + }, + { + "epoch": 0.486468200270636, + "grad_norm": 1.7337695186422941, + "learning_rate": 9.88324864781389e-06, + "loss": 0.6273, + "step": 719 + }, + { + "epoch": 0.4871447902571042, + "grad_norm": 1.5963191484454953, + "learning_rate": 9.882401353473711e-06, + "loss": 0.5803, + "step": 720 + }, + { + "epoch": 0.4878213802435724, + "grad_norm": 1.6713741090198078, + "learning_rate": 9.881551032307859e-06, + "loss": 0.6024, + "step": 721 + }, + { + "epoch": 0.4884979702300406, + "grad_norm": 1.6048808576870026, + "learning_rate": 9.880697684843487e-06, + "loss": 0.6145, + "step": 722 + }, + { + "epoch": 0.4891745602165088, + "grad_norm": 1.7435957538894575, + "learning_rate": 9.879841311609625e-06, + "loss": 0.6097, + "step": 723 + }, + { + "epoch": 0.489851150202977, + "grad_norm": 1.6588461692725849, + "learning_rate": 9.878981913137178e-06, + "loss": 0.6155, + "step": 724 + }, + { + "epoch": 0.49052774018944517, + "grad_norm": 1.7396401883773205, + "learning_rate": 9.878119489958929e-06, + "loss": 0.6211, + "step": 725 + }, + { + "epoch": 0.4912043301759134, + "grad_norm": 1.6723586481609591, + "learning_rate": 9.877254042609529e-06, + "loss": 0.6384, + "step": 726 + }, + { + "epoch": 0.4918809201623816, + "grad_norm": 1.5772435277903296, + "learning_rate": 9.87638557162551e-06, + "loss": 0.6014, + "step": 727 + }, + { + "epoch": 0.4925575101488498, + "grad_norm": 1.6462833868011817, + "learning_rate": 9.875514077545282e-06, + "loss": 0.6496, + "step": 728 + }, + { + "epoch": 0.493234100135318, + "grad_norm": 1.7827549905726272, + "learning_rate": 9.874639560909118e-06, + "loss": 0.6522, + "step": 729 + }, + { + "epoch": 0.4939106901217862, + "grad_norm": 1.623572952971577, + "learning_rate": 9.873762022259177e-06, + "loss": 0.6044, + "step": 730 + }, + { + "epoch": 0.4945872801082544, + "grad_norm": 1.6716564125292326, + "learning_rate": 9.87288146213948e-06, + "loss": 0.5997, + "step": 731 + }, + { + "epoch": 0.4952638700947226, + "grad_norm": 1.7137680916001972, + "learning_rate": 9.87199788109593e-06, + "loss": 0.6069, + "step": 732 + }, + { + "epoch": 0.4959404600811908, + "grad_norm": 1.7164683410507124, + "learning_rate": 9.8711112796763e-06, + "loss": 0.6033, + "step": 733 + }, + { + "epoch": 0.496617050067659, + "grad_norm": 1.7395270346998304, + "learning_rate": 9.870221658430233e-06, + "loss": 0.5927, + "step": 734 + }, + { + "epoch": 0.4972936400541272, + "grad_norm": 1.6223821863202548, + "learning_rate": 9.869329017909248e-06, + "loss": 0.6003, + "step": 735 + }, + { + "epoch": 0.4979702300405954, + "grad_norm": 1.6582083441683446, + "learning_rate": 9.868433358666734e-06, + "loss": 0.5864, + "step": 736 + }, + { + "epoch": 0.4986468200270636, + "grad_norm": 1.6235788830079854, + "learning_rate": 9.86753468125795e-06, + "loss": 0.6234, + "step": 737 + }, + { + "epoch": 0.4993234100135318, + "grad_norm": 1.5575952668537478, + "learning_rate": 9.86663298624003e-06, + "loss": 0.6021, + "step": 738 + }, + { + "epoch": 0.5, + "grad_norm": 1.659423383816892, + "learning_rate": 9.865728274171972e-06, + "loss": 0.6102, + "step": 739 + }, + { + "epoch": 0.5006765899864682, + "grad_norm": 1.6923854605985684, + "learning_rate": 9.864820545614656e-06, + "loss": 0.6446, + "step": 740 + }, + { + "epoch": 0.5013531799729364, + "grad_norm": 1.6846452394519977, + "learning_rate": 9.863909801130816e-06, + "loss": 0.6482, + "step": 741 + }, + { + "epoch": 0.5020297699594046, + "grad_norm": 1.7552123660153136, + "learning_rate": 9.862996041285071e-06, + "loss": 0.5958, + "step": 742 + }, + { + "epoch": 0.5027063599458728, + "grad_norm": 1.6085553057005082, + "learning_rate": 9.862079266643899e-06, + "loss": 0.6171, + "step": 743 + }, + { + "epoch": 0.503382949932341, + "grad_norm": 1.6659215114844248, + "learning_rate": 9.861159477775653e-06, + "loss": 0.611, + "step": 744 + }, + { + "epoch": 0.5040595399188093, + "grad_norm": 1.622234077476183, + "learning_rate": 9.860236675250553e-06, + "loss": 0.629, + "step": 745 + }, + { + "epoch": 0.5047361299052774, + "grad_norm": 1.6778622263084553, + "learning_rate": 9.859310859640685e-06, + "loss": 0.6113, + "step": 746 + }, + { + "epoch": 0.5054127198917456, + "grad_norm": 1.6063455871758163, + "learning_rate": 9.858382031520005e-06, + "loss": 0.6181, + "step": 747 + }, + { + "epoch": 0.5060893098782138, + "grad_norm": 1.6017126433116808, + "learning_rate": 9.857450191464337e-06, + "loss": 0.604, + "step": 748 + }, + { + "epoch": 0.506765899864682, + "grad_norm": 1.5800409303674505, + "learning_rate": 9.856515340051374e-06, + "loss": 0.621, + "step": 749 + }, + { + "epoch": 0.5074424898511503, + "grad_norm": 1.6335282822148747, + "learning_rate": 9.855577477860669e-06, + "loss": 0.6128, + "step": 750 + }, + { + "epoch": 0.5081190798376184, + "grad_norm": 1.6471217728117251, + "learning_rate": 9.854636605473647e-06, + "loss": 0.6264, + "step": 751 + }, + { + "epoch": 0.5087956698240866, + "grad_norm": 1.6708409892034344, + "learning_rate": 9.8536927234736e-06, + "loss": 0.5891, + "step": 752 + }, + { + "epoch": 0.5094722598105548, + "grad_norm": 1.5678996027099266, + "learning_rate": 9.852745832445684e-06, + "loss": 0.621, + "step": 753 + }, + { + "epoch": 0.510148849797023, + "grad_norm": 1.5662685910655971, + "learning_rate": 9.851795932976919e-06, + "loss": 0.5957, + "step": 754 + }, + { + "epoch": 0.5108254397834912, + "grad_norm": 1.541521445093749, + "learning_rate": 9.850843025656194e-06, + "loss": 0.6006, + "step": 755 + }, + { + "epoch": 0.5115020297699594, + "grad_norm": 1.548693267118839, + "learning_rate": 9.849887111074256e-06, + "loss": 0.5999, + "step": 756 + }, + { + "epoch": 0.5121786197564276, + "grad_norm": 1.6330795062621721, + "learning_rate": 9.848928189823724e-06, + "loss": 0.6405, + "step": 757 + }, + { + "epoch": 0.5128552097428958, + "grad_norm": 1.6165681102295442, + "learning_rate": 9.847966262499073e-06, + "loss": 0.6105, + "step": 758 + }, + { + "epoch": 0.513531799729364, + "grad_norm": 1.6337780848379253, + "learning_rate": 9.847001329696653e-06, + "loss": 0.6194, + "step": 759 + }, + { + "epoch": 0.5142083897158322, + "grad_norm": 1.598034839418694, + "learning_rate": 9.846033392014665e-06, + "loss": 0.6161, + "step": 760 + }, + { + "epoch": 0.5148849797023004, + "grad_norm": 1.5342729581057934, + "learning_rate": 9.84506245005318e-06, + "loss": 0.6009, + "step": 761 + }, + { + "epoch": 0.5155615696887687, + "grad_norm": 1.5924867997084955, + "learning_rate": 9.84408850441413e-06, + "loss": 0.6206, + "step": 762 + }, + { + "epoch": 0.5162381596752368, + "grad_norm": 1.6130256943080854, + "learning_rate": 9.843111555701307e-06, + "loss": 0.5903, + "step": 763 + }, + { + "epoch": 0.516914749661705, + "grad_norm": 1.692415463857787, + "learning_rate": 9.84213160452037e-06, + "loss": 0.619, + "step": 764 + }, + { + "epoch": 0.5175913396481732, + "grad_norm": 1.6261092756599715, + "learning_rate": 9.841148651478833e-06, + "loss": 0.6073, + "step": 765 + }, + { + "epoch": 0.5182679296346414, + "grad_norm": 1.6455088298521052, + "learning_rate": 9.840162697186075e-06, + "loss": 0.6099, + "step": 766 + }, + { + "epoch": 0.5189445196211097, + "grad_norm": 1.5521392563452239, + "learning_rate": 9.839173742253334e-06, + "loss": 0.6111, + "step": 767 + }, + { + "epoch": 0.5196211096075778, + "grad_norm": 1.5765567484232255, + "learning_rate": 9.838181787293707e-06, + "loss": 0.5873, + "step": 768 + }, + { + "epoch": 0.520297699594046, + "grad_norm": 1.580053176799126, + "learning_rate": 9.837186832922157e-06, + "loss": 0.6019, + "step": 769 + }, + { + "epoch": 0.5209742895805142, + "grad_norm": 1.4773449572850088, + "learning_rate": 9.8361888797555e-06, + "loss": 0.5992, + "step": 770 + }, + { + "epoch": 0.5216508795669824, + "grad_norm": 1.5134890082335568, + "learning_rate": 9.835187928412412e-06, + "loss": 0.5826, + "step": 771 + }, + { + "epoch": 0.5223274695534507, + "grad_norm": 1.5839800415867358, + "learning_rate": 9.834183979513427e-06, + "loss": 0.6098, + "step": 772 + }, + { + "epoch": 0.5230040595399188, + "grad_norm": 1.560504675974206, + "learning_rate": 9.833177033680945e-06, + "loss": 0.5776, + "step": 773 + }, + { + "epoch": 0.523680649526387, + "grad_norm": 1.5092576948094416, + "learning_rate": 9.832167091539215e-06, + "loss": 0.5801, + "step": 774 + }, + { + "epoch": 0.5243572395128552, + "grad_norm": 1.5258055048481598, + "learning_rate": 9.831154153714344e-06, + "loss": 0.6124, + "step": 775 + }, + { + "epoch": 0.5250338294993234, + "grad_norm": 1.6141761010516198, + "learning_rate": 9.830138220834305e-06, + "loss": 0.5627, + "step": 776 + }, + { + "epoch": 0.5257104194857916, + "grad_norm": 1.4899611207648424, + "learning_rate": 9.829119293528916e-06, + "loss": 0.5995, + "step": 777 + }, + { + "epoch": 0.5263870094722598, + "grad_norm": 1.5297959832164443, + "learning_rate": 9.82809737242986e-06, + "loss": 0.5994, + "step": 778 + }, + { + "epoch": 0.527063599458728, + "grad_norm": 1.5009548259970094, + "learning_rate": 9.827072458170673e-06, + "loss": 0.5871, + "step": 779 + }, + { + "epoch": 0.5277401894451962, + "grad_norm": 1.534081377423664, + "learning_rate": 9.826044551386743e-06, + "loss": 0.6127, + "step": 780 + }, + { + "epoch": 0.5284167794316644, + "grad_norm": 1.5835054937186908, + "learning_rate": 9.825013652715323e-06, + "loss": 0.5705, + "step": 781 + }, + { + "epoch": 0.5290933694181326, + "grad_norm": 1.6197476891525862, + "learning_rate": 9.82397976279551e-06, + "loss": 0.6249, + "step": 782 + }, + { + "epoch": 0.5297699594046008, + "grad_norm": 1.5118940285786684, + "learning_rate": 9.822942882268261e-06, + "loss": 0.5644, + "step": 783 + }, + { + "epoch": 0.530446549391069, + "grad_norm": 1.61150903255849, + "learning_rate": 9.821903011776385e-06, + "loss": 0.5976, + "step": 784 + }, + { + "epoch": 0.5311231393775372, + "grad_norm": 1.5878566064030444, + "learning_rate": 9.820860151964548e-06, + "loss": 0.5916, + "step": 785 + }, + { + "epoch": 0.5317997293640054, + "grad_norm": 1.5508462981554838, + "learning_rate": 9.819814303479268e-06, + "loss": 0.6005, + "step": 786 + }, + { + "epoch": 0.5324763193504736, + "grad_norm": 1.5243635027775746, + "learning_rate": 9.818765466968909e-06, + "loss": 0.5954, + "step": 787 + }, + { + "epoch": 0.5331529093369418, + "grad_norm": 1.5074445872918962, + "learning_rate": 9.8177136430837e-06, + "loss": 0.5652, + "step": 788 + }, + { + "epoch": 0.5338294993234101, + "grad_norm": 1.5398916190523528, + "learning_rate": 9.816658832475709e-06, + "loss": 0.6261, + "step": 789 + }, + { + "epoch": 0.5345060893098782, + "grad_norm": 1.4913814473732248, + "learning_rate": 9.815601035798866e-06, + "loss": 0.5544, + "step": 790 + }, + { + "epoch": 0.5351826792963464, + "grad_norm": 1.512191575421141, + "learning_rate": 9.814540253708945e-06, + "loss": 0.6049, + "step": 791 + }, + { + "epoch": 0.5358592692828146, + "grad_norm": 1.506943852426354, + "learning_rate": 9.813476486863575e-06, + "loss": 0.5747, + "step": 792 + }, + { + "epoch": 0.5365358592692828, + "grad_norm": 1.5289902512576314, + "learning_rate": 9.812409735922236e-06, + "loss": 0.5927, + "step": 793 + }, + { + "epoch": 0.5372124492557511, + "grad_norm": 1.4688894059588264, + "learning_rate": 9.811340001546252e-06, + "loss": 0.5626, + "step": 794 + }, + { + "epoch": 0.5378890392422192, + "grad_norm": 1.5502677365309896, + "learning_rate": 9.810267284398805e-06, + "loss": 0.5775, + "step": 795 + }, + { + "epoch": 0.5385656292286874, + "grad_norm": 1.504260579122203, + "learning_rate": 9.80919158514492e-06, + "loss": 0.5816, + "step": 796 + }, + { + "epoch": 0.5392422192151556, + "grad_norm": 1.5332683654835957, + "learning_rate": 9.80811290445147e-06, + "loss": 0.61, + "step": 797 + }, + { + "epoch": 0.5399188092016238, + "grad_norm": 1.5829335228456114, + "learning_rate": 9.807031242987182e-06, + "loss": 0.5739, + "step": 798 + }, + { + "epoch": 0.540595399188092, + "grad_norm": 1.477030825458606, + "learning_rate": 9.805946601422628e-06, + "loss": 0.5709, + "step": 799 + }, + { + "epoch": 0.5412719891745602, + "grad_norm": 1.5310442073122814, + "learning_rate": 9.804858980430225e-06, + "loss": 0.5774, + "step": 800 + }, + { + "epoch": 0.5419485791610285, + "grad_norm": 1.6198458755827938, + "learning_rate": 9.803768380684242e-06, + "loss": 0.5983, + "step": 801 + }, + { + "epoch": 0.5426251691474966, + "grad_norm": 1.5722213284156308, + "learning_rate": 9.80267480286079e-06, + "loss": 0.592, + "step": 802 + }, + { + "epoch": 0.5433017591339648, + "grad_norm": 1.5478352782171632, + "learning_rate": 9.801578247637828e-06, + "loss": 0.5861, + "step": 803 + }, + { + "epoch": 0.543978349120433, + "grad_norm": 1.5819907116760954, + "learning_rate": 9.800478715695165e-06, + "loss": 0.6087, + "step": 804 + }, + { + "epoch": 0.5446549391069012, + "grad_norm": 1.524512354441419, + "learning_rate": 9.799376207714446e-06, + "loss": 0.5879, + "step": 805 + }, + { + "epoch": 0.5453315290933695, + "grad_norm": 1.517953472171127, + "learning_rate": 9.79827072437917e-06, + "loss": 0.5882, + "step": 806 + }, + { + "epoch": 0.5460081190798376, + "grad_norm": 1.6697483870541, + "learning_rate": 9.797162266374677e-06, + "loss": 0.6129, + "step": 807 + }, + { + "epoch": 0.5466847090663058, + "grad_norm": 1.5406927455004877, + "learning_rate": 9.79605083438815e-06, + "loss": 0.6111, + "step": 808 + }, + { + "epoch": 0.547361299052774, + "grad_norm": 1.573139563138134, + "learning_rate": 9.794936429108617e-06, + "loss": 0.5508, + "step": 809 + }, + { + "epoch": 0.5480378890392422, + "grad_norm": 1.5569949399993268, + "learning_rate": 9.79381905122695e-06, + "loss": 0.5967, + "step": 810 + }, + { + "epoch": 0.5487144790257105, + "grad_norm": 1.440985317133278, + "learning_rate": 9.792698701435863e-06, + "loss": 0.5741, + "step": 811 + }, + { + "epoch": 0.5493910690121786, + "grad_norm": 1.4814660344152049, + "learning_rate": 9.791575380429911e-06, + "loss": 0.5865, + "step": 812 + }, + { + "epoch": 0.5500676589986468, + "grad_norm": 1.4855959485099581, + "learning_rate": 9.790449088905496e-06, + "loss": 0.5847, + "step": 813 + }, + { + "epoch": 0.550744248985115, + "grad_norm": 1.3877950490527795, + "learning_rate": 9.789319827560854e-06, + "loss": 0.5636, + "step": 814 + }, + { + "epoch": 0.5514208389715832, + "grad_norm": 1.4871104111545084, + "learning_rate": 9.78818759709607e-06, + "loss": 0.5699, + "step": 815 + }, + { + "epoch": 0.5520974289580515, + "grad_norm": 1.5520974904698874, + "learning_rate": 9.787052398213062e-06, + "loss": 0.6043, + "step": 816 + }, + { + "epoch": 0.5527740189445196, + "grad_norm": 1.51941049125711, + "learning_rate": 9.785914231615595e-06, + "loss": 0.5894, + "step": 817 + }, + { + "epoch": 0.5534506089309879, + "grad_norm": 1.51253479768057, + "learning_rate": 9.784773098009269e-06, + "loss": 0.5941, + "step": 818 + }, + { + "epoch": 0.554127198917456, + "grad_norm": 1.4985468635005212, + "learning_rate": 9.783628998101525e-06, + "loss": 0.6045, + "step": 819 + }, + { + "epoch": 0.5548037889039242, + "grad_norm": 1.7399730564579428, + "learning_rate": 9.782481932601643e-06, + "loss": 0.6035, + "step": 820 + }, + { + "epoch": 0.5554803788903924, + "grad_norm": 1.5148424096689828, + "learning_rate": 9.781331902220748e-06, + "loss": 0.5818, + "step": 821 + }, + { + "epoch": 0.5561569688768606, + "grad_norm": 1.5734432459925136, + "learning_rate": 9.780178907671788e-06, + "loss": 0.598, + "step": 822 + }, + { + "epoch": 0.5568335588633289, + "grad_norm": 1.4341192532764733, + "learning_rate": 9.779022949669565e-06, + "loss": 0.5712, + "step": 823 + }, + { + "epoch": 0.557510148849797, + "grad_norm": 1.4828360153292617, + "learning_rate": 9.777864028930705e-06, + "loss": 0.5762, + "step": 824 + }, + { + "epoch": 0.5581867388362652, + "grad_norm": 1.5272581217393904, + "learning_rate": 9.776702146173678e-06, + "loss": 0.5902, + "step": 825 + }, + { + "epoch": 0.5588633288227334, + "grad_norm": 1.474462218909566, + "learning_rate": 9.775537302118791e-06, + "loss": 0.5937, + "step": 826 + }, + { + "epoch": 0.5595399188092016, + "grad_norm": 1.4711865855163142, + "learning_rate": 9.77436949748818e-06, + "loss": 0.5624, + "step": 827 + }, + { + "epoch": 0.5602165087956699, + "grad_norm": 1.5541161951656597, + "learning_rate": 9.773198733005827e-06, + "loss": 0.5636, + "step": 828 + }, + { + "epoch": 0.560893098782138, + "grad_norm": 1.4528736826698192, + "learning_rate": 9.772025009397538e-06, + "loss": 0.5708, + "step": 829 + }, + { + "epoch": 0.5615696887686062, + "grad_norm": 1.5348342474317382, + "learning_rate": 9.770848327390961e-06, + "loss": 0.578, + "step": 830 + }, + { + "epoch": 0.5622462787550744, + "grad_norm": 1.5639549858597588, + "learning_rate": 9.769668687715572e-06, + "loss": 0.5973, + "step": 831 + }, + { + "epoch": 0.5629228687415426, + "grad_norm": 1.4919566493475664, + "learning_rate": 9.76848609110269e-06, + "loss": 0.5771, + "step": 832 + }, + { + "epoch": 0.5635994587280109, + "grad_norm": 1.590278449722994, + "learning_rate": 9.767300538285454e-06, + "loss": 0.5713, + "step": 833 + }, + { + "epoch": 0.564276048714479, + "grad_norm": 1.5159888392403542, + "learning_rate": 9.766112029998847e-06, + "loss": 0.5746, + "step": 834 + }, + { + "epoch": 0.5649526387009473, + "grad_norm": 1.4087331775743346, + "learning_rate": 9.76492056697968e-06, + "loss": 0.5656, + "step": 835 + }, + { + "epoch": 0.5656292286874154, + "grad_norm": 1.5401107560462461, + "learning_rate": 9.763726149966596e-06, + "loss": 0.5855, + "step": 836 + }, + { + "epoch": 0.5663058186738836, + "grad_norm": 1.4859624312720217, + "learning_rate": 9.762528779700067e-06, + "loss": 0.5793, + "step": 837 + }, + { + "epoch": 0.5669824086603519, + "grad_norm": 1.5000618258437695, + "learning_rate": 9.7613284569224e-06, + "loss": 0.5952, + "step": 838 + }, + { + "epoch": 0.56765899864682, + "grad_norm": 1.4847226268347697, + "learning_rate": 9.760125182377732e-06, + "loss": 0.5599, + "step": 839 + }, + { + "epoch": 0.5683355886332883, + "grad_norm": 1.3974150433692198, + "learning_rate": 9.758918956812024e-06, + "loss": 0.5836, + "step": 840 + }, + { + "epoch": 0.5690121786197564, + "grad_norm": 1.5439055222320957, + "learning_rate": 9.757709780973074e-06, + "loss": 0.5825, + "step": 841 + }, + { + "epoch": 0.5696887686062246, + "grad_norm": 1.4336760360463616, + "learning_rate": 9.756497655610503e-06, + "loss": 0.5572, + "step": 842 + }, + { + "epoch": 0.5703653585926928, + "grad_norm": 1.4289167361745316, + "learning_rate": 9.755282581475769e-06, + "loss": 0.5614, + "step": 843 + }, + { + "epoch": 0.571041948579161, + "grad_norm": 1.4397286373393339, + "learning_rate": 9.754064559322147e-06, + "loss": 0.5759, + "step": 844 + }, + { + "epoch": 0.5717185385656293, + "grad_norm": 1.3921029382982602, + "learning_rate": 9.752843589904746e-06, + "loss": 0.5647, + "step": 845 + }, + { + "epoch": 0.5723951285520974, + "grad_norm": 1.4569036063966363, + "learning_rate": 9.751619673980503e-06, + "loss": 0.5993, + "step": 846 + }, + { + "epoch": 0.5730717185385656, + "grad_norm": 1.4955773165520974, + "learning_rate": 9.75039281230818e-06, + "loss": 0.5747, + "step": 847 + }, + { + "epoch": 0.5737483085250338, + "grad_norm": 1.43011602275059, + "learning_rate": 9.749163005648362e-06, + "loss": 0.556, + "step": 848 + }, + { + "epoch": 0.574424898511502, + "grad_norm": 1.4987929090940766, + "learning_rate": 9.747930254763467e-06, + "loss": 0.601, + "step": 849 + }, + { + "epoch": 0.5751014884979703, + "grad_norm": 1.5221805690391956, + "learning_rate": 9.746694560417731e-06, + "loss": 0.5792, + "step": 850 + }, + { + "epoch": 0.5757780784844384, + "grad_norm": 1.4409909021292568, + "learning_rate": 9.745455923377218e-06, + "loss": 0.5686, + "step": 851 + }, + { + "epoch": 0.5764546684709067, + "grad_norm": 1.5331733122946738, + "learning_rate": 9.74421434440982e-06, + "loss": 0.5877, + "step": 852 + }, + { + "epoch": 0.5771312584573748, + "grad_norm": 1.7728555669909092, + "learning_rate": 9.742969824285244e-06, + "loss": 0.5796, + "step": 853 + }, + { + "epoch": 0.577807848443843, + "grad_norm": 1.5527178949346045, + "learning_rate": 9.741722363775029e-06, + "loss": 0.5775, + "step": 854 + }, + { + "epoch": 0.5784844384303113, + "grad_norm": 1.4144723729143567, + "learning_rate": 9.74047196365253e-06, + "loss": 0.5894, + "step": 855 + }, + { + "epoch": 0.5791610284167794, + "grad_norm": 1.4405172045715295, + "learning_rate": 9.73921862469293e-06, + "loss": 0.56, + "step": 856 + }, + { + "epoch": 0.5798376184032477, + "grad_norm": 1.4593763462458083, + "learning_rate": 9.737962347673232e-06, + "loss": 0.5653, + "step": 857 + }, + { + "epoch": 0.5805142083897158, + "grad_norm": 1.5231015715647205, + "learning_rate": 9.736703133372259e-06, + "loss": 0.6081, + "step": 858 + }, + { + "epoch": 0.581190798376184, + "grad_norm": 1.3640730713147275, + "learning_rate": 9.735440982570656e-06, + "loss": 0.5729, + "step": 859 + }, + { + "epoch": 0.5818673883626523, + "grad_norm": 1.4477405879003105, + "learning_rate": 9.734175896050889e-06, + "loss": 0.5443, + "step": 860 + }, + { + "epoch": 0.5825439783491204, + "grad_norm": 1.3710307808039208, + "learning_rate": 9.732907874597241e-06, + "loss": 0.542, + "step": 861 + }, + { + "epoch": 0.5832205683355887, + "grad_norm": 1.3824132249109022, + "learning_rate": 9.731636918995821e-06, + "loss": 0.574, + "step": 862 + }, + { + "epoch": 0.5838971583220568, + "grad_norm": 1.4496950380801756, + "learning_rate": 9.730363030034551e-06, + "loss": 0.5632, + "step": 863 + }, + { + "epoch": 0.584573748308525, + "grad_norm": 1.389239972724504, + "learning_rate": 9.729086208503174e-06, + "loss": 0.5623, + "step": 864 + }, + { + "epoch": 0.5852503382949933, + "grad_norm": 1.491143461609495, + "learning_rate": 9.72780645519325e-06, + "loss": 0.5456, + "step": 865 + }, + { + "epoch": 0.5859269282814614, + "grad_norm": 1.5239062721142338, + "learning_rate": 9.726523770898157e-06, + "loss": 0.5616, + "step": 866 + }, + { + "epoch": 0.5866035182679297, + "grad_norm": 1.5155111551809417, + "learning_rate": 9.725238156413089e-06, + "loss": 0.5865, + "step": 867 + }, + { + "epoch": 0.5872801082543978, + "grad_norm": 1.477315881988796, + "learning_rate": 9.72394961253506e-06, + "loss": 0.5578, + "step": 868 + }, + { + "epoch": 0.587956698240866, + "grad_norm": 1.3431613060276728, + "learning_rate": 9.722658140062898e-06, + "loss": 0.555, + "step": 869 + }, + { + "epoch": 0.5886332882273342, + "grad_norm": 1.472403673794486, + "learning_rate": 9.721363739797243e-06, + "loss": 0.5638, + "step": 870 + }, + { + "epoch": 0.5893098782138024, + "grad_norm": 1.5119770565727169, + "learning_rate": 9.720066412540554e-06, + "loss": 0.5509, + "step": 871 + }, + { + "epoch": 0.5899864682002707, + "grad_norm": 1.4581998018729656, + "learning_rate": 9.718766159097109e-06, + "loss": 0.5741, + "step": 872 + }, + { + "epoch": 0.5906630581867388, + "grad_norm": 1.4778398991082073, + "learning_rate": 9.717462980272989e-06, + "loss": 0.5838, + "step": 873 + }, + { + "epoch": 0.591339648173207, + "grad_norm": 1.450744346748018, + "learning_rate": 9.716156876876096e-06, + "loss": 0.5735, + "step": 874 + }, + { + "epoch": 0.5920162381596752, + "grad_norm": 1.4330043370067966, + "learning_rate": 9.714847849716149e-06, + "loss": 0.5486, + "step": 875 + }, + { + "epoch": 0.5926928281461434, + "grad_norm": 1.3295272980967532, + "learning_rate": 9.713535899604667e-06, + "loss": 0.5562, + "step": 876 + }, + { + "epoch": 0.5933694181326117, + "grad_norm": 1.5018536264245457, + "learning_rate": 9.71222102735499e-06, + "loss": 0.5559, + "step": 877 + }, + { + "epoch": 0.5940460081190798, + "grad_norm": 1.5885433466214243, + "learning_rate": 9.710903233782273e-06, + "loss": 0.59, + "step": 878 + }, + { + "epoch": 0.5947225981055481, + "grad_norm": 1.371744266648417, + "learning_rate": 9.70958251970347e-06, + "loss": 0.5463, + "step": 879 + }, + { + "epoch": 0.5953991880920162, + "grad_norm": 1.3863121051011533, + "learning_rate": 9.708258885937359e-06, + "loss": 0.5481, + "step": 880 + }, + { + "epoch": 0.5960757780784844, + "grad_norm": 1.4983021715077849, + "learning_rate": 9.706932333304518e-06, + "loss": 0.599, + "step": 881 + }, + { + "epoch": 0.5967523680649527, + "grad_norm": 1.4007593996650822, + "learning_rate": 9.705602862627335e-06, + "loss": 0.557, + "step": 882 + }, + { + "epoch": 0.5974289580514208, + "grad_norm": 1.4820775219149764, + "learning_rate": 9.704270474730018e-06, + "loss": 0.5603, + "step": 883 + }, + { + "epoch": 0.5981055480378891, + "grad_norm": 1.5332245521088, + "learning_rate": 9.70293517043857e-06, + "loss": 0.5657, + "step": 884 + }, + { + "epoch": 0.5987821380243572, + "grad_norm": 1.5419686111048978, + "learning_rate": 9.701596950580807e-06, + "loss": 0.5884, + "step": 885 + }, + { + "epoch": 0.5994587280108254, + "grad_norm": 1.4434909655666306, + "learning_rate": 9.700255815986357e-06, + "loss": 0.5599, + "step": 886 + }, + { + "epoch": 0.6001353179972937, + "grad_norm": 1.3979104123355959, + "learning_rate": 9.69891176748665e-06, + "loss": 0.5444, + "step": 887 + }, + { + "epoch": 0.6008119079837618, + "grad_norm": 1.331308664936877, + "learning_rate": 9.697564805914922e-06, + "loss": 0.5499, + "step": 888 + }, + { + "epoch": 0.6014884979702301, + "grad_norm": 1.327499797705502, + "learning_rate": 9.696214932106218e-06, + "loss": 0.5408, + "step": 889 + }, + { + "epoch": 0.6021650879566982, + "grad_norm": 1.3379638644164007, + "learning_rate": 9.694862146897385e-06, + "loss": 0.5497, + "step": 890 + }, + { + "epoch": 0.6028416779431665, + "grad_norm": 1.3288675507428929, + "learning_rate": 9.693506451127082e-06, + "loss": 0.5512, + "step": 891 + }, + { + "epoch": 0.6035182679296346, + "grad_norm": 1.3719618397137918, + "learning_rate": 9.692147845635761e-06, + "loss": 0.5609, + "step": 892 + }, + { + "epoch": 0.6041948579161028, + "grad_norm": 1.309436398801699, + "learning_rate": 9.690786331265687e-06, + "loss": 0.5488, + "step": 893 + }, + { + "epoch": 0.6048714479025711, + "grad_norm": 1.3943922122388013, + "learning_rate": 9.689421908860928e-06, + "loss": 0.5471, + "step": 894 + }, + { + "epoch": 0.6055480378890392, + "grad_norm": 1.4847830573388547, + "learning_rate": 9.688054579267347e-06, + "loss": 0.5655, + "step": 895 + }, + { + "epoch": 0.6062246278755075, + "grad_norm": 1.4118053169125588, + "learning_rate": 9.68668434333262e-06, + "loss": 0.5689, + "step": 896 + }, + { + "epoch": 0.6069012178619756, + "grad_norm": 1.4584980750152714, + "learning_rate": 9.685311201906216e-06, + "loss": 0.5756, + "step": 897 + }, + { + "epoch": 0.6075778078484438, + "grad_norm": 1.425428426125189, + "learning_rate": 9.683935155839408e-06, + "loss": 0.5874, + "step": 898 + }, + { + "epoch": 0.6082543978349121, + "grad_norm": 1.3970744593822986, + "learning_rate": 9.682556205985274e-06, + "loss": 0.5661, + "step": 899 + }, + { + "epoch": 0.6089309878213802, + "grad_norm": 1.3837680826838403, + "learning_rate": 9.681174353198687e-06, + "loss": 0.5714, + "step": 900 + }, + { + "epoch": 0.6096075778078485, + "grad_norm": 1.441948220204842, + "learning_rate": 9.67978959833632e-06, + "loss": 0.5478, + "step": 901 + }, + { + "epoch": 0.6102841677943166, + "grad_norm": 1.3741120155021025, + "learning_rate": 9.678401942256648e-06, + "loss": 0.5837, + "step": 902 + }, + { + "epoch": 0.6109607577807848, + "grad_norm": 1.3963868245289623, + "learning_rate": 9.67701138581994e-06, + "loss": 0.5608, + "step": 903 + }, + { + "epoch": 0.6116373477672531, + "grad_norm": 1.384844476497834, + "learning_rate": 9.675617929888271e-06, + "loss": 0.5368, + "step": 904 + }, + { + "epoch": 0.6123139377537212, + "grad_norm": 1.3278573496251747, + "learning_rate": 9.674221575325503e-06, + "loss": 0.54, + "step": 905 + }, + { + "epoch": 0.6129905277401895, + "grad_norm": 1.3923228339398561, + "learning_rate": 9.672822322997305e-06, + "loss": 0.561, + "step": 906 + }, + { + "epoch": 0.6136671177266576, + "grad_norm": 1.395088698787219, + "learning_rate": 9.671420173771135e-06, + "loss": 0.5593, + "step": 907 + }, + { + "epoch": 0.6143437077131259, + "grad_norm": 1.4409163976038764, + "learning_rate": 9.670015128516253e-06, + "loss": 0.5521, + "step": 908 + }, + { + "epoch": 0.6150202976995941, + "grad_norm": 1.3785741338135242, + "learning_rate": 9.668607188103708e-06, + "loss": 0.5645, + "step": 909 + }, + { + "epoch": 0.6156968876860622, + "grad_norm": 1.4063255874161504, + "learning_rate": 9.667196353406352e-06, + "loss": 0.5514, + "step": 910 + }, + { + "epoch": 0.6163734776725305, + "grad_norm": 1.3357192560767774, + "learning_rate": 9.665782625298821e-06, + "loss": 0.5383, + "step": 911 + }, + { + "epoch": 0.6170500676589986, + "grad_norm": 1.3539482074197706, + "learning_rate": 9.664366004657553e-06, + "loss": 0.5604, + "step": 912 + }, + { + "epoch": 0.6177266576454669, + "grad_norm": 1.310691687673011, + "learning_rate": 9.662946492360777e-06, + "loss": 0.5303, + "step": 913 + }, + { + "epoch": 0.618403247631935, + "grad_norm": 1.2800569076417998, + "learning_rate": 9.66152408928851e-06, + "loss": 0.556, + "step": 914 + }, + { + "epoch": 0.6190798376184032, + "grad_norm": 1.3973489965953194, + "learning_rate": 9.66009879632257e-06, + "loss": 0.5725, + "step": 915 + }, + { + "epoch": 0.6197564276048715, + "grad_norm": 1.451780064487979, + "learning_rate": 9.65867061434656e-06, + "loss": 0.5617, + "step": 916 + }, + { + "epoch": 0.6204330175913396, + "grad_norm": 1.35933524582482, + "learning_rate": 9.657239544245877e-06, + "loss": 0.5573, + "step": 917 + }, + { + "epoch": 0.6211096075778079, + "grad_norm": 1.4232172017425695, + "learning_rate": 9.655805586907705e-06, + "loss": 0.5466, + "step": 918 + }, + { + "epoch": 0.621786197564276, + "grad_norm": 1.4312081095893574, + "learning_rate": 9.654368743221022e-06, + "loss": 0.5625, + "step": 919 + }, + { + "epoch": 0.6224627875507442, + "grad_norm": 1.3912552358253953, + "learning_rate": 9.652929014076593e-06, + "loss": 0.5683, + "step": 920 + }, + { + "epoch": 0.6231393775372125, + "grad_norm": 1.3702305761206166, + "learning_rate": 9.651486400366972e-06, + "loss": 0.5302, + "step": 921 + }, + { + "epoch": 0.6238159675236806, + "grad_norm": 1.3785136595357277, + "learning_rate": 9.650040902986504e-06, + "loss": 0.5267, + "step": 922 + }, + { + "epoch": 0.6244925575101489, + "grad_norm": 1.42489356713615, + "learning_rate": 9.648592522831316e-06, + "loss": 0.5678, + "step": 923 + }, + { + "epoch": 0.625169147496617, + "grad_norm": 1.458705178998173, + "learning_rate": 9.64714126079933e-06, + "loss": 0.5903, + "step": 924 + }, + { + "epoch": 0.6258457374830853, + "grad_norm": 1.2919714844532744, + "learning_rate": 9.645687117790246e-06, + "loss": 0.539, + "step": 925 + }, + { + "epoch": 0.6265223274695535, + "grad_norm": 1.3159779958295432, + "learning_rate": 9.644230094705555e-06, + "loss": 0.5482, + "step": 926 + }, + { + "epoch": 0.6271989174560216, + "grad_norm": 1.372558886610807, + "learning_rate": 9.642770192448537e-06, + "loss": 0.5513, + "step": 927 + }, + { + "epoch": 0.6278755074424899, + "grad_norm": 1.4623448167001696, + "learning_rate": 9.641307411924246e-06, + "loss": 0.553, + "step": 928 + }, + { + "epoch": 0.628552097428958, + "grad_norm": 1.2857193407365461, + "learning_rate": 9.639841754039534e-06, + "loss": 0.5448, + "step": 929 + }, + { + "epoch": 0.6292286874154263, + "grad_norm": 1.3979647765995682, + "learning_rate": 9.638373219703023e-06, + "loss": 0.5409, + "step": 930 + }, + { + "epoch": 0.6299052774018945, + "grad_norm": 1.4450876574726212, + "learning_rate": 9.63690180982513e-06, + "loss": 0.5499, + "step": 931 + }, + { + "epoch": 0.6305818673883626, + "grad_norm": 1.387580596637908, + "learning_rate": 9.635427525318048e-06, + "loss": 0.5444, + "step": 932 + }, + { + "epoch": 0.6312584573748309, + "grad_norm": 1.3897869799870526, + "learning_rate": 9.633950367095758e-06, + "loss": 0.5409, + "step": 933 + }, + { + "epoch": 0.631935047361299, + "grad_norm": 1.3799414280334457, + "learning_rate": 9.632470336074009e-06, + "loss": 0.5615, + "step": 934 + }, + { + "epoch": 0.6326116373477673, + "grad_norm": 1.397109410439002, + "learning_rate": 9.63098743317035e-06, + "loss": 0.5588, + "step": 935 + }, + { + "epoch": 0.6332882273342354, + "grad_norm": 1.3877108753591307, + "learning_rate": 9.629501659304096e-06, + "loss": 0.5332, + "step": 936 + }, + { + "epoch": 0.6339648173207036, + "grad_norm": 1.425214924037294, + "learning_rate": 9.628013015396347e-06, + "loss": 0.5406, + "step": 937 + }, + { + "epoch": 0.6346414073071719, + "grad_norm": 1.427636608291554, + "learning_rate": 9.626521502369984e-06, + "loss": 0.5588, + "step": 938 + }, + { + "epoch": 0.63531799729364, + "grad_norm": 1.3998173217547065, + "learning_rate": 9.625027121149665e-06, + "loss": 0.5755, + "step": 939 + }, + { + "epoch": 0.6359945872801083, + "grad_norm": 1.3913272168959687, + "learning_rate": 9.623529872661821e-06, + "loss": 0.5911, + "step": 940 + }, + { + "epoch": 0.6366711772665764, + "grad_norm": 1.4119241403062532, + "learning_rate": 9.62202975783467e-06, + "loss": 0.5698, + "step": 941 + }, + { + "epoch": 0.6373477672530447, + "grad_norm": 1.369482238930773, + "learning_rate": 9.620526777598202e-06, + "loss": 0.5501, + "step": 942 + }, + { + "epoch": 0.6380243572395129, + "grad_norm": 1.4337966859514184, + "learning_rate": 9.619020932884182e-06, + "loss": 0.5809, + "step": 943 + }, + { + "epoch": 0.638700947225981, + "grad_norm": 1.3801684958426206, + "learning_rate": 9.617512224626153e-06, + "loss": 0.538, + "step": 944 + }, + { + "epoch": 0.6393775372124493, + "grad_norm": 1.3568642938213509, + "learning_rate": 9.616000653759435e-06, + "loss": 0.5234, + "step": 945 + }, + { + "epoch": 0.6400541271989174, + "grad_norm": 1.4120466838855552, + "learning_rate": 9.614486221221115e-06, + "loss": 0.5589, + "step": 946 + }, + { + "epoch": 0.6407307171853857, + "grad_norm": 1.4200432234755074, + "learning_rate": 9.612968927950066e-06, + "loss": 0.5589, + "step": 947 + }, + { + "epoch": 0.6414073071718539, + "grad_norm": 1.514542623903677, + "learning_rate": 9.611448774886925e-06, + "loss": 0.5467, + "step": 948 + }, + { + "epoch": 0.642083897158322, + "grad_norm": 1.3394445600719525, + "learning_rate": 9.609925762974103e-06, + "loss": 0.5461, + "step": 949 + }, + { + "epoch": 0.6427604871447903, + "grad_norm": 1.3558253169872096, + "learning_rate": 9.60839989315579e-06, + "loss": 0.5464, + "step": 950 + }, + { + "epoch": 0.6434370771312584, + "grad_norm": 1.3355916041692906, + "learning_rate": 9.606871166377939e-06, + "loss": 0.5473, + "step": 951 + }, + { + "epoch": 0.6441136671177267, + "grad_norm": 1.4277624564048805, + "learning_rate": 9.60533958358828e-06, + "loss": 0.5627, + "step": 952 + }, + { + "epoch": 0.6447902571041949, + "grad_norm": 1.39810856974054, + "learning_rate": 9.603805145736311e-06, + "loss": 0.5506, + "step": 953 + }, + { + "epoch": 0.645466847090663, + "grad_norm": 1.399253302116368, + "learning_rate": 9.602267853773301e-06, + "loss": 0.525, + "step": 954 + }, + { + "epoch": 0.6461434370771313, + "grad_norm": 1.3599740675606824, + "learning_rate": 9.60072770865229e-06, + "loss": 0.5466, + "step": 955 + }, + { + "epoch": 0.6468200270635994, + "grad_norm": 1.3238748547686656, + "learning_rate": 9.599184711328082e-06, + "loss": 0.5251, + "step": 956 + }, + { + "epoch": 0.6474966170500677, + "grad_norm": 1.405404092108196, + "learning_rate": 9.597638862757255e-06, + "loss": 0.5839, + "step": 957 + }, + { + "epoch": 0.6481732070365359, + "grad_norm": 1.3278904637408784, + "learning_rate": 9.596090163898148e-06, + "loss": 0.5473, + "step": 958 + }, + { + "epoch": 0.648849797023004, + "grad_norm": 1.3142690515046698, + "learning_rate": 9.594538615710875e-06, + "loss": 0.5397, + "step": 959 + }, + { + "epoch": 0.6495263870094723, + "grad_norm": 1.2226279962051259, + "learning_rate": 9.59298421915731e-06, + "loss": 0.5249, + "step": 960 + }, + { + "epoch": 0.6502029769959404, + "grad_norm": 1.3097653165372385, + "learning_rate": 9.591426975201093e-06, + "loss": 0.5431, + "step": 961 + }, + { + "epoch": 0.6508795669824087, + "grad_norm": 1.3418537512883573, + "learning_rate": 9.589866884807637e-06, + "loss": 0.5573, + "step": 962 + }, + { + "epoch": 0.6515561569688768, + "grad_norm": 1.3365246025089783, + "learning_rate": 9.588303948944109e-06, + "loss": 0.5154, + "step": 963 + }, + { + "epoch": 0.652232746955345, + "grad_norm": 1.3271238141898198, + "learning_rate": 9.586738168579446e-06, + "loss": 0.5466, + "step": 964 + }, + { + "epoch": 0.6529093369418133, + "grad_norm": 1.3178585046613578, + "learning_rate": 9.58516954468435e-06, + "loss": 0.5526, + "step": 965 + }, + { + "epoch": 0.6535859269282814, + "grad_norm": 1.3989072692037787, + "learning_rate": 9.58359807823128e-06, + "loss": 0.5663, + "step": 966 + }, + { + "epoch": 0.6542625169147497, + "grad_norm": 1.3555037515851058, + "learning_rate": 9.582023770194462e-06, + "loss": 0.5547, + "step": 967 + }, + { + "epoch": 0.6549391069012178, + "grad_norm": 1.3813861872852315, + "learning_rate": 9.580446621549883e-06, + "loss": 0.5479, + "step": 968 + }, + { + "epoch": 0.6556156968876861, + "grad_norm": 1.424745070176629, + "learning_rate": 9.578866633275289e-06, + "loss": 0.568, + "step": 969 + }, + { + "epoch": 0.6562922868741543, + "grad_norm": 1.2674000115717234, + "learning_rate": 9.577283806350186e-06, + "loss": 0.4873, + "step": 970 + }, + { + "epoch": 0.6569688768606224, + "grad_norm": 1.4144882152699472, + "learning_rate": 9.575698141755844e-06, + "loss": 0.5732, + "step": 971 + }, + { + "epoch": 0.6576454668470907, + "grad_norm": 1.3286139699632946, + "learning_rate": 9.57410964047529e-06, + "loss": 0.549, + "step": 972 + }, + { + "epoch": 0.6583220568335588, + "grad_norm": 1.369980768822487, + "learning_rate": 9.572518303493305e-06, + "loss": 0.5637, + "step": 973 + }, + { + "epoch": 0.6589986468200271, + "grad_norm": 1.3191259196641867, + "learning_rate": 9.570924131796437e-06, + "loss": 0.5219, + "step": 974 + }, + { + "epoch": 0.6596752368064953, + "grad_norm": 1.2876748949527599, + "learning_rate": 9.569327126372985e-06, + "loss": 0.5313, + "step": 975 + }, + { + "epoch": 0.6603518267929634, + "grad_norm": 1.2517769114590152, + "learning_rate": 9.567727288213005e-06, + "loss": 0.5229, + "step": 976 + }, + { + "epoch": 0.6610284167794317, + "grad_norm": 1.3386273910558741, + "learning_rate": 9.566124618308312e-06, + "loss": 0.5554, + "step": 977 + }, + { + "epoch": 0.6617050067658998, + "grad_norm": 1.2835140736196629, + "learning_rate": 9.564519117652473e-06, + "loss": 0.5691, + "step": 978 + }, + { + "epoch": 0.6623815967523681, + "grad_norm": 1.3637938650411934, + "learning_rate": 9.562910787240814e-06, + "loss": 0.5489, + "step": 979 + }, + { + "epoch": 0.6630581867388363, + "grad_norm": 1.349687301178395, + "learning_rate": 9.56129962807041e-06, + "loss": 0.5521, + "step": 980 + }, + { + "epoch": 0.6637347767253045, + "grad_norm": 1.4169245371858592, + "learning_rate": 9.559685641140098e-06, + "loss": 0.5587, + "step": 981 + }, + { + "epoch": 0.6644113667117727, + "grad_norm": 1.3985311547678738, + "learning_rate": 9.55806882745046e-06, + "loss": 0.5285, + "step": 982 + }, + { + "epoch": 0.6650879566982408, + "grad_norm": 1.307960497977671, + "learning_rate": 9.556449188003831e-06, + "loss": 0.5028, + "step": 983 + }, + { + "epoch": 0.6657645466847091, + "grad_norm": 1.501170859949113, + "learning_rate": 9.554826723804304e-06, + "loss": 0.5557, + "step": 984 + }, + { + "epoch": 0.6664411366711772, + "grad_norm": 1.5050837366352103, + "learning_rate": 9.553201435857718e-06, + "loss": 0.5449, + "step": 985 + }, + { + "epoch": 0.6671177266576455, + "grad_norm": 1.279971240070052, + "learning_rate": 9.551573325171662e-06, + "loss": 0.52, + "step": 986 + }, + { + "epoch": 0.6677943166441137, + "grad_norm": 1.3580033928572774, + "learning_rate": 9.54994239275548e-06, + "loss": 0.5281, + "step": 987 + }, + { + "epoch": 0.6684709066305818, + "grad_norm": 1.4122733978886406, + "learning_rate": 9.54830863962026e-06, + "loss": 0.5377, + "step": 988 + }, + { + "epoch": 0.6691474966170501, + "grad_norm": 1.318103326767308, + "learning_rate": 9.546672066778842e-06, + "loss": 0.5491, + "step": 989 + }, + { + "epoch": 0.6698240866035182, + "grad_norm": 1.367910485968864, + "learning_rate": 9.545032675245814e-06, + "loss": 0.5509, + "step": 990 + }, + { + "epoch": 0.6705006765899865, + "grad_norm": 1.3568884466439552, + "learning_rate": 9.543390466037507e-06, + "loss": 0.5568, + "step": 991 + }, + { + "epoch": 0.6711772665764547, + "grad_norm": 1.2525351435097483, + "learning_rate": 9.541745440172006e-06, + "loss": 0.5121, + "step": 992 + }, + { + "epoch": 0.6718538565629228, + "grad_norm": 1.2936482783093453, + "learning_rate": 9.540097598669135e-06, + "loss": 0.5144, + "step": 993 + }, + { + "epoch": 0.6725304465493911, + "grad_norm": 1.36600537631378, + "learning_rate": 9.538446942550468e-06, + "loss": 0.5528, + "step": 994 + }, + { + "epoch": 0.6732070365358592, + "grad_norm": 1.2729433973145832, + "learning_rate": 9.536793472839325e-06, + "loss": 0.5239, + "step": 995 + }, + { + "epoch": 0.6738836265223275, + "grad_norm": 1.327473038211668, + "learning_rate": 9.535137190560765e-06, + "loss": 0.5554, + "step": 996 + }, + { + "epoch": 0.6745602165087957, + "grad_norm": 1.3515073959577413, + "learning_rate": 9.533478096741597e-06, + "loss": 0.5361, + "step": 997 + }, + { + "epoch": 0.6752368064952639, + "grad_norm": 1.2826188441164121, + "learning_rate": 9.531816192410366e-06, + "loss": 0.5459, + "step": 998 + }, + { + "epoch": 0.6759133964817321, + "grad_norm": 1.300107320008121, + "learning_rate": 9.530151478597366e-06, + "loss": 0.5608, + "step": 999 + }, + { + "epoch": 0.6765899864682002, + "grad_norm": 1.3356839606812958, + "learning_rate": 9.528483956334628e-06, + "loss": 0.5147, + "step": 1000 + }, + { + "epoch": 0.6772665764546685, + "grad_norm": 1.3120279365134704, + "learning_rate": 9.526813626655929e-06, + "loss": 0.536, + "step": 1001 + }, + { + "epoch": 0.6779431664411367, + "grad_norm": 1.3297459079562277, + "learning_rate": 9.525140490596778e-06, + "loss": 0.5389, + "step": 1002 + }, + { + "epoch": 0.6786197564276049, + "grad_norm": 1.263353060560232, + "learning_rate": 9.523464549194434e-06, + "loss": 0.5222, + "step": 1003 + }, + { + "epoch": 0.6792963464140731, + "grad_norm": 1.3766618870033855, + "learning_rate": 9.521785803487888e-06, + "loss": 0.527, + "step": 1004 + }, + { + "epoch": 0.6799729364005412, + "grad_norm": 1.3777397427193374, + "learning_rate": 9.520104254517873e-06, + "loss": 0.5463, + "step": 1005 + }, + { + "epoch": 0.6806495263870095, + "grad_norm": 1.2408275412363299, + "learning_rate": 9.518419903326859e-06, + "loss": 0.5164, + "step": 1006 + }, + { + "epoch": 0.6813261163734776, + "grad_norm": 1.3468025610994963, + "learning_rate": 9.51673275095905e-06, + "loss": 0.5432, + "step": 1007 + }, + { + "epoch": 0.6820027063599459, + "grad_norm": 1.363867425812368, + "learning_rate": 9.515042798460393e-06, + "loss": 0.5657, + "step": 1008 + }, + { + "epoch": 0.6826792963464141, + "grad_norm": 1.2738870118029653, + "learning_rate": 9.513350046878565e-06, + "loss": 0.5526, + "step": 1009 + }, + { + "epoch": 0.6833558863328822, + "grad_norm": 1.3541456471191247, + "learning_rate": 9.511654497262984e-06, + "loss": 0.5484, + "step": 1010 + }, + { + "epoch": 0.6840324763193505, + "grad_norm": 1.2885587079894851, + "learning_rate": 9.509956150664796e-06, + "loss": 0.5126, + "step": 1011 + }, + { + "epoch": 0.6847090663058186, + "grad_norm": 1.3438732729106644, + "learning_rate": 9.508255008136885e-06, + "loss": 0.5218, + "step": 1012 + }, + { + "epoch": 0.6853856562922869, + "grad_norm": 1.3795448163943937, + "learning_rate": 9.506551070733869e-06, + "loss": 0.5489, + "step": 1013 + }, + { + "epoch": 0.6860622462787551, + "grad_norm": 1.285620830621772, + "learning_rate": 9.504844339512096e-06, + "loss": 0.5345, + "step": 1014 + }, + { + "epoch": 0.6867388362652233, + "grad_norm": 1.2756329996389932, + "learning_rate": 9.50313481552965e-06, + "loss": 0.5126, + "step": 1015 + }, + { + "epoch": 0.6874154262516915, + "grad_norm": 1.2432220267937537, + "learning_rate": 9.501422499846338e-06, + "loss": 0.524, + "step": 1016 + }, + { + "epoch": 0.6880920162381596, + "grad_norm": 1.257677476756879, + "learning_rate": 9.49970739352371e-06, + "loss": 0.534, + "step": 1017 + }, + { + "epoch": 0.6887686062246279, + "grad_norm": 1.3273447461283017, + "learning_rate": 9.497989497625036e-06, + "loss": 0.5406, + "step": 1018 + }, + { + "epoch": 0.6894451962110961, + "grad_norm": 1.260547433668687, + "learning_rate": 9.49626881321532e-06, + "loss": 0.5173, + "step": 1019 + }, + { + "epoch": 0.6901217861975643, + "grad_norm": 1.2886542014829554, + "learning_rate": 9.494545341361291e-06, + "loss": 0.5143, + "step": 1020 + }, + { + "epoch": 0.6907983761840325, + "grad_norm": 1.343253740959597, + "learning_rate": 9.492819083131412e-06, + "loss": 0.5618, + "step": 1021 + }, + { + "epoch": 0.6914749661705006, + "grad_norm": 1.2555010974446998, + "learning_rate": 9.491090039595869e-06, + "loss": 0.5319, + "step": 1022 + }, + { + "epoch": 0.6921515561569689, + "grad_norm": 1.319505203780607, + "learning_rate": 9.489358211826577e-06, + "loss": 0.5375, + "step": 1023 + }, + { + "epoch": 0.6928281461434371, + "grad_norm": 1.1996956522628355, + "learning_rate": 9.487623600897172e-06, + "loss": 0.4947, + "step": 1024 + }, + { + "epoch": 0.6935047361299053, + "grad_norm": 1.3127047147597104, + "learning_rate": 9.485886207883022e-06, + "loss": 0.528, + "step": 1025 + }, + { + "epoch": 0.6941813261163735, + "grad_norm": 1.3191073278457845, + "learning_rate": 9.484146033861216e-06, + "loss": 0.5259, + "step": 1026 + }, + { + "epoch": 0.6948579161028416, + "grad_norm": 1.254310556929913, + "learning_rate": 9.482403079910571e-06, + "loss": 0.5119, + "step": 1027 + }, + { + "epoch": 0.6955345060893099, + "grad_norm": 1.2337606715988945, + "learning_rate": 9.480657347111621e-06, + "loss": 0.503, + "step": 1028 + }, + { + "epoch": 0.696211096075778, + "grad_norm": 1.3540905295740424, + "learning_rate": 9.478908836546629e-06, + "loss": 0.5292, + "step": 1029 + }, + { + "epoch": 0.6968876860622463, + "grad_norm": 1.4171859855033513, + "learning_rate": 9.477157549299574e-06, + "loss": 0.5524, + "step": 1030 + }, + { + "epoch": 0.6975642760487145, + "grad_norm": 1.393922847605659, + "learning_rate": 9.475403486456162e-06, + "loss": 0.5356, + "step": 1031 + }, + { + "epoch": 0.6982408660351827, + "grad_norm": 1.2893129326178892, + "learning_rate": 9.473646649103819e-06, + "loss": 0.5303, + "step": 1032 + }, + { + "epoch": 0.6989174560216509, + "grad_norm": 1.395325205307781, + "learning_rate": 9.471887038331686e-06, + "loss": 0.5336, + "step": 1033 + }, + { + "epoch": 0.699594046008119, + "grad_norm": 1.2208612224676296, + "learning_rate": 9.470124655230627e-06, + "loss": 0.5163, + "step": 1034 + }, + { + "epoch": 0.7002706359945873, + "grad_norm": 1.2460030453002091, + "learning_rate": 9.468359500893227e-06, + "loss": 0.5118, + "step": 1035 + }, + { + "epoch": 0.7009472259810555, + "grad_norm": 1.3306247260952895, + "learning_rate": 9.466591576413785e-06, + "loss": 0.5362, + "step": 1036 + }, + { + "epoch": 0.7016238159675237, + "grad_norm": 1.2585485402143146, + "learning_rate": 9.464820882888319e-06, + "loss": 0.5258, + "step": 1037 + }, + { + "epoch": 0.7023004059539919, + "grad_norm": 1.288794719662461, + "learning_rate": 9.463047421414564e-06, + "loss": 0.5386, + "step": 1038 + }, + { + "epoch": 0.70297699594046, + "grad_norm": 1.2475012289505003, + "learning_rate": 9.461271193091971e-06, + "loss": 0.5256, + "step": 1039 + }, + { + "epoch": 0.7036535859269283, + "grad_norm": 1.2087947388839142, + "learning_rate": 9.459492199021705e-06, + "loss": 0.5451, + "step": 1040 + }, + { + "epoch": 0.7043301759133965, + "grad_norm": 1.3104079751387059, + "learning_rate": 9.457710440306645e-06, + "loss": 0.5265, + "step": 1041 + }, + { + "epoch": 0.7050067658998647, + "grad_norm": 1.3380393908643815, + "learning_rate": 9.455925918051388e-06, + "loss": 0.5369, + "step": 1042 + }, + { + "epoch": 0.7056833558863329, + "grad_norm": 1.2511546699253946, + "learning_rate": 9.454138633362241e-06, + "loss": 0.5297, + "step": 1043 + }, + { + "epoch": 0.706359945872801, + "grad_norm": 1.324148288096631, + "learning_rate": 9.452348587347224e-06, + "loss": 0.5317, + "step": 1044 + }, + { + "epoch": 0.7070365358592693, + "grad_norm": 1.2494905498440645, + "learning_rate": 9.450555781116068e-06, + "loss": 0.5247, + "step": 1045 + }, + { + "epoch": 0.7077131258457375, + "grad_norm": 1.2657195358378996, + "learning_rate": 9.448760215780218e-06, + "loss": 0.5301, + "step": 1046 + }, + { + "epoch": 0.7083897158322057, + "grad_norm": 1.252536940258962, + "learning_rate": 9.446961892452824e-06, + "loss": 0.517, + "step": 1047 + }, + { + "epoch": 0.7090663058186739, + "grad_norm": 1.2036919668954769, + "learning_rate": 9.445160812248754e-06, + "loss": 0.4953, + "step": 1048 + }, + { + "epoch": 0.709742895805142, + "grad_norm": 1.1974599304320643, + "learning_rate": 9.44335697628458e-06, + "loss": 0.5222, + "step": 1049 + }, + { + "epoch": 0.7104194857916103, + "grad_norm": 1.2640687097542616, + "learning_rate": 9.44155038567858e-06, + "loss": 0.5276, + "step": 1050 + }, + { + "epoch": 0.7110960757780784, + "grad_norm": 1.26580964036148, + "learning_rate": 9.439741041550745e-06, + "loss": 0.5436, + "step": 1051 + }, + { + "epoch": 0.7117726657645467, + "grad_norm": 1.3520832943905736, + "learning_rate": 9.437928945022772e-06, + "loss": 0.5569, + "step": 1052 + }, + { + "epoch": 0.7124492557510149, + "grad_norm": 1.2471314764984125, + "learning_rate": 9.43611409721806e-06, + "loss": 0.5244, + "step": 1053 + }, + { + "epoch": 0.713125845737483, + "grad_norm": 1.274199489244252, + "learning_rate": 9.434296499261719e-06, + "loss": 0.5553, + "step": 1054 + }, + { + "epoch": 0.7138024357239513, + "grad_norm": 1.2611714641375649, + "learning_rate": 9.432476152280562e-06, + "loss": 0.5495, + "step": 1055 + }, + { + "epoch": 0.7144790257104194, + "grad_norm": 1.2933516856333653, + "learning_rate": 9.430653057403105e-06, + "loss": 0.5214, + "step": 1056 + }, + { + "epoch": 0.7151556156968877, + "grad_norm": 1.2639444832070281, + "learning_rate": 9.428827215759569e-06, + "loss": 0.5409, + "step": 1057 + }, + { + "epoch": 0.7158322056833559, + "grad_norm": 1.2755101715166999, + "learning_rate": 9.426998628481876e-06, + "loss": 0.5045, + "step": 1058 + }, + { + "epoch": 0.7165087956698241, + "grad_norm": 1.2809343276489422, + "learning_rate": 9.425167296703655e-06, + "loss": 0.5021, + "step": 1059 + }, + { + "epoch": 0.7171853856562923, + "grad_norm": 1.270706769695537, + "learning_rate": 9.42333322156023e-06, + "loss": 0.5096, + "step": 1060 + }, + { + "epoch": 0.7178619756427604, + "grad_norm": 1.2258189002444924, + "learning_rate": 9.42149640418863e-06, + "loss": 0.5203, + "step": 1061 + }, + { + "epoch": 0.7185385656292287, + "grad_norm": 1.2091660343486292, + "learning_rate": 9.419656845727582e-06, + "loss": 0.5109, + "step": 1062 + }, + { + "epoch": 0.7192151556156969, + "grad_norm": 1.2393047302711406, + "learning_rate": 9.417814547317513e-06, + "loss": 0.528, + "step": 1063 + }, + { + "epoch": 0.7198917456021651, + "grad_norm": 1.3022882402386513, + "learning_rate": 9.415969510100549e-06, + "loss": 0.5511, + "step": 1064 + }, + { + "epoch": 0.7205683355886333, + "grad_norm": 1.2218640795866045, + "learning_rate": 9.414121735220513e-06, + "loss": 0.5154, + "step": 1065 + }, + { + "epoch": 0.7212449255751014, + "grad_norm": 1.2191141863190391, + "learning_rate": 9.412271223822929e-06, + "loss": 0.5215, + "step": 1066 + }, + { + "epoch": 0.7219215155615697, + "grad_norm": 1.2037212415605165, + "learning_rate": 9.41041797705501e-06, + "loss": 0.5179, + "step": 1067 + }, + { + "epoch": 0.7225981055480379, + "grad_norm": 1.289576805052528, + "learning_rate": 9.408561996065672e-06, + "loss": 0.5528, + "step": 1068 + }, + { + "epoch": 0.7232746955345061, + "grad_norm": 1.3073734499243528, + "learning_rate": 9.406703282005523e-06, + "loss": 0.529, + "step": 1069 + }, + { + "epoch": 0.7239512855209743, + "grad_norm": 1.3050448266339914, + "learning_rate": 9.404841836026863e-06, + "loss": 0.5329, + "step": 1070 + }, + { + "epoch": 0.7246278755074425, + "grad_norm": 1.2290680927440425, + "learning_rate": 9.40297765928369e-06, + "loss": 0.5367, + "step": 1071 + }, + { + "epoch": 0.7253044654939107, + "grad_norm": 1.2240668245069777, + "learning_rate": 9.401110752931694e-06, + "loss": 0.531, + "step": 1072 + }, + { + "epoch": 0.725981055480379, + "grad_norm": 1.3473562899592888, + "learning_rate": 9.399241118128255e-06, + "loss": 0.531, + "step": 1073 + }, + { + "epoch": 0.7266576454668471, + "grad_norm": 1.2581370172967685, + "learning_rate": 9.397368756032445e-06, + "loss": 0.5321, + "step": 1074 + }, + { + "epoch": 0.7273342354533153, + "grad_norm": 1.2573947330764252, + "learning_rate": 9.395493667805032e-06, + "loss": 0.5193, + "step": 1075 + }, + { + "epoch": 0.7280108254397835, + "grad_norm": 1.2241008215655933, + "learning_rate": 9.393615854608461e-06, + "loss": 0.5122, + "step": 1076 + }, + { + "epoch": 0.7286874154262517, + "grad_norm": 1.1777671303740376, + "learning_rate": 9.391735317606885e-06, + "loss": 0.5104, + "step": 1077 + }, + { + "epoch": 0.7293640054127198, + "grad_norm": 1.197878175202554, + "learning_rate": 9.389852057966129e-06, + "loss": 0.5157, + "step": 1078 + }, + { + "epoch": 0.7300405953991881, + "grad_norm": 1.237944117266921, + "learning_rate": 9.387966076853714e-06, + "loss": 0.5188, + "step": 1079 + }, + { + "epoch": 0.7307171853856563, + "grad_norm": 1.2016964462933304, + "learning_rate": 9.386077375438848e-06, + "loss": 0.5008, + "step": 1080 + }, + { + "epoch": 0.7313937753721245, + "grad_norm": 1.3030730797360548, + "learning_rate": 9.384185954892423e-06, + "loss": 0.5141, + "step": 1081 + }, + { + "epoch": 0.7320703653585927, + "grad_norm": 1.204426766247516, + "learning_rate": 9.382291816387018e-06, + "loss": 0.5177, + "step": 1082 + }, + { + "epoch": 0.7327469553450608, + "grad_norm": 1.239964128818739, + "learning_rate": 9.380394961096895e-06, + "loss": 0.5486, + "step": 1083 + }, + { + "epoch": 0.7334235453315291, + "grad_norm": 1.2519067594437676, + "learning_rate": 9.378495390198005e-06, + "loss": 0.5271, + "step": 1084 + }, + { + "epoch": 0.7341001353179973, + "grad_norm": 1.2018233212299891, + "learning_rate": 9.376593104867976e-06, + "loss": 0.5207, + "step": 1085 + }, + { + "epoch": 0.7347767253044655, + "grad_norm": 1.1511603822022256, + "learning_rate": 9.374688106286127e-06, + "loss": 0.4949, + "step": 1086 + }, + { + "epoch": 0.7354533152909337, + "grad_norm": 1.2957006130750042, + "learning_rate": 9.372780395633451e-06, + "loss": 0.5345, + "step": 1087 + }, + { + "epoch": 0.7361299052774019, + "grad_norm": 1.2723770737474842, + "learning_rate": 9.370869974092628e-06, + "loss": 0.5281, + "step": 1088 + }, + { + "epoch": 0.7368064952638701, + "grad_norm": 1.241815277191254, + "learning_rate": 9.368956842848014e-06, + "loss": 0.543, + "step": 1089 + }, + { + "epoch": 0.7374830852503383, + "grad_norm": 1.2032263815000845, + "learning_rate": 9.36704100308565e-06, + "loss": 0.492, + "step": 1090 + }, + { + "epoch": 0.7381596752368065, + "grad_norm": 1.19514936869428, + "learning_rate": 9.36512245599325e-06, + "loss": 0.5199, + "step": 1091 + }, + { + "epoch": 0.7388362652232747, + "grad_norm": 1.2230982484528294, + "learning_rate": 9.363201202760212e-06, + "loss": 0.5248, + "step": 1092 + }, + { + "epoch": 0.7395128552097429, + "grad_norm": 1.2752199312029666, + "learning_rate": 9.36127724457761e-06, + "loss": 0.523, + "step": 1093 + }, + { + "epoch": 0.7401894451962111, + "grad_norm": 1.21153928005643, + "learning_rate": 9.359350582638193e-06, + "loss": 0.515, + "step": 1094 + }, + { + "epoch": 0.7408660351826793, + "grad_norm": 1.2424880078858398, + "learning_rate": 9.357421218136387e-06, + "loss": 0.5287, + "step": 1095 + }, + { + "epoch": 0.7415426251691475, + "grad_norm": 1.283084430351766, + "learning_rate": 9.355489152268296e-06, + "loss": 0.5345, + "step": 1096 + }, + { + "epoch": 0.7422192151556157, + "grad_norm": 1.16681955496085, + "learning_rate": 9.353554386231697e-06, + "loss": 0.4996, + "step": 1097 + }, + { + "epoch": 0.7428958051420839, + "grad_norm": 1.249942106329068, + "learning_rate": 9.351616921226036e-06, + "loss": 0.5522, + "step": 1098 + }, + { + "epoch": 0.7435723951285521, + "grad_norm": 1.245397417814634, + "learning_rate": 9.349676758452441e-06, + "loss": 0.5419, + "step": 1099 + }, + { + "epoch": 0.7442489851150202, + "grad_norm": 1.2050839062440089, + "learning_rate": 9.347733899113709e-06, + "loss": 0.5016, + "step": 1100 + }, + { + "epoch": 0.7449255751014885, + "grad_norm": 1.2192149937877752, + "learning_rate": 9.345788344414306e-06, + "loss": 0.523, + "step": 1101 + }, + { + "epoch": 0.7456021650879567, + "grad_norm": 1.1860068074567807, + "learning_rate": 9.343840095560373e-06, + "loss": 0.5115, + "step": 1102 + }, + { + "epoch": 0.7462787550744249, + "grad_norm": 1.2506071981429465, + "learning_rate": 9.341889153759715e-06, + "loss": 0.5048, + "step": 1103 + }, + { + "epoch": 0.7469553450608931, + "grad_norm": 1.2416686227007272, + "learning_rate": 9.339935520221816e-06, + "loss": 0.5216, + "step": 1104 + }, + { + "epoch": 0.7476319350473613, + "grad_norm": 1.1918928798340078, + "learning_rate": 9.33797919615782e-06, + "loss": 0.5276, + "step": 1105 + }, + { + "epoch": 0.7483085250338295, + "grad_norm": 1.2608880099009294, + "learning_rate": 9.336020182780545e-06, + "loss": 0.5002, + "step": 1106 + }, + { + "epoch": 0.7489851150202977, + "grad_norm": 1.220186200328264, + "learning_rate": 9.33405848130447e-06, + "loss": 0.5308, + "step": 1107 + }, + { + "epoch": 0.7496617050067659, + "grad_norm": 1.2292827308408865, + "learning_rate": 9.332094092945749e-06, + "loss": 0.501, + "step": 1108 + }, + { + "epoch": 0.7503382949932341, + "grad_norm": 1.1640792670480113, + "learning_rate": 9.330127018922195e-06, + "loss": 0.4993, + "step": 1109 + }, + { + "epoch": 0.7510148849797023, + "grad_norm": 1.1442652537974987, + "learning_rate": 9.328157260453286e-06, + "loss": 0.5014, + "step": 1110 + }, + { + "epoch": 0.7516914749661705, + "grad_norm": 1.2270186722583278, + "learning_rate": 9.326184818760167e-06, + "loss": 0.5132, + "step": 1111 + }, + { + "epoch": 0.7523680649526387, + "grad_norm": 1.1923161453599922, + "learning_rate": 9.324209695065644e-06, + "loss": 0.5212, + "step": 1112 + }, + { + "epoch": 0.7530446549391069, + "grad_norm": 1.2391194563334127, + "learning_rate": 9.322231890594193e-06, + "loss": 0.5451, + "step": 1113 + }, + { + "epoch": 0.7537212449255751, + "grad_norm": 1.2344078457049639, + "learning_rate": 9.32025140657194e-06, + "loss": 0.5251, + "step": 1114 + }, + { + "epoch": 0.7543978349120433, + "grad_norm": 1.2519285005335228, + "learning_rate": 9.318268244226681e-06, + "loss": 0.5056, + "step": 1115 + }, + { + "epoch": 0.7550744248985115, + "grad_norm": 1.242774477788238, + "learning_rate": 9.31628240478787e-06, + "loss": 0.5277, + "step": 1116 + }, + { + "epoch": 0.7557510148849798, + "grad_norm": 1.282012041641155, + "learning_rate": 9.31429388948662e-06, + "loss": 0.5167, + "step": 1117 + }, + { + "epoch": 0.7564276048714479, + "grad_norm": 1.184970210583831, + "learning_rate": 9.312302699555701e-06, + "loss": 0.5201, + "step": 1118 + }, + { + "epoch": 0.7571041948579161, + "grad_norm": 1.219400714159224, + "learning_rate": 9.310308836229548e-06, + "loss": 0.4936, + "step": 1119 + }, + { + "epoch": 0.7577807848443843, + "grad_norm": 1.1382113207645481, + "learning_rate": 9.308312300744247e-06, + "loss": 0.4951, + "step": 1120 + }, + { + "epoch": 0.7584573748308525, + "grad_norm": 1.167666202797914, + "learning_rate": 9.306313094337539e-06, + "loss": 0.4867, + "step": 1121 + }, + { + "epoch": 0.7591339648173207, + "grad_norm": 1.2387704994315876, + "learning_rate": 9.304311218248828e-06, + "loss": 0.4901, + "step": 1122 + }, + { + "epoch": 0.7598105548037889, + "grad_norm": 1.1316940775149187, + "learning_rate": 9.30230667371917e-06, + "loss": 0.4774, + "step": 1123 + }, + { + "epoch": 0.7604871447902571, + "grad_norm": 1.2058566340142174, + "learning_rate": 9.30029946199127e-06, + "loss": 0.52, + "step": 1124 + }, + { + "epoch": 0.7611637347767253, + "grad_norm": 1.243961618423197, + "learning_rate": 9.298289584309496e-06, + "loss": 0.5346, + "step": 1125 + }, + { + "epoch": 0.7618403247631935, + "grad_norm": 1.193442295510897, + "learning_rate": 9.29627704191986e-06, + "loss": 0.5065, + "step": 1126 + }, + { + "epoch": 0.7625169147496617, + "grad_norm": 1.2407973475588872, + "learning_rate": 9.294261836070034e-06, + "loss": 0.524, + "step": 1127 + }, + { + "epoch": 0.7631935047361299, + "grad_norm": 1.195632349448027, + "learning_rate": 9.292243968009332e-06, + "loss": 0.4739, + "step": 1128 + }, + { + "epoch": 0.7638700947225981, + "grad_norm": 1.1911527376522375, + "learning_rate": 9.290223438988726e-06, + "loss": 0.5221, + "step": 1129 + }, + { + "epoch": 0.7645466847090663, + "grad_norm": 1.129506736804369, + "learning_rate": 9.288200250260836e-06, + "loss": 0.4977, + "step": 1130 + }, + { + "epoch": 0.7652232746955345, + "grad_norm": 1.3476559038046902, + "learning_rate": 9.286174403079928e-06, + "loss": 0.5335, + "step": 1131 + }, + { + "epoch": 0.7658998646820027, + "grad_norm": 1.2656781723573305, + "learning_rate": 9.284145898701921e-06, + "loss": 0.5298, + "step": 1132 + }, + { + "epoch": 0.7665764546684709, + "grad_norm": 1.2362237441645285, + "learning_rate": 9.282114738384375e-06, + "loss": 0.5098, + "step": 1133 + }, + { + "epoch": 0.7672530446549392, + "grad_norm": 1.2091333429953386, + "learning_rate": 9.280080923386501e-06, + "loss": 0.5041, + "step": 1134 + }, + { + "epoch": 0.7679296346414073, + "grad_norm": 1.1647454812252227, + "learning_rate": 9.278044454969157e-06, + "loss": 0.4838, + "step": 1135 + }, + { + "epoch": 0.7686062246278755, + "grad_norm": 1.2327622765574568, + "learning_rate": 9.27600533439484e-06, + "loss": 0.4997, + "step": 1136 + }, + { + "epoch": 0.7692828146143437, + "grad_norm": 1.205122408855429, + "learning_rate": 9.273963562927695e-06, + "loss": 0.5062, + "step": 1137 + }, + { + "epoch": 0.7699594046008119, + "grad_norm": 1.2455164972733845, + "learning_rate": 9.271919141833514e-06, + "loss": 0.5077, + "step": 1138 + }, + { + "epoch": 0.7706359945872802, + "grad_norm": 1.2212308301938604, + "learning_rate": 9.269872072379725e-06, + "loss": 0.4984, + "step": 1139 + }, + { + "epoch": 0.7713125845737483, + "grad_norm": 1.2614884328893134, + "learning_rate": 9.267822355835402e-06, + "loss": 0.5473, + "step": 1140 + }, + { + "epoch": 0.7719891745602165, + "grad_norm": 1.245191993873177, + "learning_rate": 9.265769993471258e-06, + "loss": 0.5277, + "step": 1141 + }, + { + "epoch": 0.7726657645466847, + "grad_norm": 1.184237449224601, + "learning_rate": 9.263714986559647e-06, + "loss": 0.5228, + "step": 1142 + }, + { + "epoch": 0.7733423545331529, + "grad_norm": 1.2988147104890695, + "learning_rate": 9.261657336374561e-06, + "loss": 0.5164, + "step": 1143 + }, + { + "epoch": 0.774018944519621, + "grad_norm": 1.258595652871956, + "learning_rate": 9.259597044191635e-06, + "loss": 0.5378, + "step": 1144 + }, + { + "epoch": 0.7746955345060893, + "grad_norm": 1.1255557115022607, + "learning_rate": 9.25753411128814e-06, + "loss": 0.4567, + "step": 1145 + }, + { + "epoch": 0.7753721244925575, + "grad_norm": 1.2610790509117351, + "learning_rate": 9.25546853894298e-06, + "loss": 0.5242, + "step": 1146 + }, + { + "epoch": 0.7760487144790257, + "grad_norm": 1.2589644527665236, + "learning_rate": 9.253400328436699e-06, + "loss": 0.5219, + "step": 1147 + }, + { + "epoch": 0.7767253044654939, + "grad_norm": 1.184495299442687, + "learning_rate": 9.251329481051476e-06, + "loss": 0.5092, + "step": 1148 + }, + { + "epoch": 0.7774018944519621, + "grad_norm": 1.1782965516351767, + "learning_rate": 9.249255998071127e-06, + "loss": 0.5006, + "step": 1149 + }, + { + "epoch": 0.7780784844384303, + "grad_norm": 1.1678224212638866, + "learning_rate": 9.247179880781099e-06, + "loss": 0.5077, + "step": 1150 + }, + { + "epoch": 0.7787550744248986, + "grad_norm": 1.172336312307818, + "learning_rate": 9.24510113046847e-06, + "loss": 0.4821, + "step": 1151 + }, + { + "epoch": 0.7794316644113667, + "grad_norm": 1.2326813343485412, + "learning_rate": 9.243019748421956e-06, + "loss": 0.5238, + "step": 1152 + }, + { + "epoch": 0.7801082543978349, + "grad_norm": 1.253336584382025, + "learning_rate": 9.2409357359319e-06, + "loss": 0.5201, + "step": 1153 + }, + { + "epoch": 0.7807848443843031, + "grad_norm": 1.2715452410014232, + "learning_rate": 9.238849094290279e-06, + "loss": 0.5173, + "step": 1154 + }, + { + "epoch": 0.7814614343707713, + "grad_norm": 1.2043169935061349, + "learning_rate": 9.236759824790698e-06, + "loss": 0.4971, + "step": 1155 + }, + { + "epoch": 0.7821380243572396, + "grad_norm": 1.265740963751011, + "learning_rate": 9.234667928728392e-06, + "loss": 0.5015, + "step": 1156 + }, + { + "epoch": 0.7828146143437077, + "grad_norm": 1.1540576910730072, + "learning_rate": 9.23257340740022e-06, + "loss": 0.5, + "step": 1157 + }, + { + "epoch": 0.7834912043301759, + "grad_norm": 1.2321409258359763, + "learning_rate": 9.230476262104678e-06, + "loss": 0.5327, + "step": 1158 + }, + { + "epoch": 0.7841677943166441, + "grad_norm": 1.2198752858863542, + "learning_rate": 9.22837649414188e-06, + "loss": 0.5162, + "step": 1159 + }, + { + "epoch": 0.7848443843031123, + "grad_norm": 1.2086741821442035, + "learning_rate": 9.226274104813567e-06, + "loss": 0.5266, + "step": 1160 + }, + { + "epoch": 0.7855209742895806, + "grad_norm": 1.2269901761735669, + "learning_rate": 9.22416909542311e-06, + "loss": 0.5064, + "step": 1161 + }, + { + "epoch": 0.7861975642760487, + "grad_norm": 1.1272107653531314, + "learning_rate": 9.222061467275503e-06, + "loss": 0.5056, + "step": 1162 + }, + { + "epoch": 0.786874154262517, + "grad_norm": 1.1682273139005261, + "learning_rate": 9.219951221677356e-06, + "loss": 0.5057, + "step": 1163 + }, + { + "epoch": 0.7875507442489851, + "grad_norm": 1.2047454363953354, + "learning_rate": 9.217838359936914e-06, + "loss": 0.4973, + "step": 1164 + }, + { + "epoch": 0.7882273342354533, + "grad_norm": 1.1996132345158188, + "learning_rate": 9.215722883364033e-06, + "loss": 0.5061, + "step": 1165 + }, + { + "epoch": 0.7889039242219216, + "grad_norm": 1.2175204839574072, + "learning_rate": 9.213604793270196e-06, + "loss": 0.5087, + "step": 1166 + }, + { + "epoch": 0.7895805142083897, + "grad_norm": 1.163682009112604, + "learning_rate": 9.211484090968505e-06, + "loss": 0.496, + "step": 1167 + }, + { + "epoch": 0.790257104194858, + "grad_norm": 1.1551956368039216, + "learning_rate": 9.20936077777368e-06, + "loss": 0.5006, + "step": 1168 + }, + { + "epoch": 0.7909336941813261, + "grad_norm": 1.286480056932424, + "learning_rate": 9.207234855002062e-06, + "loss": 0.5427, + "step": 1169 + }, + { + "epoch": 0.7916102841677943, + "grad_norm": 1.2416731588400851, + "learning_rate": 9.205106323971607e-06, + "loss": 0.5128, + "step": 1170 + }, + { + "epoch": 0.7922868741542625, + "grad_norm": 1.2035277303836733, + "learning_rate": 9.202975186001892e-06, + "loss": 0.5028, + "step": 1171 + }, + { + "epoch": 0.7929634641407307, + "grad_norm": 1.172801805095853, + "learning_rate": 9.200841442414106e-06, + "loss": 0.5045, + "step": 1172 + }, + { + "epoch": 0.793640054127199, + "grad_norm": 1.2246920957554206, + "learning_rate": 9.198705094531053e-06, + "loss": 0.5215, + "step": 1173 + }, + { + "epoch": 0.7943166441136671, + "grad_norm": 1.2210194718468, + "learning_rate": 9.196566143677157e-06, + "loss": 0.5161, + "step": 1174 + }, + { + "epoch": 0.7949932341001353, + "grad_norm": 1.2207221685901417, + "learning_rate": 9.19442459117845e-06, + "loss": 0.5358, + "step": 1175 + }, + { + "epoch": 0.7956698240866035, + "grad_norm": 1.1836268293335501, + "learning_rate": 9.192280438362581e-06, + "loss": 0.4876, + "step": 1176 + }, + { + "epoch": 0.7963464140730717, + "grad_norm": 1.166241983031272, + "learning_rate": 9.190133686558809e-06, + "loss": 0.4922, + "step": 1177 + }, + { + "epoch": 0.79702300405954, + "grad_norm": 1.1556650614887145, + "learning_rate": 9.187984337098002e-06, + "loss": 0.5073, + "step": 1178 + }, + { + "epoch": 0.7976995940460081, + "grad_norm": 1.1921213306616103, + "learning_rate": 9.185832391312644e-06, + "loss": 0.5091, + "step": 1179 + }, + { + "epoch": 0.7983761840324763, + "grad_norm": 1.214325909552027, + "learning_rate": 9.183677850536823e-06, + "loss": 0.5153, + "step": 1180 + }, + { + "epoch": 0.7990527740189445, + "grad_norm": 1.1979689868096728, + "learning_rate": 9.181520716106238e-06, + "loss": 0.5019, + "step": 1181 + }, + { + "epoch": 0.7997293640054127, + "grad_norm": 1.1835623771156139, + "learning_rate": 9.179360989358199e-06, + "loss": 0.5011, + "step": 1182 + }, + { + "epoch": 0.800405953991881, + "grad_norm": 1.1896726896387138, + "learning_rate": 9.177198671631616e-06, + "loss": 0.5043, + "step": 1183 + }, + { + "epoch": 0.8010825439783491, + "grad_norm": 1.1453404765067188, + "learning_rate": 9.175033764267013e-06, + "loss": 0.4804, + "step": 1184 + }, + { + "epoch": 0.8017591339648173, + "grad_norm": 1.1653256478294909, + "learning_rate": 9.172866268606514e-06, + "loss": 0.5065, + "step": 1185 + }, + { + "epoch": 0.8024357239512855, + "grad_norm": 1.1459358079448239, + "learning_rate": 9.17069618599385e-06, + "loss": 0.5052, + "step": 1186 + }, + { + "epoch": 0.8031123139377537, + "grad_norm": 1.141288346093606, + "learning_rate": 9.168523517774356e-06, + "loss": 0.5004, + "step": 1187 + }, + { + "epoch": 0.803788903924222, + "grad_norm": 1.2081823619592962, + "learning_rate": 9.166348265294968e-06, + "loss": 0.5044, + "step": 1188 + }, + { + "epoch": 0.8044654939106901, + "grad_norm": 1.161845597498268, + "learning_rate": 9.164170429904224e-06, + "loss": 0.5178, + "step": 1189 + }, + { + "epoch": 0.8051420838971584, + "grad_norm": 1.1575450572593282, + "learning_rate": 9.16199001295227e-06, + "loss": 0.5085, + "step": 1190 + }, + { + "epoch": 0.8058186738836265, + "grad_norm": 1.201488524435077, + "learning_rate": 9.15980701579084e-06, + "loss": 0.5055, + "step": 1191 + }, + { + "epoch": 0.8064952638700947, + "grad_norm": 1.1885579926096035, + "learning_rate": 9.157621439773278e-06, + "loss": 0.4957, + "step": 1192 + }, + { + "epoch": 0.8071718538565629, + "grad_norm": 1.121439214618471, + "learning_rate": 9.155433286254524e-06, + "loss": 0.4931, + "step": 1193 + }, + { + "epoch": 0.8078484438430311, + "grad_norm": 1.2044542210279365, + "learning_rate": 9.153242556591115e-06, + "loss": 0.504, + "step": 1194 + }, + { + "epoch": 0.8085250338294994, + "grad_norm": 1.1627187962465653, + "learning_rate": 9.151049252141185e-06, + "loss": 0.4976, + "step": 1195 + }, + { + "epoch": 0.8092016238159675, + "grad_norm": 1.136773615776934, + "learning_rate": 9.148853374264463e-06, + "loss": 0.4992, + "step": 1196 + }, + { + "epoch": 0.8098782138024357, + "grad_norm": 1.154489603413787, + "learning_rate": 9.146654924322277e-06, + "loss": 0.5052, + "step": 1197 + }, + { + "epoch": 0.8105548037889039, + "grad_norm": 1.158757882340262, + "learning_rate": 9.144453903677546e-06, + "loss": 0.4983, + "step": 1198 + }, + { + "epoch": 0.8112313937753721, + "grad_norm": 1.170001361279437, + "learning_rate": 9.142250313694785e-06, + "loss": 0.5098, + "step": 1199 + }, + { + "epoch": 0.8119079837618404, + "grad_norm": 1.198423599885229, + "learning_rate": 9.140044155740102e-06, + "loss": 0.4989, + "step": 1200 + }, + { + "epoch": 0.8125845737483085, + "grad_norm": 1.2762899051831884, + "learning_rate": 9.137835431181192e-06, + "loss": 0.5235, + "step": 1201 + }, + { + "epoch": 0.8132611637347767, + "grad_norm": 1.1672888453026706, + "learning_rate": 9.13562414138735e-06, + "loss": 0.4845, + "step": 1202 + }, + { + "epoch": 0.8139377537212449, + "grad_norm": 1.1916703372146515, + "learning_rate": 9.133410287729454e-06, + "loss": 0.5103, + "step": 1203 + }, + { + "epoch": 0.8146143437077131, + "grad_norm": 1.1707987920004868, + "learning_rate": 9.131193871579975e-06, + "loss": 0.5033, + "step": 1204 + }, + { + "epoch": 0.8152909336941814, + "grad_norm": 1.167866774362817, + "learning_rate": 9.12897489431297e-06, + "loss": 0.5102, + "step": 1205 + }, + { + "epoch": 0.8159675236806495, + "grad_norm": 1.1591757897777635, + "learning_rate": 9.126753357304088e-06, + "loss": 0.5005, + "step": 1206 + }, + { + "epoch": 0.8166441136671178, + "grad_norm": 1.134558974635681, + "learning_rate": 9.12452926193056e-06, + "loss": 0.4934, + "step": 1207 + }, + { + "epoch": 0.8173207036535859, + "grad_norm": 1.176088029742004, + "learning_rate": 9.122302609571204e-06, + "loss": 0.5012, + "step": 1208 + }, + { + "epoch": 0.8179972936400541, + "grad_norm": 1.1784767687786055, + "learning_rate": 9.120073401606427e-06, + "loss": 0.5136, + "step": 1209 + }, + { + "epoch": 0.8186738836265224, + "grad_norm": 1.1933785692052794, + "learning_rate": 9.117841639418218e-06, + "loss": 0.506, + "step": 1210 + }, + { + "epoch": 0.8193504736129905, + "grad_norm": 1.1658587414583894, + "learning_rate": 9.115607324390146e-06, + "loss": 0.4713, + "step": 1211 + }, + { + "epoch": 0.8200270635994588, + "grad_norm": 1.2206670194557647, + "learning_rate": 9.11337045790737e-06, + "loss": 0.5055, + "step": 1212 + }, + { + "epoch": 0.8207036535859269, + "grad_norm": 1.1596600395068082, + "learning_rate": 9.111131041356624e-06, + "loss": 0.5046, + "step": 1213 + }, + { + "epoch": 0.8213802435723951, + "grad_norm": 1.2417732230382863, + "learning_rate": 9.108889076126226e-06, + "loss": 0.5111, + "step": 1214 + }, + { + "epoch": 0.8220568335588633, + "grad_norm": 1.1095547390157674, + "learning_rate": 9.106644563606076e-06, + "loss": 0.5052, + "step": 1215 + }, + { + "epoch": 0.8227334235453315, + "grad_norm": 1.1437208716968439, + "learning_rate": 9.104397505187645e-06, + "loss": 0.4926, + "step": 1216 + }, + { + "epoch": 0.8234100135317998, + "grad_norm": 1.1504515072112609, + "learning_rate": 9.102147902263994e-06, + "loss": 0.478, + "step": 1217 + }, + { + "epoch": 0.8240866035182679, + "grad_norm": 1.1443597297700818, + "learning_rate": 9.099895756229754e-06, + "loss": 0.5202, + "step": 1218 + }, + { + "epoch": 0.8247631935047361, + "grad_norm": 1.0841569991671136, + "learning_rate": 9.097641068481133e-06, + "loss": 0.4873, + "step": 1219 + }, + { + "epoch": 0.8254397834912043, + "grad_norm": 1.184769872454869, + "learning_rate": 9.095383840415915e-06, + "loss": 0.5065, + "step": 1220 + }, + { + "epoch": 0.8261163734776725, + "grad_norm": 1.2340019807532674, + "learning_rate": 9.093124073433464e-06, + "loss": 0.5045, + "step": 1221 + }, + { + "epoch": 0.8267929634641408, + "grad_norm": 1.1484478756692174, + "learning_rate": 9.090861768934708e-06, + "loss": 0.4832, + "step": 1222 + }, + { + "epoch": 0.8274695534506089, + "grad_norm": 1.215347927222981, + "learning_rate": 9.088596928322158e-06, + "loss": 0.5044, + "step": 1223 + }, + { + "epoch": 0.8281461434370772, + "grad_norm": 1.2136813200888974, + "learning_rate": 9.08632955299989e-06, + "loss": 0.5075, + "step": 1224 + }, + { + "epoch": 0.8288227334235453, + "grad_norm": 1.21118000221307, + "learning_rate": 9.084059644373558e-06, + "loss": 0.502, + "step": 1225 + }, + { + "epoch": 0.8294993234100135, + "grad_norm": 1.1992669669850768, + "learning_rate": 9.08178720385038e-06, + "loss": 0.4914, + "step": 1226 + }, + { + "epoch": 0.8301759133964818, + "grad_norm": 1.159224784891141, + "learning_rate": 9.07951223283915e-06, + "loss": 0.4941, + "step": 1227 + }, + { + "epoch": 0.8308525033829499, + "grad_norm": 1.152140367381401, + "learning_rate": 9.077234732750223e-06, + "loss": 0.4862, + "step": 1228 + }, + { + "epoch": 0.8315290933694182, + "grad_norm": 1.2178795785849492, + "learning_rate": 9.074954704995532e-06, + "loss": 0.5277, + "step": 1229 + }, + { + "epoch": 0.8322056833558863, + "grad_norm": 1.178803162593111, + "learning_rate": 9.072672150988563e-06, + "loss": 0.5019, + "step": 1230 + }, + { + "epoch": 0.8328822733423545, + "grad_norm": 1.084268178799295, + "learning_rate": 9.070387072144386e-06, + "loss": 0.4917, + "step": 1231 + }, + { + "epoch": 0.8335588633288228, + "grad_norm": 1.2088539096551216, + "learning_rate": 9.06809946987962e-06, + "loss": 0.5342, + "step": 1232 + }, + { + "epoch": 0.8342354533152909, + "grad_norm": 1.1143419493443054, + "learning_rate": 9.065809345612458e-06, + "loss": 0.4885, + "step": 1233 + }, + { + "epoch": 0.8349120433017592, + "grad_norm": 1.1393488661081579, + "learning_rate": 9.06351670076265e-06, + "loss": 0.4705, + "step": 1234 + }, + { + "epoch": 0.8355886332882273, + "grad_norm": 1.150681280569992, + "learning_rate": 9.061221536751517e-06, + "loss": 0.498, + "step": 1235 + }, + { + "epoch": 0.8362652232746955, + "grad_norm": 1.1133905701950872, + "learning_rate": 9.058923855001935e-06, + "loss": 0.4846, + "step": 1236 + }, + { + "epoch": 0.8369418132611637, + "grad_norm": 1.1451489378732804, + "learning_rate": 9.056623656938344e-06, + "loss": 0.4815, + "step": 1237 + }, + { + "epoch": 0.8376184032476319, + "grad_norm": 1.163503758049376, + "learning_rate": 9.05432094398674e-06, + "loss": 0.4907, + "step": 1238 + }, + { + "epoch": 0.8382949932341002, + "grad_norm": 1.231452882592344, + "learning_rate": 9.052015717574683e-06, + "loss": 0.5093, + "step": 1239 + }, + { + "epoch": 0.8389715832205683, + "grad_norm": 1.1863568491997516, + "learning_rate": 9.049707979131288e-06, + "loss": 0.5195, + "step": 1240 + }, + { + "epoch": 0.8396481732070366, + "grad_norm": 1.1659036571243266, + "learning_rate": 9.04739773008723e-06, + "loss": 0.4938, + "step": 1241 + }, + { + "epoch": 0.8403247631935047, + "grad_norm": 1.1394606038594781, + "learning_rate": 9.045084971874738e-06, + "loss": 0.4942, + "step": 1242 + }, + { + "epoch": 0.8410013531799729, + "grad_norm": 1.1688212688465167, + "learning_rate": 9.042769705927597e-06, + "loss": 0.5065, + "step": 1243 + }, + { + "epoch": 0.8416779431664412, + "grad_norm": 1.1400435992590443, + "learning_rate": 9.040451933681148e-06, + "loss": 0.5017, + "step": 1244 + }, + { + "epoch": 0.8423545331529093, + "grad_norm": 1.1393080053253095, + "learning_rate": 9.038131656572284e-06, + "loss": 0.4949, + "step": 1245 + }, + { + "epoch": 0.8430311231393776, + "grad_norm": 1.1392590264755564, + "learning_rate": 9.035808876039451e-06, + "loss": 0.5007, + "step": 1246 + }, + { + "epoch": 0.8437077131258457, + "grad_norm": 1.08930459707302, + "learning_rate": 9.033483593522652e-06, + "loss": 0.4833, + "step": 1247 + }, + { + "epoch": 0.8443843031123139, + "grad_norm": 1.1060110312753122, + "learning_rate": 9.03115581046343e-06, + "loss": 0.4859, + "step": 1248 + }, + { + "epoch": 0.8450608930987822, + "grad_norm": 1.1427215978050838, + "learning_rate": 9.028825528304892e-06, + "loss": 0.4952, + "step": 1249 + }, + { + "epoch": 0.8457374830852503, + "grad_norm": 1.1074913254567567, + "learning_rate": 9.026492748491683e-06, + "loss": 0.4836, + "step": 1250 + }, + { + "epoch": 0.8464140730717186, + "grad_norm": 1.1303369847573572, + "learning_rate": 9.02415747247e-06, + "loss": 0.4955, + "step": 1251 + }, + { + "epoch": 0.8470906630581867, + "grad_norm": 1.1404785046476127, + "learning_rate": 9.02181970168759e-06, + "loss": 0.4996, + "step": 1252 + }, + { + "epoch": 0.847767253044655, + "grad_norm": 1.157184953961382, + "learning_rate": 9.019479437593748e-06, + "loss": 0.5111, + "step": 1253 + }, + { + "epoch": 0.8484438430311232, + "grad_norm": 1.1346714000625626, + "learning_rate": 9.017136681639307e-06, + "loss": 0.4995, + "step": 1254 + }, + { + "epoch": 0.8491204330175913, + "grad_norm": 1.1129518663288842, + "learning_rate": 9.014791435276651e-06, + "loss": 0.4679, + "step": 1255 + }, + { + "epoch": 0.8497970230040596, + "grad_norm": 1.1332449083138452, + "learning_rate": 9.012443699959706e-06, + "loss": 0.4596, + "step": 1256 + }, + { + "epoch": 0.8504736129905277, + "grad_norm": 1.1928224747810408, + "learning_rate": 9.010093477143942e-06, + "loss": 0.4868, + "step": 1257 + }, + { + "epoch": 0.851150202976996, + "grad_norm": 1.1330133012179797, + "learning_rate": 9.007740768286369e-06, + "loss": 0.4745, + "step": 1258 + }, + { + "epoch": 0.8518267929634641, + "grad_norm": 1.1022108256784344, + "learning_rate": 9.005385574845543e-06, + "loss": 0.4702, + "step": 1259 + }, + { + "epoch": 0.8525033829499323, + "grad_norm": 1.1663024420978705, + "learning_rate": 9.003027898281551e-06, + "loss": 0.5006, + "step": 1260 + }, + { + "epoch": 0.8531799729364006, + "grad_norm": 1.1614830762105077, + "learning_rate": 9.000667740056033e-06, + "loss": 0.4788, + "step": 1261 + }, + { + "epoch": 0.8538565629228687, + "grad_norm": 1.1599210618285292, + "learning_rate": 8.998305101632155e-06, + "loss": 0.5145, + "step": 1262 + }, + { + "epoch": 0.854533152909337, + "grad_norm": 1.1997132116001858, + "learning_rate": 8.995939984474624e-06, + "loss": 0.522, + "step": 1263 + }, + { + "epoch": 0.8552097428958051, + "grad_norm": 1.1647077190927393, + "learning_rate": 8.99357239004969e-06, + "loss": 0.5069, + "step": 1264 + }, + { + "epoch": 0.8558863328822733, + "grad_norm": 1.1597263193249239, + "learning_rate": 8.991202319825131e-06, + "loss": 0.5082, + "step": 1265 + }, + { + "epoch": 0.8565629228687416, + "grad_norm": 1.1767426661006315, + "learning_rate": 8.988829775270265e-06, + "loss": 0.4927, + "step": 1266 + }, + { + "epoch": 0.8572395128552097, + "grad_norm": 1.1537489991545355, + "learning_rate": 8.986454757855938e-06, + "loss": 0.5006, + "step": 1267 + }, + { + "epoch": 0.857916102841678, + "grad_norm": 1.2096771024003516, + "learning_rate": 8.984077269054535e-06, + "loss": 0.5222, + "step": 1268 + }, + { + "epoch": 0.8585926928281461, + "grad_norm": 1.2327653148038258, + "learning_rate": 8.981697310339972e-06, + "loss": 0.4996, + "step": 1269 + }, + { + "epoch": 0.8592692828146143, + "grad_norm": 1.152318472987713, + "learning_rate": 8.979314883187694e-06, + "loss": 0.5223, + "step": 1270 + }, + { + "epoch": 0.8599458728010826, + "grad_norm": 1.1380987339073596, + "learning_rate": 8.976929989074677e-06, + "loss": 0.4876, + "step": 1271 + }, + { + "epoch": 0.8606224627875507, + "grad_norm": 1.1091524025592998, + "learning_rate": 8.974542629479426e-06, + "loss": 0.5002, + "step": 1272 + }, + { + "epoch": 0.861299052774019, + "grad_norm": 1.213865438308819, + "learning_rate": 8.972152805881978e-06, + "loss": 0.5077, + "step": 1273 + }, + { + "epoch": 0.8619756427604871, + "grad_norm": 1.240209755681695, + "learning_rate": 8.969760519763891e-06, + "loss": 0.5141, + "step": 1274 + }, + { + "epoch": 0.8626522327469553, + "grad_norm": 1.0695291806021396, + "learning_rate": 8.967365772608258e-06, + "loss": 0.4657, + "step": 1275 + }, + { + "epoch": 0.8633288227334236, + "grad_norm": 1.0946797927801621, + "learning_rate": 8.96496856589969e-06, + "loss": 0.4994, + "step": 1276 + }, + { + "epoch": 0.8640054127198917, + "grad_norm": 1.1415766090022756, + "learning_rate": 8.962568901124326e-06, + "loss": 0.5136, + "step": 1277 + }, + { + "epoch": 0.86468200270636, + "grad_norm": 1.1298423136870914, + "learning_rate": 8.96016677976983e-06, + "loss": 0.4915, + "step": 1278 + }, + { + "epoch": 0.8653585926928281, + "grad_norm": 1.1243889547151007, + "learning_rate": 8.957762203325389e-06, + "loss": 0.4997, + "step": 1279 + }, + { + "epoch": 0.8660351826792964, + "grad_norm": 1.0795482679115407, + "learning_rate": 8.955355173281709e-06, + "loss": 0.495, + "step": 1280 + }, + { + "epoch": 0.8667117726657646, + "grad_norm": 1.140820850967581, + "learning_rate": 8.952945691131016e-06, + "loss": 0.4894, + "step": 1281 + }, + { + "epoch": 0.8673883626522327, + "grad_norm": 1.151613276845393, + "learning_rate": 8.950533758367063e-06, + "loss": 0.5205, + "step": 1282 + }, + { + "epoch": 0.868064952638701, + "grad_norm": 1.1590996796684934, + "learning_rate": 8.948119376485119e-06, + "loss": 0.4832, + "step": 1283 + }, + { + "epoch": 0.8687415426251691, + "grad_norm": 1.110183660912312, + "learning_rate": 8.94570254698197e-06, + "loss": 0.4669, + "step": 1284 + }, + { + "epoch": 0.8694181326116374, + "grad_norm": 1.1112549714756523, + "learning_rate": 8.943283271355915e-06, + "loss": 0.5022, + "step": 1285 + }, + { + "epoch": 0.8700947225981055, + "grad_norm": 1.1418881471068483, + "learning_rate": 8.940861551106784e-06, + "loss": 0.4926, + "step": 1286 + }, + { + "epoch": 0.8707713125845737, + "grad_norm": 1.1118737025422658, + "learning_rate": 8.938437387735903e-06, + "loss": 0.4622, + "step": 1287 + }, + { + "epoch": 0.871447902571042, + "grad_norm": 1.1881042509588093, + "learning_rate": 8.93601078274613e-06, + "loss": 0.5061, + "step": 1288 + }, + { + "epoch": 0.8721244925575101, + "grad_norm": 1.1247767210782038, + "learning_rate": 8.933581737641824e-06, + "loss": 0.4938, + "step": 1289 + }, + { + "epoch": 0.8728010825439784, + "grad_norm": 1.1390569526887053, + "learning_rate": 8.931150253928866e-06, + "loss": 0.5012, + "step": 1290 + }, + { + "epoch": 0.8734776725304465, + "grad_norm": 1.1035813141538635, + "learning_rate": 8.928716333114643e-06, + "loss": 0.4891, + "step": 1291 + }, + { + "epoch": 0.8741542625169147, + "grad_norm": 1.056265978387651, + "learning_rate": 8.926279976708056e-06, + "loss": 0.4493, + "step": 1292 + }, + { + "epoch": 0.874830852503383, + "grad_norm": 1.1386457355437434, + "learning_rate": 8.923841186219512e-06, + "loss": 0.5046, + "step": 1293 + }, + { + "epoch": 0.8755074424898511, + "grad_norm": 1.111246334626937, + "learning_rate": 8.921399963160934e-06, + "loss": 0.492, + "step": 1294 + }, + { + "epoch": 0.8761840324763194, + "grad_norm": 1.1413217971002307, + "learning_rate": 8.918956309045743e-06, + "loss": 0.4949, + "step": 1295 + }, + { + "epoch": 0.8768606224627875, + "grad_norm": 1.0499552388509685, + "learning_rate": 8.916510225388878e-06, + "loss": 0.4783, + "step": 1296 + }, + { + "epoch": 0.8775372124492558, + "grad_norm": 1.0885209774631313, + "learning_rate": 8.914061713706776e-06, + "loss": 0.494, + "step": 1297 + }, + { + "epoch": 0.878213802435724, + "grad_norm": 1.1554655320262635, + "learning_rate": 8.911610775517383e-06, + "loss": 0.5158, + "step": 1298 + }, + { + "epoch": 0.8788903924221921, + "grad_norm": 1.1058022445695312, + "learning_rate": 8.90915741234015e-06, + "loss": 0.5058, + "step": 1299 + }, + { + "epoch": 0.8795669824086604, + "grad_norm": 1.093566761068326, + "learning_rate": 8.906701625696028e-06, + "loss": 0.4625, + "step": 1300 + }, + { + "epoch": 0.8802435723951285, + "grad_norm": 1.1322104456009245, + "learning_rate": 8.904243417107473e-06, + "loss": 0.5016, + "step": 1301 + }, + { + "epoch": 0.8809201623815968, + "grad_norm": 1.1148533155687137, + "learning_rate": 8.901782788098442e-06, + "loss": 0.4651, + "step": 1302 + }, + { + "epoch": 0.881596752368065, + "grad_norm": 1.1803033305243866, + "learning_rate": 8.899319740194391e-06, + "loss": 0.4861, + "step": 1303 + }, + { + "epoch": 0.8822733423545331, + "grad_norm": 1.1627810496841564, + "learning_rate": 8.89685427492228e-06, + "loss": 0.517, + "step": 1304 + }, + { + "epoch": 0.8829499323410014, + "grad_norm": 1.1271570410543166, + "learning_rate": 8.894386393810563e-06, + "loss": 0.4859, + "step": 1305 + }, + { + "epoch": 0.8836265223274695, + "grad_norm": 1.111793161435199, + "learning_rate": 8.891916098389193e-06, + "loss": 0.4763, + "step": 1306 + }, + { + "epoch": 0.8843031123139378, + "grad_norm": 1.1034277151511187, + "learning_rate": 8.889443390189618e-06, + "loss": 0.49, + "step": 1307 + }, + { + "epoch": 0.8849797023004059, + "grad_norm": 1.1140873596427165, + "learning_rate": 8.886968270744789e-06, + "loss": 0.4851, + "step": 1308 + }, + { + "epoch": 0.8856562922868741, + "grad_norm": 1.0885624245963936, + "learning_rate": 8.88449074158914e-06, + "loss": 0.4827, + "step": 1309 + }, + { + "epoch": 0.8863328822733424, + "grad_norm": 1.143391920316101, + "learning_rate": 8.882010804258612e-06, + "loss": 0.5025, + "step": 1310 + }, + { + "epoch": 0.8870094722598105, + "grad_norm": 1.171688364607078, + "learning_rate": 8.879528460290628e-06, + "loss": 0.5019, + "step": 1311 + }, + { + "epoch": 0.8876860622462788, + "grad_norm": 1.0896444499215772, + "learning_rate": 8.877043711224109e-06, + "loss": 0.4932, + "step": 1312 + }, + { + "epoch": 0.8883626522327469, + "grad_norm": 1.1438246799736955, + "learning_rate": 8.874556558599465e-06, + "loss": 0.4767, + "step": 1313 + }, + { + "epoch": 0.8890392422192152, + "grad_norm": 1.173811621971971, + "learning_rate": 8.872067003958597e-06, + "loss": 0.4927, + "step": 1314 + }, + { + "epoch": 0.8897158322056834, + "grad_norm": 1.118986786328927, + "learning_rate": 8.869575048844896e-06, + "loss": 0.4889, + "step": 1315 + }, + { + "epoch": 0.8903924221921515, + "grad_norm": 1.145105965592563, + "learning_rate": 8.867080694803238e-06, + "loss": 0.5006, + "step": 1316 + }, + { + "epoch": 0.8910690121786198, + "grad_norm": 1.1690976704954363, + "learning_rate": 8.864583943379987e-06, + "loss": 0.48, + "step": 1317 + }, + { + "epoch": 0.8917456021650879, + "grad_norm": 1.1877369269813445, + "learning_rate": 8.862084796122998e-06, + "loss": 0.4963, + "step": 1318 + }, + { + "epoch": 0.8924221921515562, + "grad_norm": 1.1109714662658936, + "learning_rate": 8.859583254581604e-06, + "loss": 0.4803, + "step": 1319 + }, + { + "epoch": 0.8930987821380244, + "grad_norm": 1.0543606010594926, + "learning_rate": 8.85707932030663e-06, + "loss": 0.4698, + "step": 1320 + }, + { + "epoch": 0.8937753721244925, + "grad_norm": 1.0918568552170842, + "learning_rate": 8.854572994850376e-06, + "loss": 0.4821, + "step": 1321 + }, + { + "epoch": 0.8944519621109608, + "grad_norm": 1.1171309020142948, + "learning_rate": 8.85206427976663e-06, + "loss": 0.487, + "step": 1322 + }, + { + "epoch": 0.8951285520974289, + "grad_norm": 1.1055727328989207, + "learning_rate": 8.849553176610661e-06, + "loss": 0.4795, + "step": 1323 + }, + { + "epoch": 0.8958051420838972, + "grad_norm": 1.124376201088204, + "learning_rate": 8.847039686939218e-06, + "loss": 0.5016, + "step": 1324 + }, + { + "epoch": 0.8964817320703654, + "grad_norm": 1.164546623553131, + "learning_rate": 8.844523812310527e-06, + "loss": 0.5052, + "step": 1325 + }, + { + "epoch": 0.8971583220568335, + "grad_norm": 1.1225569509504587, + "learning_rate": 8.842005554284296e-06, + "loss": 0.5065, + "step": 1326 + }, + { + "epoch": 0.8978349120433018, + "grad_norm": 1.0816518230791254, + "learning_rate": 8.83948491442171e-06, + "loss": 0.4812, + "step": 1327 + }, + { + "epoch": 0.8985115020297699, + "grad_norm": 1.062718268046039, + "learning_rate": 8.836961894285428e-06, + "loss": 0.4758, + "step": 1328 + }, + { + "epoch": 0.8991880920162382, + "grad_norm": 1.062149555585684, + "learning_rate": 8.834436495439588e-06, + "loss": 0.4848, + "step": 1329 + }, + { + "epoch": 0.8998646820027063, + "grad_norm": 1.133815006969586, + "learning_rate": 8.8319087194498e-06, + "loss": 0.4837, + "step": 1330 + }, + { + "epoch": 0.9005412719891746, + "grad_norm": 1.120722107309994, + "learning_rate": 8.829378567883152e-06, + "loss": 0.5073, + "step": 1331 + }, + { + "epoch": 0.9012178619756428, + "grad_norm": 1.115072165839227, + "learning_rate": 8.826846042308195e-06, + "loss": 0.5087, + "step": 1332 + }, + { + "epoch": 0.9018944519621109, + "grad_norm": 1.0648196052598222, + "learning_rate": 8.824311144294966e-06, + "loss": 0.4637, + "step": 1333 + }, + { + "epoch": 0.9025710419485792, + "grad_norm": 1.1892533528735048, + "learning_rate": 8.82177387541496e-06, + "loss": 0.5189, + "step": 1334 + }, + { + "epoch": 0.9032476319350473, + "grad_norm": 1.109942217898596, + "learning_rate": 8.819234237241148e-06, + "loss": 0.4783, + "step": 1335 + }, + { + "epoch": 0.9039242219215156, + "grad_norm": 1.0853856487309683, + "learning_rate": 8.816692231347972e-06, + "loss": 0.4759, + "step": 1336 + }, + { + "epoch": 0.9046008119079838, + "grad_norm": 1.0652639470040577, + "learning_rate": 8.814147859311333e-06, + "loss": 0.4791, + "step": 1337 + }, + { + "epoch": 0.9052774018944519, + "grad_norm": 1.0929673161367177, + "learning_rate": 8.81160112270861e-06, + "loss": 0.4887, + "step": 1338 + }, + { + "epoch": 0.9059539918809202, + "grad_norm": 1.1495286340763267, + "learning_rate": 8.809052023118638e-06, + "loss": 0.5051, + "step": 1339 + }, + { + "epoch": 0.9066305818673883, + "grad_norm": 1.078036842137762, + "learning_rate": 8.806500562121724e-06, + "loss": 0.4695, + "step": 1340 + }, + { + "epoch": 0.9073071718538566, + "grad_norm": 1.0903934108755975, + "learning_rate": 8.803946741299635e-06, + "loss": 0.4497, + "step": 1341 + }, + { + "epoch": 0.9079837618403248, + "grad_norm": 1.0983398134962221, + "learning_rate": 8.801390562235603e-06, + "loss": 0.4805, + "step": 1342 + }, + { + "epoch": 0.908660351826793, + "grad_norm": 1.0688244745964672, + "learning_rate": 8.79883202651432e-06, + "loss": 0.4575, + "step": 1343 + }, + { + "epoch": 0.9093369418132612, + "grad_norm": 1.1078644727186073, + "learning_rate": 8.796271135721944e-06, + "loss": 0.4946, + "step": 1344 + }, + { + "epoch": 0.9100135317997293, + "grad_norm": 1.1421186491807882, + "learning_rate": 8.793707891446086e-06, + "loss": 0.4847, + "step": 1345 + }, + { + "epoch": 0.9106901217861976, + "grad_norm": 1.093260920106085, + "learning_rate": 8.791142295275819e-06, + "loss": 0.4751, + "step": 1346 + }, + { + "epoch": 0.9113667117726658, + "grad_norm": 1.1136598978673493, + "learning_rate": 8.788574348801676e-06, + "loss": 0.4806, + "step": 1347 + }, + { + "epoch": 0.912043301759134, + "grad_norm": 1.102167214310862, + "learning_rate": 8.786004053615642e-06, + "loss": 0.4764, + "step": 1348 + }, + { + "epoch": 0.9127198917456022, + "grad_norm": 1.1059664458026084, + "learning_rate": 8.783431411311165e-06, + "loss": 0.4894, + "step": 1349 + }, + { + "epoch": 0.9133964817320703, + "grad_norm": 1.0578620170298016, + "learning_rate": 8.780856423483145e-06, + "loss": 0.4827, + "step": 1350 + }, + { + "epoch": 0.9140730717185386, + "grad_norm": 1.0957760371681264, + "learning_rate": 8.778279091727933e-06, + "loss": 0.503, + "step": 1351 + }, + { + "epoch": 0.9147496617050067, + "grad_norm": 1.1096754377279134, + "learning_rate": 8.775699417643337e-06, + "loss": 0.4719, + "step": 1352 + }, + { + "epoch": 0.915426251691475, + "grad_norm": 1.0989769966462268, + "learning_rate": 8.773117402828618e-06, + "loss": 0.4766, + "step": 1353 + }, + { + "epoch": 0.9161028416779432, + "grad_norm": 1.1020082145652916, + "learning_rate": 8.770533048884483e-06, + "loss": 0.4726, + "step": 1354 + }, + { + "epoch": 0.9167794316644113, + "grad_norm": 1.1072270802152941, + "learning_rate": 8.767946357413091e-06, + "loss": 0.4843, + "step": 1355 + }, + { + "epoch": 0.9174560216508796, + "grad_norm": 1.156295673676666, + "learning_rate": 8.765357330018056e-06, + "loss": 0.4952, + "step": 1356 + }, + { + "epoch": 0.9181326116373477, + "grad_norm": 1.1270161829253635, + "learning_rate": 8.76276596830443e-06, + "loss": 0.5159, + "step": 1357 + }, + { + "epoch": 0.918809201623816, + "grad_norm": 1.0891541127007403, + "learning_rate": 8.760172273878723e-06, + "loss": 0.4905, + "step": 1358 + }, + { + "epoch": 0.9194857916102842, + "grad_norm": 1.1225614804261512, + "learning_rate": 8.757576248348883e-06, + "loss": 0.4899, + "step": 1359 + }, + { + "epoch": 0.9201623815967523, + "grad_norm": 1.1093518880030158, + "learning_rate": 8.754977893324305e-06, + "loss": 0.4836, + "step": 1360 + }, + { + "epoch": 0.9208389715832206, + "grad_norm": 1.122660197398548, + "learning_rate": 8.75237721041583e-06, + "loss": 0.4885, + "step": 1361 + }, + { + "epoch": 0.9215155615696887, + "grad_norm": 1.0844909528987519, + "learning_rate": 8.74977420123574e-06, + "loss": 0.4754, + "step": 1362 + }, + { + "epoch": 0.922192151556157, + "grad_norm": 1.070954967599324, + "learning_rate": 8.747168867397765e-06, + "loss": 0.4771, + "step": 1363 + }, + { + "epoch": 0.9228687415426252, + "grad_norm": 1.1557809804366883, + "learning_rate": 8.744561210517067e-06, + "loss": 0.4649, + "step": 1364 + }, + { + "epoch": 0.9235453315290933, + "grad_norm": 1.1020724249035476, + "learning_rate": 8.741951232210254e-06, + "loss": 0.4759, + "step": 1365 + }, + { + "epoch": 0.9242219215155616, + "grad_norm": 1.0679150863324274, + "learning_rate": 8.73933893409537e-06, + "loss": 0.4782, + "step": 1366 + }, + { + "epoch": 0.9248985115020297, + "grad_norm": 1.0974332171975851, + "learning_rate": 8.736724317791903e-06, + "loss": 0.4697, + "step": 1367 + }, + { + "epoch": 0.925575101488498, + "grad_norm": 1.1369134886294892, + "learning_rate": 8.734107384920771e-06, + "loss": 0.4654, + "step": 1368 + }, + { + "epoch": 0.9262516914749662, + "grad_norm": 1.1723999917840484, + "learning_rate": 8.731488137104332e-06, + "loss": 0.5069, + "step": 1369 + }, + { + "epoch": 0.9269282814614344, + "grad_norm": 1.1442159695171845, + "learning_rate": 8.728866575966379e-06, + "loss": 0.4697, + "step": 1370 + }, + { + "epoch": 0.9276048714479026, + "grad_norm": 1.1039418864188117, + "learning_rate": 8.726242703132139e-06, + "loss": 0.4749, + "step": 1371 + }, + { + "epoch": 0.9282814614343707, + "grad_norm": 1.158271427116123, + "learning_rate": 8.72361652022827e-06, + "loss": 0.4635, + "step": 1372 + }, + { + "epoch": 0.928958051420839, + "grad_norm": 1.1324175018730116, + "learning_rate": 8.720988028882867e-06, + "loss": 0.4826, + "step": 1373 + }, + { + "epoch": 0.9296346414073072, + "grad_norm": 1.0592613871370138, + "learning_rate": 8.71835723072545e-06, + "loss": 0.4733, + "step": 1374 + }, + { + "epoch": 0.9303112313937754, + "grad_norm": 1.108042678700999, + "learning_rate": 8.715724127386971e-06, + "loss": 0.4845, + "step": 1375 + }, + { + "epoch": 0.9309878213802436, + "grad_norm": 1.0490795210120185, + "learning_rate": 8.713088720499817e-06, + "loss": 0.4674, + "step": 1376 + }, + { + "epoch": 0.9316644113667117, + "grad_norm": 1.079599177008303, + "learning_rate": 8.710451011697794e-06, + "loss": 0.4581, + "step": 1377 + }, + { + "epoch": 0.93234100135318, + "grad_norm": 1.101341041886667, + "learning_rate": 8.70781100261614e-06, + "loss": 0.4712, + "step": 1378 + }, + { + "epoch": 0.9330175913396481, + "grad_norm": 1.128688170954129, + "learning_rate": 8.705168694891522e-06, + "loss": 0.503, + "step": 1379 + }, + { + "epoch": 0.9336941813261164, + "grad_norm": 1.1146512655692067, + "learning_rate": 8.702524090162023e-06, + "loss": 0.458, + "step": 1380 + }, + { + "epoch": 0.9343707713125846, + "grad_norm": 1.0875249983285529, + "learning_rate": 8.699877190067158e-06, + "loss": 0.4644, + "step": 1381 + }, + { + "epoch": 0.9350473612990527, + "grad_norm": 1.0827841064440842, + "learning_rate": 8.697227996247861e-06, + "loss": 0.4785, + "step": 1382 + }, + { + "epoch": 0.935723951285521, + "grad_norm": 1.1930711262650655, + "learning_rate": 8.694576510346493e-06, + "loss": 0.5006, + "step": 1383 + }, + { + "epoch": 0.9364005412719891, + "grad_norm": 1.1121646400437968, + "learning_rate": 8.691922734006828e-06, + "loss": 0.4747, + "step": 1384 + }, + { + "epoch": 0.9370771312584574, + "grad_norm": 1.1020820481150944, + "learning_rate": 8.689266668874067e-06, + "loss": 0.4793, + "step": 1385 + }, + { + "epoch": 0.9377537212449256, + "grad_norm": 1.092052589638481, + "learning_rate": 8.686608316594826e-06, + "loss": 0.4748, + "step": 1386 + }, + { + "epoch": 0.9384303112313938, + "grad_norm": 1.1178469084348743, + "learning_rate": 8.683947678817139e-06, + "loss": 0.4734, + "step": 1387 + }, + { + "epoch": 0.939106901217862, + "grad_norm": 1.0851127636579434, + "learning_rate": 8.681284757190462e-06, + "loss": 0.4793, + "step": 1388 + }, + { + "epoch": 0.9397834912043301, + "grad_norm": 1.1681335670461341, + "learning_rate": 8.67861955336566e-06, + "loss": 0.5047, + "step": 1389 + }, + { + "epoch": 0.9404600811907984, + "grad_norm": 1.1228454390584235, + "learning_rate": 8.675952068995014e-06, + "loss": 0.4761, + "step": 1390 + }, + { + "epoch": 0.9411366711772666, + "grad_norm": 1.155355536164632, + "learning_rate": 8.673282305732225e-06, + "loss": 0.4931, + "step": 1391 + }, + { + "epoch": 0.9418132611637348, + "grad_norm": 1.0975254096590343, + "learning_rate": 8.670610265232398e-06, + "loss": 0.4952, + "step": 1392 + }, + { + "epoch": 0.942489851150203, + "grad_norm": 1.1263807002918433, + "learning_rate": 8.667935949152057e-06, + "loss": 0.4973, + "step": 1393 + }, + { + "epoch": 0.9431664411366711, + "grad_norm": 1.0717855553357298, + "learning_rate": 8.665259359149132e-06, + "loss": 0.4604, + "step": 1394 + }, + { + "epoch": 0.9438430311231394, + "grad_norm": 1.115443505803302, + "learning_rate": 8.662580496882967e-06, + "loss": 0.4638, + "step": 1395 + }, + { + "epoch": 0.9445196211096076, + "grad_norm": 1.0613925003688098, + "learning_rate": 8.659899364014309e-06, + "loss": 0.4661, + "step": 1396 + }, + { + "epoch": 0.9451962110960758, + "grad_norm": 1.1160444526814355, + "learning_rate": 8.657215962205318e-06, + "loss": 0.4826, + "step": 1397 + }, + { + "epoch": 0.945872801082544, + "grad_norm": 1.0982289880424816, + "learning_rate": 8.654530293119558e-06, + "loss": 0.4869, + "step": 1398 + }, + { + "epoch": 0.9465493910690121, + "grad_norm": 1.042399225804521, + "learning_rate": 8.651842358421999e-06, + "loss": 0.4491, + "step": 1399 + }, + { + "epoch": 0.9472259810554804, + "grad_norm": 1.025045296939848, + "learning_rate": 8.649152159779015e-06, + "loss": 0.4716, + "step": 1400 + }, + { + "epoch": 0.9479025710419485, + "grad_norm": 1.0188020827441104, + "learning_rate": 8.646459698858386e-06, + "loss": 0.4498, + "step": 1401 + }, + { + "epoch": 0.9485791610284168, + "grad_norm": 1.131377324119661, + "learning_rate": 8.64376497732929e-06, + "loss": 0.4726, + "step": 1402 + }, + { + "epoch": 0.949255751014885, + "grad_norm": 1.1033023675409355, + "learning_rate": 8.64106799686231e-06, + "loss": 0.4882, + "step": 1403 + }, + { + "epoch": 0.9499323410013532, + "grad_norm": 1.1570690756002653, + "learning_rate": 8.638368759129433e-06, + "loss": 0.4711, + "step": 1404 + }, + { + "epoch": 0.9506089309878214, + "grad_norm": 1.0936792629313752, + "learning_rate": 8.635667265804034e-06, + "loss": 0.4602, + "step": 1405 + }, + { + "epoch": 0.9512855209742895, + "grad_norm": 1.1097972552291377, + "learning_rate": 8.632963518560894e-06, + "loss": 0.4769, + "step": 1406 + }, + { + "epoch": 0.9519621109607578, + "grad_norm": 1.0600437578483592, + "learning_rate": 8.630257519076196e-06, + "loss": 0.4764, + "step": 1407 + }, + { + "epoch": 0.952638700947226, + "grad_norm": 1.0810009270230665, + "learning_rate": 8.627549269027509e-06, + "loss": 0.4563, + "step": 1408 + }, + { + "epoch": 0.9533152909336942, + "grad_norm": 1.1378313133212332, + "learning_rate": 8.624838770093805e-06, + "loss": 0.5015, + "step": 1409 + }, + { + "epoch": 0.9539918809201624, + "grad_norm": 1.0730409166924595, + "learning_rate": 8.622126023955446e-06, + "loss": 0.486, + "step": 1410 + }, + { + "epoch": 0.9546684709066305, + "grad_norm": 1.0924453900556088, + "learning_rate": 8.619411032294187e-06, + "loss": 0.4815, + "step": 1411 + }, + { + "epoch": 0.9553450608930988, + "grad_norm": 1.1196772622569686, + "learning_rate": 8.616693796793178e-06, + "loss": 0.4908, + "step": 1412 + }, + { + "epoch": 0.956021650879567, + "grad_norm": 1.1279912000536065, + "learning_rate": 8.613974319136959e-06, + "loss": 0.4697, + "step": 1413 + }, + { + "epoch": 0.9566982408660352, + "grad_norm": 1.0686371294128816, + "learning_rate": 8.611252601011457e-06, + "loss": 0.4557, + "step": 1414 + }, + { + "epoch": 0.9573748308525034, + "grad_norm": 1.060571430752584, + "learning_rate": 8.608528644103994e-06, + "loss": 0.4773, + "step": 1415 + }, + { + "epoch": 0.9580514208389715, + "grad_norm": 1.0943973067230608, + "learning_rate": 8.605802450103276e-06, + "loss": 0.4692, + "step": 1416 + }, + { + "epoch": 0.9587280108254398, + "grad_norm": 1.1135642740683926, + "learning_rate": 8.603074020699393e-06, + "loss": 0.4632, + "step": 1417 + }, + { + "epoch": 0.959404600811908, + "grad_norm": 1.1564508268609643, + "learning_rate": 8.600343357583826e-06, + "loss": 0.4832, + "step": 1418 + }, + { + "epoch": 0.9600811907983762, + "grad_norm": 1.188240940834957, + "learning_rate": 8.597610462449441e-06, + "loss": 0.5119, + "step": 1419 + }, + { + "epoch": 0.9607577807848444, + "grad_norm": 1.0838446802817607, + "learning_rate": 8.594875336990482e-06, + "loss": 0.4628, + "step": 1420 + }, + { + "epoch": 0.9614343707713126, + "grad_norm": 1.1083838323042579, + "learning_rate": 8.592137982902585e-06, + "loss": 0.4823, + "step": 1421 + }, + { + "epoch": 0.9621109607577808, + "grad_norm": 1.0957894962472958, + "learning_rate": 8.589398401882755e-06, + "loss": 0.4904, + "step": 1422 + }, + { + "epoch": 0.9627875507442489, + "grad_norm": 1.0759682815221387, + "learning_rate": 8.586656595629387e-06, + "loss": 0.4579, + "step": 1423 + }, + { + "epoch": 0.9634641407307172, + "grad_norm": 1.101268240904258, + "learning_rate": 8.583912565842258e-06, + "loss": 0.4619, + "step": 1424 + }, + { + "epoch": 0.9641407307171854, + "grad_norm": 1.056032844943916, + "learning_rate": 8.581166314222512e-06, + "loss": 0.4782, + "step": 1425 + }, + { + "epoch": 0.9648173207036536, + "grad_norm": 1.0247273710377593, + "learning_rate": 8.57841784247268e-06, + "loss": 0.4733, + "step": 1426 + }, + { + "epoch": 0.9654939106901218, + "grad_norm": 1.0633616342864958, + "learning_rate": 8.575667152296666e-06, + "loss": 0.4964, + "step": 1427 + }, + { + "epoch": 0.9661705006765899, + "grad_norm": 1.09250706289183, + "learning_rate": 8.572914245399748e-06, + "loss": 0.4868, + "step": 1428 + }, + { + "epoch": 0.9668470906630582, + "grad_norm": 1.0532665047266043, + "learning_rate": 8.570159123488584e-06, + "loss": 0.4591, + "step": 1429 + }, + { + "epoch": 0.9675236806495264, + "grad_norm": 1.1648035844376532, + "learning_rate": 8.567401788271195e-06, + "loss": 0.4792, + "step": 1430 + }, + { + "epoch": 0.9682002706359946, + "grad_norm": 1.0526720189732204, + "learning_rate": 8.564642241456986e-06, + "loss": 0.4745, + "step": 1431 + }, + { + "epoch": 0.9688768606224628, + "grad_norm": 1.1939719091186145, + "learning_rate": 8.561880484756726e-06, + "loss": 0.4979, + "step": 1432 + }, + { + "epoch": 0.969553450608931, + "grad_norm": 1.0643289776647997, + "learning_rate": 8.559116519882551e-06, + "loss": 0.4606, + "step": 1433 + }, + { + "epoch": 0.9702300405953992, + "grad_norm": 1.0954360300513486, + "learning_rate": 8.556350348547978e-06, + "loss": 0.4672, + "step": 1434 + }, + { + "epoch": 0.9709066305818674, + "grad_norm": 1.1244464441728843, + "learning_rate": 8.553581972467875e-06, + "loss": 0.481, + "step": 1435 + }, + { + "epoch": 0.9715832205683356, + "grad_norm": 1.131798533312523, + "learning_rate": 8.550811393358494e-06, + "loss": 0.4831, + "step": 1436 + }, + { + "epoch": 0.9722598105548038, + "grad_norm": 1.1107322276914728, + "learning_rate": 8.54803861293744e-06, + "loss": 0.4627, + "step": 1437 + }, + { + "epoch": 0.972936400541272, + "grad_norm": 1.0968367892460305, + "learning_rate": 8.545263632923687e-06, + "loss": 0.4716, + "step": 1438 + }, + { + "epoch": 0.9736129905277402, + "grad_norm": 1.0702423575289433, + "learning_rate": 8.542486455037578e-06, + "loss": 0.4892, + "step": 1439 + }, + { + "epoch": 0.9742895805142084, + "grad_norm": 1.0901673460271821, + "learning_rate": 8.539707081000808e-06, + "loss": 0.4802, + "step": 1440 + }, + { + "epoch": 0.9749661705006766, + "grad_norm": 1.0775967217988527, + "learning_rate": 8.536925512536441e-06, + "loss": 0.4852, + "step": 1441 + }, + { + "epoch": 0.9756427604871448, + "grad_norm": 1.0804318364012613, + "learning_rate": 8.534141751368901e-06, + "loss": 0.4754, + "step": 1442 + }, + { + "epoch": 0.976319350473613, + "grad_norm": 1.0948792749230274, + "learning_rate": 8.531355799223968e-06, + "loss": 0.475, + "step": 1443 + }, + { + "epoch": 0.9769959404600812, + "grad_norm": 1.040576660022014, + "learning_rate": 8.528567657828785e-06, + "loss": 0.4667, + "step": 1444 + }, + { + "epoch": 0.9776725304465493, + "grad_norm": 1.168361912067637, + "learning_rate": 8.525777328911846e-06, + "loss": 0.4813, + "step": 1445 + }, + { + "epoch": 0.9783491204330176, + "grad_norm": 1.0415737120357251, + "learning_rate": 8.522984814203006e-06, + "loss": 0.4697, + "step": 1446 + }, + { + "epoch": 0.9790257104194858, + "grad_norm": 1.135579034154808, + "learning_rate": 8.520190115433473e-06, + "loss": 0.4964, + "step": 1447 + }, + { + "epoch": 0.979702300405954, + "grad_norm": 1.0571980952930058, + "learning_rate": 8.517393234335812e-06, + "loss": 0.4839, + "step": 1448 + }, + { + "epoch": 0.9803788903924222, + "grad_norm": 1.0516782289808286, + "learning_rate": 8.514594172643934e-06, + "loss": 0.4715, + "step": 1449 + }, + { + "epoch": 0.9810554803788903, + "grad_norm": 1.0840929954449103, + "learning_rate": 8.51179293209311e-06, + "loss": 0.4601, + "step": 1450 + }, + { + "epoch": 0.9817320703653586, + "grad_norm": 1.0817488259851176, + "learning_rate": 8.508989514419959e-06, + "loss": 0.4858, + "step": 1451 + }, + { + "epoch": 0.9824086603518268, + "grad_norm": 1.04091882722342, + "learning_rate": 8.506183921362443e-06, + "loss": 0.4398, + "step": 1452 + }, + { + "epoch": 0.983085250338295, + "grad_norm": 1.0565015043834196, + "learning_rate": 8.503376154659886e-06, + "loss": 0.4723, + "step": 1453 + }, + { + "epoch": 0.9837618403247632, + "grad_norm": 1.0955380238432584, + "learning_rate": 8.500566216052948e-06, + "loss": 0.4826, + "step": 1454 + }, + { + "epoch": 0.9844384303112313, + "grad_norm": 1.0444130204805198, + "learning_rate": 8.497754107283637e-06, + "loss": 0.4566, + "step": 1455 + }, + { + "epoch": 0.9851150202976996, + "grad_norm": 0.9885016010515429, + "learning_rate": 8.494939830095315e-06, + "loss": 0.4475, + "step": 1456 + }, + { + "epoch": 0.9857916102841678, + "grad_norm": 1.0177590872550946, + "learning_rate": 8.492123386232678e-06, + "loss": 0.4618, + "step": 1457 + }, + { + "epoch": 0.986468200270636, + "grad_norm": 1.060331320050249, + "learning_rate": 8.489304777441772e-06, + "loss": 0.4804, + "step": 1458 + }, + { + "epoch": 0.9871447902571042, + "grad_norm": 1.0130478712982525, + "learning_rate": 8.486484005469977e-06, + "loss": 0.4645, + "step": 1459 + }, + { + "epoch": 0.9878213802435724, + "grad_norm": 1.090740661889863, + "learning_rate": 8.483661072066027e-06, + "loss": 0.483, + "step": 1460 + }, + { + "epoch": 0.9884979702300406, + "grad_norm": 1.084950732414929, + "learning_rate": 8.480835978979983e-06, + "loss": 0.4557, + "step": 1461 + }, + { + "epoch": 0.9891745602165088, + "grad_norm": 1.0397139752573001, + "learning_rate": 8.478008727963253e-06, + "loss": 0.4669, + "step": 1462 + }, + { + "epoch": 0.989851150202977, + "grad_norm": 1.027183170924242, + "learning_rate": 8.475179320768581e-06, + "loss": 0.4683, + "step": 1463 + }, + { + "epoch": 0.9905277401894452, + "grad_norm": 1.065718624494268, + "learning_rate": 8.472347759150044e-06, + "loss": 0.4697, + "step": 1464 + }, + { + "epoch": 0.9912043301759134, + "grad_norm": 1.048211630683376, + "learning_rate": 8.46951404486306e-06, + "loss": 0.4424, + "step": 1465 + }, + { + "epoch": 0.9918809201623816, + "grad_norm": 1.0681969687831656, + "learning_rate": 8.466678179664378e-06, + "loss": 0.4583, + "step": 1466 + }, + { + "epoch": 0.9925575101488497, + "grad_norm": 1.0772139404680132, + "learning_rate": 8.463840165312083e-06, + "loss": 0.4539, + "step": 1467 + }, + { + "epoch": 0.993234100135318, + "grad_norm": 1.1274787922980964, + "learning_rate": 8.461000003565588e-06, + "loss": 0.478, + "step": 1468 + }, + { + "epoch": 0.9939106901217862, + "grad_norm": 1.1519269128455025, + "learning_rate": 8.458157696185643e-06, + "loss": 0.4796, + "step": 1469 + }, + { + "epoch": 0.9945872801082544, + "grad_norm": 1.0985704356091963, + "learning_rate": 8.455313244934324e-06, + "loss": 0.4945, + "step": 1470 + }, + { + "epoch": 0.9952638700947226, + "grad_norm": 1.130466681997261, + "learning_rate": 8.452466651575039e-06, + "loss": 0.4701, + "step": 1471 + }, + { + "epoch": 0.9959404600811907, + "grad_norm": 1.0723777920342406, + "learning_rate": 8.44961791787252e-06, + "loss": 0.4781, + "step": 1472 + }, + { + "epoch": 0.996617050067659, + "grad_norm": 1.0129052107576901, + "learning_rate": 8.446767045592829e-06, + "loss": 0.4472, + "step": 1473 + }, + { + "epoch": 0.9972936400541272, + "grad_norm": 1.0822449262813412, + "learning_rate": 8.443914036503356e-06, + "loss": 0.5021, + "step": 1474 + }, + { + "epoch": 0.9979702300405954, + "grad_norm": 1.058103427210262, + "learning_rate": 8.44105889237281e-06, + "loss": 0.493, + "step": 1475 + }, + { + "epoch": 0.9986468200270636, + "grad_norm": 1.021156031374969, + "learning_rate": 8.438201614971227e-06, + "loss": 0.4592, + "step": 1476 + }, + { + "epoch": 0.9993234100135318, + "grad_norm": 1.0687638217219952, + "learning_rate": 8.435342206069965e-06, + "loss": 0.4808, + "step": 1477 + }, + { + "epoch": 1.0, + "grad_norm": 1.0693278482698754, + "learning_rate": 8.432480667441703e-06, + "loss": 0.4824, + "step": 1478 + }, + { + "epoch": 1.0, + "eval_loss": 0.47269096970558167, + "eval_runtime": 442.6041, + "eval_samples_per_second": 22.492, + "eval_steps_per_second": 0.705, + "step": 1478 + }, + { + "epoch": 1.0006765899864682, + "grad_norm": 1.0631144044441243, + "learning_rate": 8.429617000860441e-06, + "loss": 0.447, + "step": 1479 + }, + { + "epoch": 1.0013531799729365, + "grad_norm": 1.0134654167802628, + "learning_rate": 8.4267512081015e-06, + "loss": 0.4393, + "step": 1480 + }, + { + "epoch": 1.0020297699594045, + "grad_norm": 1.025784764238537, + "learning_rate": 8.423883290941514e-06, + "loss": 0.4477, + "step": 1481 + }, + { + "epoch": 1.0027063599458728, + "grad_norm": 1.015539247825642, + "learning_rate": 8.421013251158437e-06, + "loss": 0.4295, + "step": 1482 + }, + { + "epoch": 1.003382949932341, + "grad_norm": 1.1028614850365017, + "learning_rate": 8.418141090531543e-06, + "loss": 0.4453, + "step": 1483 + }, + { + "epoch": 1.0040595399188093, + "grad_norm": 0.9976452983820293, + "learning_rate": 8.415266810841412e-06, + "loss": 0.4188, + "step": 1484 + }, + { + "epoch": 1.0047361299052775, + "grad_norm": 1.095388091254139, + "learning_rate": 8.412390413869944e-06, + "loss": 0.4539, + "step": 1485 + }, + { + "epoch": 1.0054127198917455, + "grad_norm": 1.0429180460374765, + "learning_rate": 8.409511901400351e-06, + "loss": 0.4235, + "step": 1486 + }, + { + "epoch": 1.0060893098782138, + "grad_norm": 1.0659771378200944, + "learning_rate": 8.406631275217156e-06, + "loss": 0.4343, + "step": 1487 + }, + { + "epoch": 1.006765899864682, + "grad_norm": 1.0523513744241333, + "learning_rate": 8.40374853710619e-06, + "loss": 0.4487, + "step": 1488 + }, + { + "epoch": 1.0074424898511503, + "grad_norm": 1.1118925477216597, + "learning_rate": 8.400863688854598e-06, + "loss": 0.4507, + "step": 1489 + }, + { + "epoch": 1.0081190798376185, + "grad_norm": 1.0249195627264522, + "learning_rate": 8.397976732250827e-06, + "loss": 0.4186, + "step": 1490 + }, + { + "epoch": 1.0087956698240865, + "grad_norm": 1.091226462243931, + "learning_rate": 8.395087669084638e-06, + "loss": 0.4297, + "step": 1491 + }, + { + "epoch": 1.0094722598105548, + "grad_norm": 1.1595732717937002, + "learning_rate": 8.392196501147092e-06, + "loss": 0.4315, + "step": 1492 + }, + { + "epoch": 1.010148849797023, + "grad_norm": 1.102426306487242, + "learning_rate": 8.389303230230556e-06, + "loss": 0.4451, + "step": 1493 + }, + { + "epoch": 1.0108254397834913, + "grad_norm": 1.1335299423993308, + "learning_rate": 8.386407858128707e-06, + "loss": 0.4573, + "step": 1494 + }, + { + "epoch": 1.0115020297699595, + "grad_norm": 1.047802436754095, + "learning_rate": 8.383510386636516e-06, + "loss": 0.4198, + "step": 1495 + }, + { + "epoch": 1.0121786197564275, + "grad_norm": 1.0871744256064688, + "learning_rate": 8.380610817550256e-06, + "loss": 0.4385, + "step": 1496 + }, + { + "epoch": 1.0128552097428958, + "grad_norm": 1.1213217903098696, + "learning_rate": 8.377709152667513e-06, + "loss": 0.4608, + "step": 1497 + }, + { + "epoch": 1.013531799729364, + "grad_norm": 1.0568178302832327, + "learning_rate": 8.374805393787154e-06, + "loss": 0.4295, + "step": 1498 + }, + { + "epoch": 1.0142083897158323, + "grad_norm": 1.0866289032102656, + "learning_rate": 8.371899542709355e-06, + "loss": 0.4371, + "step": 1499 + }, + { + "epoch": 1.0148849797023005, + "grad_norm": 1.0691387752221155, + "learning_rate": 8.36899160123559e-06, + "loss": 0.4357, + "step": 1500 + }, + { + "epoch": 1.0155615696887685, + "grad_norm": 1.056073550598995, + "learning_rate": 8.366081571168625e-06, + "loss": 0.4327, + "step": 1501 + }, + { + "epoch": 1.0162381596752368, + "grad_norm": 1.1018468294030312, + "learning_rate": 8.363169454312518e-06, + "loss": 0.4437, + "step": 1502 + }, + { + "epoch": 1.016914749661705, + "grad_norm": 1.142094501778782, + "learning_rate": 8.36025525247263e-06, + "loss": 0.4566, + "step": 1503 + }, + { + "epoch": 1.0175913396481733, + "grad_norm": 1.1384720029539852, + "learning_rate": 8.357338967455605e-06, + "loss": 0.4558, + "step": 1504 + }, + { + "epoch": 1.0182679296346413, + "grad_norm": 1.0774145244239763, + "learning_rate": 8.354420601069384e-06, + "loss": 0.416, + "step": 1505 + }, + { + "epoch": 1.0189445196211095, + "grad_norm": 1.0638346943367993, + "learning_rate": 8.3515001551232e-06, + "loss": 0.4091, + "step": 1506 + }, + { + "epoch": 1.0196211096075778, + "grad_norm": 1.0742898883746654, + "learning_rate": 8.348577631427565e-06, + "loss": 0.4243, + "step": 1507 + }, + { + "epoch": 1.020297699594046, + "grad_norm": 1.1129992247357157, + "learning_rate": 8.345653031794292e-06, + "loss": 0.4484, + "step": 1508 + }, + { + "epoch": 1.0209742895805143, + "grad_norm": 1.1382557195899554, + "learning_rate": 8.342726358036473e-06, + "loss": 0.4537, + "step": 1509 + }, + { + "epoch": 1.0216508795669823, + "grad_norm": 1.1258317090624272, + "learning_rate": 8.339797611968488e-06, + "loss": 0.4212, + "step": 1510 + }, + { + "epoch": 1.0223274695534506, + "grad_norm": 1.0670788201964598, + "learning_rate": 8.336866795406003e-06, + "loss": 0.4386, + "step": 1511 + }, + { + "epoch": 1.0230040595399188, + "grad_norm": 1.080078835351708, + "learning_rate": 8.333933910165964e-06, + "loss": 0.4304, + "step": 1512 + }, + { + "epoch": 1.023680649526387, + "grad_norm": 1.0720296166600187, + "learning_rate": 8.3309989580666e-06, + "loss": 0.4505, + "step": 1513 + }, + { + "epoch": 1.0243572395128553, + "grad_norm": 1.086828424479624, + "learning_rate": 8.32806194092743e-06, + "loss": 0.4438, + "step": 1514 + }, + { + "epoch": 1.0250338294993233, + "grad_norm": 1.0861309314852288, + "learning_rate": 8.325122860569241e-06, + "loss": 0.4399, + "step": 1515 + }, + { + "epoch": 1.0257104194857916, + "grad_norm": 1.0419786852269919, + "learning_rate": 8.322181718814107e-06, + "loss": 0.4522, + "step": 1516 + }, + { + "epoch": 1.0263870094722598, + "grad_norm": 1.1276730398979562, + "learning_rate": 8.319238517485376e-06, + "loss": 0.4376, + "step": 1517 + }, + { + "epoch": 1.027063599458728, + "grad_norm": 1.0543197707149141, + "learning_rate": 8.316293258407673e-06, + "loss": 0.4276, + "step": 1518 + }, + { + "epoch": 1.0277401894451963, + "grad_norm": 1.0992416942236956, + "learning_rate": 8.313345943406903e-06, + "loss": 0.45, + "step": 1519 + }, + { + "epoch": 1.0284167794316643, + "grad_norm": 1.04032592481762, + "learning_rate": 8.310396574310239e-06, + "loss": 0.4142, + "step": 1520 + }, + { + "epoch": 1.0290933694181326, + "grad_norm": 1.0538815128264054, + "learning_rate": 8.307445152946133e-06, + "loss": 0.4455, + "step": 1521 + }, + { + "epoch": 1.0297699594046008, + "grad_norm": 1.0511836445262603, + "learning_rate": 8.304491681144306e-06, + "loss": 0.4416, + "step": 1522 + }, + { + "epoch": 1.030446549391069, + "grad_norm": 1.0377368559914297, + "learning_rate": 8.301536160735752e-06, + "loss": 0.419, + "step": 1523 + }, + { + "epoch": 1.0311231393775373, + "grad_norm": 1.0648575048510316, + "learning_rate": 8.298578593552737e-06, + "loss": 0.4403, + "step": 1524 + }, + { + "epoch": 1.0317997293640053, + "grad_norm": 1.0027013773600582, + "learning_rate": 8.295618981428788e-06, + "loss": 0.4161, + "step": 1525 + }, + { + "epoch": 1.0324763193504736, + "grad_norm": 1.100756799313297, + "learning_rate": 8.292657326198707e-06, + "loss": 0.4471, + "step": 1526 + }, + { + "epoch": 1.0331529093369418, + "grad_norm": 1.0709669894813856, + "learning_rate": 8.289693629698564e-06, + "loss": 0.4206, + "step": 1527 + }, + { + "epoch": 1.03382949932341, + "grad_norm": 1.1721190145944764, + "learning_rate": 8.286727893765687e-06, + "loss": 0.463, + "step": 1528 + }, + { + "epoch": 1.0345060893098783, + "grad_norm": 1.0595164921651812, + "learning_rate": 8.283760120238672e-06, + "loss": 0.4199, + "step": 1529 + }, + { + "epoch": 1.0351826792963463, + "grad_norm": 1.1084052432329463, + "learning_rate": 8.280790310957382e-06, + "loss": 0.436, + "step": 1530 + }, + { + "epoch": 1.0358592692828146, + "grad_norm": 1.1029448130094104, + "learning_rate": 8.277818467762937e-06, + "loss": 0.4217, + "step": 1531 + }, + { + "epoch": 1.0365358592692828, + "grad_norm": 1.0702473614609793, + "learning_rate": 8.27484459249772e-06, + "loss": 0.4369, + "step": 1532 + }, + { + "epoch": 1.037212449255751, + "grad_norm": 1.0715283201679295, + "learning_rate": 8.271868687005371e-06, + "loss": 0.4499, + "step": 1533 + }, + { + "epoch": 1.0378890392422193, + "grad_norm": 1.0883918479678936, + "learning_rate": 8.268890753130794e-06, + "loss": 0.4409, + "step": 1534 + }, + { + "epoch": 1.0385656292286873, + "grad_norm": 1.150533123005341, + "learning_rate": 8.265910792720147e-06, + "loss": 0.443, + "step": 1535 + }, + { + "epoch": 1.0392422192151556, + "grad_norm": 1.050674093677084, + "learning_rate": 8.262928807620843e-06, + "loss": 0.4245, + "step": 1536 + }, + { + "epoch": 1.0399188092016238, + "grad_norm": 1.0584633423392775, + "learning_rate": 8.259944799681555e-06, + "loss": 0.458, + "step": 1537 + }, + { + "epoch": 1.040595399188092, + "grad_norm": 1.0235428089559888, + "learning_rate": 8.256958770752203e-06, + "loss": 0.4144, + "step": 1538 + }, + { + "epoch": 1.0412719891745603, + "grad_norm": 1.129290621568861, + "learning_rate": 8.253970722683968e-06, + "loss": 0.4257, + "step": 1539 + }, + { + "epoch": 1.0419485791610283, + "grad_norm": 1.105278970568012, + "learning_rate": 8.250980657329278e-06, + "loss": 0.4439, + "step": 1540 + }, + { + "epoch": 1.0426251691474966, + "grad_norm": 1.10926981882421, + "learning_rate": 8.24798857654181e-06, + "loss": 0.4383, + "step": 1541 + }, + { + "epoch": 1.0433017591339648, + "grad_norm": 1.081545700249644, + "learning_rate": 8.244994482176495e-06, + "loss": 0.419, + "step": 1542 + }, + { + "epoch": 1.043978349120433, + "grad_norm": 1.0848326455416037, + "learning_rate": 8.241998376089508e-06, + "loss": 0.4323, + "step": 1543 + }, + { + "epoch": 1.044654939106901, + "grad_norm": 1.0956179683369176, + "learning_rate": 8.239000260138277e-06, + "loss": 0.4503, + "step": 1544 + }, + { + "epoch": 1.0453315290933693, + "grad_norm": 1.117039820391544, + "learning_rate": 8.236000136181468e-06, + "loss": 0.4533, + "step": 1545 + }, + { + "epoch": 1.0460081190798376, + "grad_norm": 1.0843051570447386, + "learning_rate": 8.232998006078998e-06, + "loss": 0.4451, + "step": 1546 + }, + { + "epoch": 1.0466847090663058, + "grad_norm": 1.0446116875656069, + "learning_rate": 8.229993871692028e-06, + "loss": 0.4231, + "step": 1547 + }, + { + "epoch": 1.047361299052774, + "grad_norm": 1.0394361472364462, + "learning_rate": 8.226987734882956e-06, + "loss": 0.4231, + "step": 1548 + }, + { + "epoch": 1.048037889039242, + "grad_norm": 1.107479284428652, + "learning_rate": 8.223979597515425e-06, + "loss": 0.4157, + "step": 1549 + }, + { + "epoch": 1.0487144790257104, + "grad_norm": 1.079236083196405, + "learning_rate": 8.220969461454322e-06, + "loss": 0.4323, + "step": 1550 + }, + { + "epoch": 1.0493910690121786, + "grad_norm": 1.1201728023363162, + "learning_rate": 8.217957328565765e-06, + "loss": 0.4481, + "step": 1551 + }, + { + "epoch": 1.0500676589986468, + "grad_norm": 1.1327604344703373, + "learning_rate": 8.214943200717114e-06, + "loss": 0.4624, + "step": 1552 + }, + { + "epoch": 1.050744248985115, + "grad_norm": 1.079497183316191, + "learning_rate": 8.211927079776969e-06, + "loss": 0.4398, + "step": 1553 + }, + { + "epoch": 1.0514208389715831, + "grad_norm": 1.0822644057515787, + "learning_rate": 8.208908967615159e-06, + "loss": 0.4321, + "step": 1554 + }, + { + "epoch": 1.0520974289580514, + "grad_norm": 1.0670379255597637, + "learning_rate": 8.205888866102753e-06, + "loss": 0.4209, + "step": 1555 + }, + { + "epoch": 1.0527740189445196, + "grad_norm": 1.0584843192491298, + "learning_rate": 8.202866777112049e-06, + "loss": 0.4274, + "step": 1556 + }, + { + "epoch": 1.0534506089309879, + "grad_norm": 1.0825157020208016, + "learning_rate": 8.199842702516584e-06, + "loss": 0.4231, + "step": 1557 + }, + { + "epoch": 1.054127198917456, + "grad_norm": 1.181058072108386, + "learning_rate": 8.196816644191116e-06, + "loss": 0.4543, + "step": 1558 + }, + { + "epoch": 1.0548037889039241, + "grad_norm": 1.1465126958375453, + "learning_rate": 8.193788604011639e-06, + "loss": 0.4317, + "step": 1559 + }, + { + "epoch": 1.0554803788903924, + "grad_norm": 1.1798399915490474, + "learning_rate": 8.190758583855379e-06, + "loss": 0.4373, + "step": 1560 + }, + { + "epoch": 1.0561569688768606, + "grad_norm": 1.0958346515864203, + "learning_rate": 8.187726585600779e-06, + "loss": 0.4418, + "step": 1561 + }, + { + "epoch": 1.0568335588633289, + "grad_norm": 1.0854684147062201, + "learning_rate": 8.18469261112752e-06, + "loss": 0.4438, + "step": 1562 + }, + { + "epoch": 1.057510148849797, + "grad_norm": 1.120287507520833, + "learning_rate": 8.181656662316498e-06, + "loss": 0.4565, + "step": 1563 + }, + { + "epoch": 1.0581867388362651, + "grad_norm": 1.1343778766597947, + "learning_rate": 8.178618741049841e-06, + "loss": 0.4498, + "step": 1564 + }, + { + "epoch": 1.0588633288227334, + "grad_norm": 1.1136835433012175, + "learning_rate": 8.175578849210894e-06, + "loss": 0.4388, + "step": 1565 + }, + { + "epoch": 1.0595399188092016, + "grad_norm": 1.0857050880221115, + "learning_rate": 8.172536988684227e-06, + "loss": 0.4312, + "step": 1566 + }, + { + "epoch": 1.0602165087956699, + "grad_norm": 1.1483632473808032, + "learning_rate": 8.169493161355632e-06, + "loss": 0.4456, + "step": 1567 + }, + { + "epoch": 1.060893098782138, + "grad_norm": 1.0666774990800618, + "learning_rate": 8.166447369112115e-06, + "loss": 0.4272, + "step": 1568 + }, + { + "epoch": 1.0615696887686061, + "grad_norm": 1.1191435336157294, + "learning_rate": 8.163399613841903e-06, + "loss": 0.4575, + "step": 1569 + }, + { + "epoch": 1.0622462787550744, + "grad_norm": 1.1086911588514294, + "learning_rate": 8.160349897434441e-06, + "loss": 0.4386, + "step": 1570 + }, + { + "epoch": 1.0629228687415426, + "grad_norm": 1.0592884396850444, + "learning_rate": 8.157298221780388e-06, + "loss": 0.4127, + "step": 1571 + }, + { + "epoch": 1.0635994587280109, + "grad_norm": 1.0673531117047905, + "learning_rate": 8.15424458877162e-06, + "loss": 0.4336, + "step": 1572 + }, + { + "epoch": 1.0642760487144791, + "grad_norm": 1.0447219327894612, + "learning_rate": 8.151189000301223e-06, + "loss": 0.4187, + "step": 1573 + }, + { + "epoch": 1.0649526387009471, + "grad_norm": 1.1356290836158105, + "learning_rate": 8.148131458263499e-06, + "loss": 0.4403, + "step": 1574 + }, + { + "epoch": 1.0656292286874154, + "grad_norm": 1.067719985999696, + "learning_rate": 8.145071964553956e-06, + "loss": 0.4495, + "step": 1575 + }, + { + "epoch": 1.0663058186738836, + "grad_norm": 1.0700587064794442, + "learning_rate": 8.142010521069319e-06, + "loss": 0.4354, + "step": 1576 + }, + { + "epoch": 1.0669824086603519, + "grad_norm": 1.066775298182691, + "learning_rate": 8.138947129707517e-06, + "loss": 0.4459, + "step": 1577 + }, + { + "epoch": 1.0676589986468201, + "grad_norm": 1.015095176896773, + "learning_rate": 8.135881792367686e-06, + "loss": 0.4104, + "step": 1578 + }, + { + "epoch": 1.0683355886332881, + "grad_norm": 1.093006565458466, + "learning_rate": 8.132814510950172e-06, + "loss": 0.443, + "step": 1579 + }, + { + "epoch": 1.0690121786197564, + "grad_norm": 1.1394810941904618, + "learning_rate": 8.129745287356521e-06, + "loss": 0.4419, + "step": 1580 + }, + { + "epoch": 1.0696887686062246, + "grad_norm": 1.0757647619739268, + "learning_rate": 8.12667412348949e-06, + "loss": 0.4099, + "step": 1581 + }, + { + "epoch": 1.0703653585926929, + "grad_norm": 1.0635248887059734, + "learning_rate": 8.12360102125303e-06, + "loss": 0.4362, + "step": 1582 + }, + { + "epoch": 1.0710419485791611, + "grad_norm": 1.1229702673160813, + "learning_rate": 8.120525982552304e-06, + "loss": 0.4478, + "step": 1583 + }, + { + "epoch": 1.0717185385656292, + "grad_norm": 1.1139017704916843, + "learning_rate": 8.117449009293668e-06, + "loss": 0.4457, + "step": 1584 + }, + { + "epoch": 1.0723951285520974, + "grad_norm": 1.1654561080780221, + "learning_rate": 8.11437010338468e-06, + "loss": 0.4449, + "step": 1585 + }, + { + "epoch": 1.0730717185385656, + "grad_norm": 1.0648716386807038, + "learning_rate": 8.111289266734095e-06, + "loss": 0.4176, + "step": 1586 + }, + { + "epoch": 1.073748308525034, + "grad_norm": 1.0903946735801888, + "learning_rate": 8.108206501251868e-06, + "loss": 0.4388, + "step": 1587 + }, + { + "epoch": 1.0744248985115021, + "grad_norm": 1.1089102923249174, + "learning_rate": 8.105121808849143e-06, + "loss": 0.4386, + "step": 1588 + }, + { + "epoch": 1.0751014884979702, + "grad_norm": 1.10952490226382, + "learning_rate": 8.102035191438268e-06, + "loss": 0.4499, + "step": 1589 + }, + { + "epoch": 1.0757780784844384, + "grad_norm": 1.0363507803276348, + "learning_rate": 8.098946650932776e-06, + "loss": 0.4223, + "step": 1590 + }, + { + "epoch": 1.0764546684709067, + "grad_norm": 1.1276298629414534, + "learning_rate": 8.095856189247396e-06, + "loss": 0.464, + "step": 1591 + }, + { + "epoch": 1.077131258457375, + "grad_norm": 1.0915285211783439, + "learning_rate": 8.092763808298048e-06, + "loss": 0.4268, + "step": 1592 + }, + { + "epoch": 1.0778078484438431, + "grad_norm": 1.0929302102253826, + "learning_rate": 8.089669510001843e-06, + "loss": 0.4348, + "step": 1593 + }, + { + "epoch": 1.0784844384303112, + "grad_norm": 1.0418638573485512, + "learning_rate": 8.086573296277078e-06, + "loss": 0.4188, + "step": 1594 + }, + { + "epoch": 1.0791610284167794, + "grad_norm": 1.1262701903447359, + "learning_rate": 8.083475169043237e-06, + "loss": 0.442, + "step": 1595 + }, + { + "epoch": 1.0798376184032477, + "grad_norm": 1.0278587665611612, + "learning_rate": 8.080375130220995e-06, + "loss": 0.4231, + "step": 1596 + }, + { + "epoch": 1.080514208389716, + "grad_norm": 1.0563790251020893, + "learning_rate": 8.077273181732207e-06, + "loss": 0.4177, + "step": 1597 + }, + { + "epoch": 1.0811907983761841, + "grad_norm": 1.0920356215333442, + "learning_rate": 8.074169325499915e-06, + "loss": 0.4391, + "step": 1598 + }, + { + "epoch": 1.0818673883626522, + "grad_norm": 1.1080050801969525, + "learning_rate": 8.071063563448341e-06, + "loss": 0.4494, + "step": 1599 + }, + { + "epoch": 1.0825439783491204, + "grad_norm": 1.0302226769676113, + "learning_rate": 8.06795589750289e-06, + "loss": 0.4275, + "step": 1600 + }, + { + "epoch": 1.0832205683355887, + "grad_norm": 1.0325206481334062, + "learning_rate": 8.06484632959015e-06, + "loss": 0.4297, + "step": 1601 + }, + { + "epoch": 1.083897158322057, + "grad_norm": 1.0812778227744115, + "learning_rate": 8.061734861637883e-06, + "loss": 0.431, + "step": 1602 + }, + { + "epoch": 1.084573748308525, + "grad_norm": 1.080243386973949, + "learning_rate": 8.058621495575032e-06, + "loss": 0.4419, + "step": 1603 + }, + { + "epoch": 1.0852503382949932, + "grad_norm": 1.0736255387871239, + "learning_rate": 8.055506233331718e-06, + "loss": 0.4333, + "step": 1604 + }, + { + "epoch": 1.0859269282814614, + "grad_norm": 1.0677306821071324, + "learning_rate": 8.052389076839233e-06, + "loss": 0.4346, + "step": 1605 + }, + { + "epoch": 1.0866035182679297, + "grad_norm": 1.1076916294256132, + "learning_rate": 8.049270028030045e-06, + "loss": 0.4269, + "step": 1606 + }, + { + "epoch": 1.087280108254398, + "grad_norm": 1.0213960167654488, + "learning_rate": 8.046149088837803e-06, + "loss": 0.4278, + "step": 1607 + }, + { + "epoch": 1.087956698240866, + "grad_norm": 1.0634181798533768, + "learning_rate": 8.043026261197312e-06, + "loss": 0.4283, + "step": 1608 + }, + { + "epoch": 1.0886332882273342, + "grad_norm": 1.1108831071175451, + "learning_rate": 8.039901547044564e-06, + "loss": 0.4376, + "step": 1609 + }, + { + "epoch": 1.0893098782138024, + "grad_norm": 1.1452809988683992, + "learning_rate": 8.03677494831671e-06, + "loss": 0.4391, + "step": 1610 + }, + { + "epoch": 1.0899864682002707, + "grad_norm": 1.0419528450291402, + "learning_rate": 8.033646466952072e-06, + "loss": 0.4256, + "step": 1611 + }, + { + "epoch": 1.090663058186739, + "grad_norm": 1.072886295704959, + "learning_rate": 8.03051610489014e-06, + "loss": 0.4272, + "step": 1612 + }, + { + "epoch": 1.091339648173207, + "grad_norm": 1.0659495437851878, + "learning_rate": 8.027383864071573e-06, + "loss": 0.4238, + "step": 1613 + }, + { + "epoch": 1.0920162381596752, + "grad_norm": 1.037692414853626, + "learning_rate": 8.024249746438189e-06, + "loss": 0.4231, + "step": 1614 + }, + { + "epoch": 1.0926928281461434, + "grad_norm": 1.054437815665456, + "learning_rate": 8.021113753932972e-06, + "loss": 0.4244, + "step": 1615 + }, + { + "epoch": 1.0933694181326117, + "grad_norm": 1.0311388550159604, + "learning_rate": 8.017975888500067e-06, + "loss": 0.4125, + "step": 1616 + }, + { + "epoch": 1.09404600811908, + "grad_norm": 1.082570047344452, + "learning_rate": 8.014836152084784e-06, + "loss": 0.4128, + "step": 1617 + }, + { + "epoch": 1.094722598105548, + "grad_norm": 1.1164160306228927, + "learning_rate": 8.01169454663359e-06, + "loss": 0.4283, + "step": 1618 + }, + { + "epoch": 1.0953991880920162, + "grad_norm": 1.1089682584435943, + "learning_rate": 8.008551074094108e-06, + "loss": 0.4249, + "step": 1619 + }, + { + "epoch": 1.0960757780784844, + "grad_norm": 1.0509464150438623, + "learning_rate": 8.005405736415127e-06, + "loss": 0.4338, + "step": 1620 + }, + { + "epoch": 1.0967523680649527, + "grad_norm": 1.1040363318896815, + "learning_rate": 8.00225853554658e-06, + "loss": 0.4366, + "step": 1621 + }, + { + "epoch": 1.097428958051421, + "grad_norm": 1.0837023230855898, + "learning_rate": 7.99910947343957e-06, + "loss": 0.4384, + "step": 1622 + }, + { + "epoch": 1.098105548037889, + "grad_norm": 0.9985884908899905, + "learning_rate": 7.995958552046338e-06, + "loss": 0.4087, + "step": 1623 + }, + { + "epoch": 1.0987821380243572, + "grad_norm": 1.1238500609596405, + "learning_rate": 7.99280577332029e-06, + "loss": 0.4446, + "step": 1624 + }, + { + "epoch": 1.0994587280108254, + "grad_norm": 1.0806190421892934, + "learning_rate": 7.989651139215979e-06, + "loss": 0.4263, + "step": 1625 + }, + { + "epoch": 1.1001353179972937, + "grad_norm": 1.1036194550943939, + "learning_rate": 7.986494651689104e-06, + "loss": 0.4327, + "step": 1626 + }, + { + "epoch": 1.100811907983762, + "grad_norm": 1.0420743085626456, + "learning_rate": 7.983336312696521e-06, + "loss": 0.401, + "step": 1627 + }, + { + "epoch": 1.10148849797023, + "grad_norm": 1.1034746489027385, + "learning_rate": 7.980176124196231e-06, + "loss": 0.4186, + "step": 1628 + }, + { + "epoch": 1.1021650879566982, + "grad_norm": 1.0759985686089186, + "learning_rate": 7.977014088147375e-06, + "loss": 0.4426, + "step": 1629 + }, + { + "epoch": 1.1028416779431665, + "grad_norm": 1.0744308311896629, + "learning_rate": 7.973850206510251e-06, + "loss": 0.4208, + "step": 1630 + }, + { + "epoch": 1.1035182679296347, + "grad_norm": 1.0771285237222417, + "learning_rate": 7.970684481246291e-06, + "loss": 0.4208, + "step": 1631 + }, + { + "epoch": 1.104194857916103, + "grad_norm": 1.0803456413753414, + "learning_rate": 7.967516914318075e-06, + "loss": 0.4373, + "step": 1632 + }, + { + "epoch": 1.104871447902571, + "grad_norm": 1.0294107599945008, + "learning_rate": 7.964347507689325e-06, + "loss": 0.4179, + "step": 1633 + }, + { + "epoch": 1.1055480378890392, + "grad_norm": 1.0942476140701136, + "learning_rate": 7.961176263324902e-06, + "loss": 0.4409, + "step": 1634 + }, + { + "epoch": 1.1062246278755075, + "grad_norm": 1.0550016369795785, + "learning_rate": 7.958003183190804e-06, + "loss": 0.4229, + "step": 1635 + }, + { + "epoch": 1.1069012178619757, + "grad_norm": 1.0228007409500721, + "learning_rate": 7.954828269254173e-06, + "loss": 0.4125, + "step": 1636 + }, + { + "epoch": 1.1075778078484437, + "grad_norm": 1.0810395794512198, + "learning_rate": 7.951651523483283e-06, + "loss": 0.4216, + "step": 1637 + }, + { + "epoch": 1.108254397834912, + "grad_norm": 1.1131415243707072, + "learning_rate": 7.948472947847546e-06, + "loss": 0.4459, + "step": 1638 + }, + { + "epoch": 1.1089309878213802, + "grad_norm": 1.1130044089257685, + "learning_rate": 7.945292544317505e-06, + "loss": 0.4293, + "step": 1639 + }, + { + "epoch": 1.1096075778078485, + "grad_norm": 1.0714274606208691, + "learning_rate": 7.942110314864842e-06, + "loss": 0.4462, + "step": 1640 + }, + { + "epoch": 1.1102841677943167, + "grad_norm": 1.0671923969322772, + "learning_rate": 7.938926261462366e-06, + "loss": 0.4472, + "step": 1641 + }, + { + "epoch": 1.1109607577807847, + "grad_norm": 1.0769530076724063, + "learning_rate": 7.93574038608402e-06, + "loss": 0.4363, + "step": 1642 + }, + { + "epoch": 1.111637347767253, + "grad_norm": 1.024809503611195, + "learning_rate": 7.932552690704871e-06, + "loss": 0.4316, + "step": 1643 + }, + { + "epoch": 1.1123139377537212, + "grad_norm": 1.103139310008525, + "learning_rate": 7.929363177301124e-06, + "loss": 0.4363, + "step": 1644 + }, + { + "epoch": 1.1129905277401895, + "grad_norm": 1.0840748778510945, + "learning_rate": 7.926171847850101e-06, + "loss": 0.4403, + "step": 1645 + }, + { + "epoch": 1.1136671177266577, + "grad_norm": 1.1234584719469616, + "learning_rate": 7.922978704330257e-06, + "loss": 0.4454, + "step": 1646 + }, + { + "epoch": 1.1143437077131257, + "grad_norm": 1.047915762428863, + "learning_rate": 7.919783748721169e-06, + "loss": 0.4319, + "step": 1647 + }, + { + "epoch": 1.115020297699594, + "grad_norm": 1.047247514056654, + "learning_rate": 7.916586983003534e-06, + "loss": 0.4294, + "step": 1648 + }, + { + "epoch": 1.1156968876860622, + "grad_norm": 1.017429778766552, + "learning_rate": 7.913388409159175e-06, + "loss": 0.401, + "step": 1649 + }, + { + "epoch": 1.1163734776725305, + "grad_norm": 1.0535665838044446, + "learning_rate": 7.910188029171039e-06, + "loss": 0.4375, + "step": 1650 + }, + { + "epoch": 1.1170500676589987, + "grad_norm": 1.0442179948495736, + "learning_rate": 7.906985845023187e-06, + "loss": 0.4111, + "step": 1651 + }, + { + "epoch": 1.1177266576454667, + "grad_norm": 1.132898799560178, + "learning_rate": 7.903781858700799e-06, + "loss": 0.449, + "step": 1652 + }, + { + "epoch": 1.118403247631935, + "grad_norm": 1.0525393249078858, + "learning_rate": 7.900576072190177e-06, + "loss": 0.4283, + "step": 1653 + }, + { + "epoch": 1.1190798376184032, + "grad_norm": 1.0962682310224106, + "learning_rate": 7.897368487478733e-06, + "loss": 0.435, + "step": 1654 + }, + { + "epoch": 1.1197564276048715, + "grad_norm": 1.1492955729743703, + "learning_rate": 7.894159106554997e-06, + "loss": 0.4382, + "step": 1655 + }, + { + "epoch": 1.1204330175913397, + "grad_norm": 1.1188180147367437, + "learning_rate": 7.890947931408614e-06, + "loss": 0.4313, + "step": 1656 + }, + { + "epoch": 1.1211096075778078, + "grad_norm": 1.0602995977916774, + "learning_rate": 7.887734964030337e-06, + "loss": 0.4404, + "step": 1657 + }, + { + "epoch": 1.121786197564276, + "grad_norm": 1.1525937637165338, + "learning_rate": 7.884520206412036e-06, + "loss": 0.4456, + "step": 1658 + }, + { + "epoch": 1.1224627875507442, + "grad_norm": 1.092653723544954, + "learning_rate": 7.881303660546684e-06, + "loss": 0.4438, + "step": 1659 + }, + { + "epoch": 1.1231393775372125, + "grad_norm": 1.100506828048419, + "learning_rate": 7.87808532842837e-06, + "loss": 0.4472, + "step": 1660 + }, + { + "epoch": 1.1238159675236807, + "grad_norm": 1.1236745282818088, + "learning_rate": 7.87486521205228e-06, + "loss": 0.4485, + "step": 1661 + }, + { + "epoch": 1.1244925575101488, + "grad_norm": 1.0868924336816534, + "learning_rate": 7.871643313414718e-06, + "loss": 0.4328, + "step": 1662 + }, + { + "epoch": 1.125169147496617, + "grad_norm": 1.1118360878728375, + "learning_rate": 7.868419634513087e-06, + "loss": 0.4251, + "step": 1663 + }, + { + "epoch": 1.1258457374830853, + "grad_norm": 1.0614445418034166, + "learning_rate": 7.865194177345894e-06, + "loss": 0.4133, + "step": 1664 + }, + { + "epoch": 1.1265223274695535, + "grad_norm": 1.0529800234272884, + "learning_rate": 7.861966943912746e-06, + "loss": 0.4228, + "step": 1665 + }, + { + "epoch": 1.1271989174560217, + "grad_norm": 1.1584257350576903, + "learning_rate": 7.858737936214355e-06, + "loss": 0.4319, + "step": 1666 + }, + { + "epoch": 1.1278755074424898, + "grad_norm": 1.1349258108372389, + "learning_rate": 7.855507156252536e-06, + "loss": 0.4516, + "step": 1667 + }, + { + "epoch": 1.128552097428958, + "grad_norm": 1.0868559683076846, + "learning_rate": 7.852274606030191e-06, + "loss": 0.4331, + "step": 1668 + }, + { + "epoch": 1.1292286874154263, + "grad_norm": 1.0401928949814874, + "learning_rate": 7.849040287551331e-06, + "loss": 0.4358, + "step": 1669 + }, + { + "epoch": 1.1299052774018945, + "grad_norm": 1.0452570529625524, + "learning_rate": 7.84580420282106e-06, + "loss": 0.4344, + "step": 1670 + }, + { + "epoch": 1.1305818673883627, + "grad_norm": 1.117649207760153, + "learning_rate": 7.842566353845575e-06, + "loss": 0.4471, + "step": 1671 + }, + { + "epoch": 1.1312584573748308, + "grad_norm": 1.0851161080602902, + "learning_rate": 7.839326742632168e-06, + "loss": 0.425, + "step": 1672 + }, + { + "epoch": 1.131935047361299, + "grad_norm": 1.0466829112444798, + "learning_rate": 7.836085371189221e-06, + "loss": 0.4281, + "step": 1673 + }, + { + "epoch": 1.1326116373477673, + "grad_norm": 1.0475345778625593, + "learning_rate": 7.832842241526212e-06, + "loss": 0.4167, + "step": 1674 + }, + { + "epoch": 1.1332882273342355, + "grad_norm": 1.014363698097369, + "learning_rate": 7.829597355653707e-06, + "loss": 0.4098, + "step": 1675 + }, + { + "epoch": 1.1339648173207038, + "grad_norm": 1.0372480643891817, + "learning_rate": 7.82635071558336e-06, + "loss": 0.4379, + "step": 1676 + }, + { + "epoch": 1.1346414073071718, + "grad_norm": 1.0327954851408154, + "learning_rate": 7.82310232332791e-06, + "loss": 0.4272, + "step": 1677 + }, + { + "epoch": 1.13531799729364, + "grad_norm": 1.0835803359927758, + "learning_rate": 7.81985218090119e-06, + "loss": 0.4432, + "step": 1678 + }, + { + "epoch": 1.1359945872801083, + "grad_norm": 1.1131813840327025, + "learning_rate": 7.81660029031811e-06, + "loss": 0.444, + "step": 1679 + }, + { + "epoch": 1.1366711772665765, + "grad_norm": 1.080588584432741, + "learning_rate": 7.813346653594667e-06, + "loss": 0.4344, + "step": 1680 + }, + { + "epoch": 1.1373477672530448, + "grad_norm": 1.0744618414363065, + "learning_rate": 7.810091272747943e-06, + "loss": 0.4042, + "step": 1681 + }, + { + "epoch": 1.1380243572395128, + "grad_norm": 1.0936668071244344, + "learning_rate": 7.806834149796094e-06, + "loss": 0.4375, + "step": 1682 + }, + { + "epoch": 1.138700947225981, + "grad_norm": 1.1156391174164546, + "learning_rate": 7.803575286758365e-06, + "loss": 0.4416, + "step": 1683 + }, + { + "epoch": 1.1393775372124493, + "grad_norm": 1.0789725899731237, + "learning_rate": 7.800314685655072e-06, + "loss": 0.4342, + "step": 1684 + }, + { + "epoch": 1.1400541271989175, + "grad_norm": 1.1220753499974567, + "learning_rate": 7.797052348507614e-06, + "loss": 0.4468, + "step": 1685 + }, + { + "epoch": 1.1407307171853858, + "grad_norm": 1.0665818324168077, + "learning_rate": 7.793788277338464e-06, + "loss": 0.4237, + "step": 1686 + }, + { + "epoch": 1.1414073071718538, + "grad_norm": 1.0722274248439019, + "learning_rate": 7.790522474171171e-06, + "loss": 0.4257, + "step": 1687 + }, + { + "epoch": 1.142083897158322, + "grad_norm": 1.0955116714359816, + "learning_rate": 7.787254941030353e-06, + "loss": 0.428, + "step": 1688 + }, + { + "epoch": 1.1427604871447903, + "grad_norm": 1.0726365796467348, + "learning_rate": 7.78398567994171e-06, + "loss": 0.4148, + "step": 1689 + }, + { + "epoch": 1.1434370771312585, + "grad_norm": 1.0856929960125399, + "learning_rate": 7.780714692932002e-06, + "loss": 0.4265, + "step": 1690 + }, + { + "epoch": 1.1441136671177268, + "grad_norm": 1.0473114825099195, + "learning_rate": 7.777441982029072e-06, + "loss": 0.4115, + "step": 1691 + }, + { + "epoch": 1.1447902571041948, + "grad_norm": 1.0510596721044594, + "learning_rate": 7.774167549261817e-06, + "loss": 0.4145, + "step": 1692 + }, + { + "epoch": 1.145466847090663, + "grad_norm": 1.0577858768604804, + "learning_rate": 7.770891396660212e-06, + "loss": 0.4363, + "step": 1693 + }, + { + "epoch": 1.1461434370771313, + "grad_norm": 1.1009706681569988, + "learning_rate": 7.767613526255296e-06, + "loss": 0.4451, + "step": 1694 + }, + { + "epoch": 1.1468200270635995, + "grad_norm": 1.0041036698381118, + "learning_rate": 7.764333940079169e-06, + "loss": 0.4029, + "step": 1695 + }, + { + "epoch": 1.1474966170500678, + "grad_norm": 1.0887492064401576, + "learning_rate": 7.761052640165e-06, + "loss": 0.4282, + "step": 1696 + }, + { + "epoch": 1.1481732070365358, + "grad_norm": 1.1042918742864336, + "learning_rate": 7.757769628547018e-06, + "loss": 0.4415, + "step": 1697 + }, + { + "epoch": 1.148849797023004, + "grad_norm": 1.0795478153043707, + "learning_rate": 7.754484907260513e-06, + "loss": 0.4363, + "step": 1698 + }, + { + "epoch": 1.1495263870094723, + "grad_norm": 1.1211007669102866, + "learning_rate": 7.751198478341836e-06, + "loss": 0.4494, + "step": 1699 + }, + { + "epoch": 1.1502029769959405, + "grad_norm": 1.0461161449887515, + "learning_rate": 7.747910343828391e-06, + "loss": 0.4185, + "step": 1700 + }, + { + "epoch": 1.1508795669824086, + "grad_norm": 1.0250573855164917, + "learning_rate": 7.744620505758652e-06, + "loss": 0.4149, + "step": 1701 + }, + { + "epoch": 1.1515561569688768, + "grad_norm": 1.1020192972492517, + "learning_rate": 7.741328966172134e-06, + "loss": 0.4289, + "step": 1702 + }, + { + "epoch": 1.152232746955345, + "grad_norm": 1.078114451975737, + "learning_rate": 7.738035727109418e-06, + "loss": 0.4157, + "step": 1703 + }, + { + "epoch": 1.1529093369418133, + "grad_norm": 1.0370301746075743, + "learning_rate": 7.734740790612137e-06, + "loss": 0.4434, + "step": 1704 + }, + { + "epoch": 1.1535859269282815, + "grad_norm": 1.0787767586201404, + "learning_rate": 7.731444158722967e-06, + "loss": 0.4315, + "step": 1705 + }, + { + "epoch": 1.1542625169147496, + "grad_norm": 1.0932127316445852, + "learning_rate": 7.728145833485647e-06, + "loss": 0.4349, + "step": 1706 + }, + { + "epoch": 1.1549391069012178, + "grad_norm": 1.035041805932224, + "learning_rate": 7.724845816944962e-06, + "loss": 0.438, + "step": 1707 + }, + { + "epoch": 1.155615696887686, + "grad_norm": 1.0324469297545122, + "learning_rate": 7.72154411114674e-06, + "loss": 0.4159, + "step": 1708 + }, + { + "epoch": 1.1562922868741543, + "grad_norm": 1.0235591812233056, + "learning_rate": 7.718240718137863e-06, + "loss": 0.3985, + "step": 1709 + }, + { + "epoch": 1.1569688768606226, + "grad_norm": 1.109226175562451, + "learning_rate": 7.714935639966257e-06, + "loss": 0.4203, + "step": 1710 + }, + { + "epoch": 1.1576454668470906, + "grad_norm": 1.1047819932685186, + "learning_rate": 7.711628878680892e-06, + "loss": 0.4381, + "step": 1711 + }, + { + "epoch": 1.1583220568335588, + "grad_norm": 1.0899695902742073, + "learning_rate": 7.708320436331782e-06, + "loss": 0.4391, + "step": 1712 + }, + { + "epoch": 1.158998646820027, + "grad_norm": 1.1171269964445691, + "learning_rate": 7.705010314969983e-06, + "loss": 0.449, + "step": 1713 + }, + { + "epoch": 1.1596752368064953, + "grad_norm": 1.134971538790631, + "learning_rate": 7.70169851664759e-06, + "loss": 0.4369, + "step": 1714 + }, + { + "epoch": 1.1603518267929636, + "grad_norm": 1.0516597377957497, + "learning_rate": 7.698385043417741e-06, + "loss": 0.4009, + "step": 1715 + }, + { + "epoch": 1.1610284167794316, + "grad_norm": 1.0468780025418476, + "learning_rate": 7.695069897334613e-06, + "loss": 0.4137, + "step": 1716 + }, + { + "epoch": 1.1617050067658998, + "grad_norm": 1.1094964357843518, + "learning_rate": 7.691753080453413e-06, + "loss": 0.4387, + "step": 1717 + }, + { + "epoch": 1.162381596752368, + "grad_norm": 1.1545923437236598, + "learning_rate": 7.688434594830392e-06, + "loss": 0.4404, + "step": 1718 + }, + { + "epoch": 1.1630581867388363, + "grad_norm": 1.0608018567651694, + "learning_rate": 7.685114442522831e-06, + "loss": 0.416, + "step": 1719 + }, + { + "epoch": 1.1637347767253043, + "grad_norm": 1.0968318468852076, + "learning_rate": 7.681792625589046e-06, + "loss": 0.4431, + "step": 1720 + }, + { + "epoch": 1.1644113667117726, + "grad_norm": 1.0488681736636545, + "learning_rate": 7.678469146088385e-06, + "loss": 0.4313, + "step": 1721 + }, + { + "epoch": 1.1650879566982408, + "grad_norm": 1.0400668779503623, + "learning_rate": 7.675144006081225e-06, + "loss": 0.4082, + "step": 1722 + }, + { + "epoch": 1.165764546684709, + "grad_norm": 1.0632258969178827, + "learning_rate": 7.671817207628973e-06, + "loss": 0.4234, + "step": 1723 + }, + { + "epoch": 1.1664411366711773, + "grad_norm": 1.030818486913185, + "learning_rate": 7.668488752794067e-06, + "loss": 0.4307, + "step": 1724 + }, + { + "epoch": 1.1671177266576453, + "grad_norm": 1.098843164214365, + "learning_rate": 7.66515864363997e-06, + "loss": 0.4415, + "step": 1725 + }, + { + "epoch": 1.1677943166441136, + "grad_norm": 1.0680730548706054, + "learning_rate": 7.661826882231165e-06, + "loss": 0.4418, + "step": 1726 + }, + { + "epoch": 1.1684709066305818, + "grad_norm": 1.0834139813120975, + "learning_rate": 7.658493470633173e-06, + "loss": 0.4426, + "step": 1727 + }, + { + "epoch": 1.16914749661705, + "grad_norm": 1.122949182521549, + "learning_rate": 7.65515841091252e-06, + "loss": 0.4591, + "step": 1728 + }, + { + "epoch": 1.1698240866035183, + "grad_norm": 1.0777417035253167, + "learning_rate": 7.651821705136771e-06, + "loss": 0.4362, + "step": 1729 + }, + { + "epoch": 1.1705006765899864, + "grad_norm": 1.0344411056612368, + "learning_rate": 7.648483355374496e-06, + "loss": 0.4196, + "step": 1730 + }, + { + "epoch": 1.1711772665764546, + "grad_norm": 1.0400708813650934, + "learning_rate": 7.645143363695302e-06, + "loss": 0.4296, + "step": 1731 + }, + { + "epoch": 1.1718538565629228, + "grad_norm": 1.0462815157853997, + "learning_rate": 7.641801732169796e-06, + "loss": 0.43, + "step": 1732 + }, + { + "epoch": 1.172530446549391, + "grad_norm": 1.082777131755936, + "learning_rate": 7.63845846286961e-06, + "loss": 0.4317, + "step": 1733 + }, + { + "epoch": 1.1732070365358593, + "grad_norm": 1.090354407656327, + "learning_rate": 7.635113557867395e-06, + "loss": 0.4321, + "step": 1734 + }, + { + "epoch": 1.1738836265223274, + "grad_norm": 1.0788999438827593, + "learning_rate": 7.63176701923681e-06, + "loss": 0.4276, + "step": 1735 + }, + { + "epoch": 1.1745602165087956, + "grad_norm": 1.0793553420262387, + "learning_rate": 7.628418849052523e-06, + "loss": 0.4232, + "step": 1736 + }, + { + "epoch": 1.1752368064952639, + "grad_norm": 1.057948289081961, + "learning_rate": 7.625069049390228e-06, + "loss": 0.4259, + "step": 1737 + }, + { + "epoch": 1.175913396481732, + "grad_norm": 1.0683544191354293, + "learning_rate": 7.621717622326617e-06, + "loss": 0.4332, + "step": 1738 + }, + { + "epoch": 1.1765899864682003, + "grad_norm": 1.11566521100598, + "learning_rate": 7.61836456993939e-06, + "loss": 0.4405, + "step": 1739 + }, + { + "epoch": 1.1772665764546684, + "grad_norm": 1.0636654982281053, + "learning_rate": 7.615009894307263e-06, + "loss": 0.426, + "step": 1740 + }, + { + "epoch": 1.1779431664411366, + "grad_norm": 1.0773563958574097, + "learning_rate": 7.611653597509954e-06, + "loss": 0.4278, + "step": 1741 + }, + { + "epoch": 1.1786197564276049, + "grad_norm": 1.0135105759166276, + "learning_rate": 7.608295681628185e-06, + "loss": 0.4164, + "step": 1742 + }, + { + "epoch": 1.179296346414073, + "grad_norm": 1.0820504998658946, + "learning_rate": 7.604936148743682e-06, + "loss": 0.4202, + "step": 1743 + }, + { + "epoch": 1.1799729364005414, + "grad_norm": 1.1241255054691106, + "learning_rate": 7.6015750009391776e-06, + "loss": 0.4572, + "step": 1744 + }, + { + "epoch": 1.1806495263870094, + "grad_norm": 1.0775197137939894, + "learning_rate": 7.5982122402983986e-06, + "loss": 0.4487, + "step": 1745 + }, + { + "epoch": 1.1813261163734776, + "grad_norm": 1.0262329998702866, + "learning_rate": 7.594847868906076e-06, + "loss": 0.4138, + "step": 1746 + }, + { + "epoch": 1.1820027063599459, + "grad_norm": 1.0803460081167837, + "learning_rate": 7.5914818888479406e-06, + "loss": 0.4315, + "step": 1747 + }, + { + "epoch": 1.182679296346414, + "grad_norm": 1.0771088692402047, + "learning_rate": 7.588114302210719e-06, + "loss": 0.4364, + "step": 1748 + }, + { + "epoch": 1.1833558863328824, + "grad_norm": 1.0496325532095967, + "learning_rate": 7.584745111082128e-06, + "loss": 0.4338, + "step": 1749 + }, + { + "epoch": 1.1840324763193504, + "grad_norm": 1.0852773892249585, + "learning_rate": 7.5813743175508914e-06, + "loss": 0.4351, + "step": 1750 + }, + { + "epoch": 1.1847090663058186, + "grad_norm": 1.0352845427311212, + "learning_rate": 7.578001923706715e-06, + "loss": 0.4362, + "step": 1751 + }, + { + "epoch": 1.1853856562922869, + "grad_norm": 1.0373837306819627, + "learning_rate": 7.574627931640304e-06, + "loss": 0.4294, + "step": 1752 + }, + { + "epoch": 1.1860622462787551, + "grad_norm": 1.0572118401090786, + "learning_rate": 7.571252343443349e-06, + "loss": 0.4432, + "step": 1753 + }, + { + "epoch": 1.1867388362652234, + "grad_norm": 1.0452270042217684, + "learning_rate": 7.5678751612085344e-06, + "loss": 0.422, + "step": 1754 + }, + { + "epoch": 1.1874154262516914, + "grad_norm": 1.0276305064576137, + "learning_rate": 7.564496387029532e-06, + "loss": 0.4243, + "step": 1755 + }, + { + "epoch": 1.1880920162381596, + "grad_norm": 1.0927408405400587, + "learning_rate": 7.5611160230009975e-06, + "loss": 0.4397, + "step": 1756 + }, + { + "epoch": 1.1887686062246279, + "grad_norm": 1.0048031575387393, + "learning_rate": 7.557734071218575e-06, + "loss": 0.3995, + "step": 1757 + }, + { + "epoch": 1.1894451962110961, + "grad_norm": 1.0367309076105888, + "learning_rate": 7.5543505337788934e-06, + "loss": 0.4099, + "step": 1758 + }, + { + "epoch": 1.1901217861975644, + "grad_norm": 1.055713401251462, + "learning_rate": 7.550965412779563e-06, + "loss": 0.4281, + "step": 1759 + }, + { + "epoch": 1.1907983761840324, + "grad_norm": 1.0926686257988263, + "learning_rate": 7.547578710319174e-06, + "loss": 0.4178, + "step": 1760 + }, + { + "epoch": 1.1914749661705006, + "grad_norm": 1.0797406046782434, + "learning_rate": 7.544190428497304e-06, + "loss": 0.4279, + "step": 1761 + }, + { + "epoch": 1.1921515561569689, + "grad_norm": 1.140108523007309, + "learning_rate": 7.540800569414501e-06, + "loss": 0.4614, + "step": 1762 + }, + { + "epoch": 1.1928281461434371, + "grad_norm": 1.049646539641545, + "learning_rate": 7.537409135172298e-06, + "loss": 0.4231, + "step": 1763 + }, + { + "epoch": 1.1935047361299054, + "grad_norm": 1.0769851264218875, + "learning_rate": 7.5340161278732e-06, + "loss": 0.4276, + "step": 1764 + }, + { + "epoch": 1.1941813261163734, + "grad_norm": 1.001464436924969, + "learning_rate": 7.530621549620689e-06, + "loss": 0.3953, + "step": 1765 + }, + { + "epoch": 1.1948579161028416, + "grad_norm": 1.097639622865649, + "learning_rate": 7.527225402519218e-06, + "loss": 0.4405, + "step": 1766 + }, + { + "epoch": 1.19553450608931, + "grad_norm": 1.0327683397057092, + "learning_rate": 7.52382768867422e-06, + "loss": 0.4259, + "step": 1767 + }, + { + "epoch": 1.1962110960757781, + "grad_norm": 1.127561600104448, + "learning_rate": 7.52042841019209e-06, + "loss": 0.4506, + "step": 1768 + }, + { + "epoch": 1.1968876860622464, + "grad_norm": 1.0511337706371238, + "learning_rate": 7.5170275691802e-06, + "loss": 0.4239, + "step": 1769 + }, + { + "epoch": 1.1975642760487144, + "grad_norm": 1.0096823700913338, + "learning_rate": 7.5136251677468856e-06, + "loss": 0.417, + "step": 1770 + }, + { + "epoch": 1.1982408660351827, + "grad_norm": 1.0442975619891155, + "learning_rate": 7.510221208001457e-06, + "loss": 0.4328, + "step": 1771 + }, + { + "epoch": 1.198917456021651, + "grad_norm": 1.056310375902745, + "learning_rate": 7.50681569205418e-06, + "loss": 0.4343, + "step": 1772 + }, + { + "epoch": 1.1995940460081191, + "grad_norm": 1.0485435366935498, + "learning_rate": 7.5034086220162945e-06, + "loss": 0.4219, + "step": 1773 + }, + { + "epoch": 1.2002706359945874, + "grad_norm": 1.1091019027460352, + "learning_rate": 7.500000000000001e-06, + "loss": 0.4204, + "step": 1774 + }, + { + "epoch": 1.2009472259810554, + "grad_norm": 1.1208881780578195, + "learning_rate": 7.496589828118458e-06, + "loss": 0.4501, + "step": 1775 + }, + { + "epoch": 1.2016238159675237, + "grad_norm": 1.036187467489681, + "learning_rate": 7.4931781084857915e-06, + "loss": 0.4152, + "step": 1776 + }, + { + "epoch": 1.202300405953992, + "grad_norm": 1.0362678918135637, + "learning_rate": 7.489764843217082e-06, + "loss": 0.4356, + "step": 1777 + }, + { + "epoch": 1.2029769959404601, + "grad_norm": 1.0492880981253974, + "learning_rate": 7.4863500344283715e-06, + "loss": 0.419, + "step": 1778 + }, + { + "epoch": 1.2036535859269284, + "grad_norm": 1.0804969954100565, + "learning_rate": 7.482933684236654e-06, + "loss": 0.4426, + "step": 1779 + }, + { + "epoch": 1.2043301759133964, + "grad_norm": 1.0420846359657001, + "learning_rate": 7.4795157947598864e-06, + "loss": 0.4394, + "step": 1780 + }, + { + "epoch": 1.2050067658998647, + "grad_norm": 1.0496467723396619, + "learning_rate": 7.476096368116974e-06, + "loss": 0.4212, + "step": 1781 + }, + { + "epoch": 1.205683355886333, + "grad_norm": 1.073609812824759, + "learning_rate": 7.4726754064277775e-06, + "loss": 0.4211, + "step": 1782 + }, + { + "epoch": 1.2063599458728012, + "grad_norm": 1.147810099634884, + "learning_rate": 7.469252911813107e-06, + "loss": 0.4456, + "step": 1783 + }, + { + "epoch": 1.2070365358592694, + "grad_norm": 1.0339857863508586, + "learning_rate": 7.465828886394729e-06, + "loss": 0.4146, + "step": 1784 + }, + { + "epoch": 1.2077131258457374, + "grad_norm": 1.05538374872532, + "learning_rate": 7.462403332295351e-06, + "loss": 0.4249, + "step": 1785 + }, + { + "epoch": 1.2083897158322057, + "grad_norm": 1.0270527698895682, + "learning_rate": 7.458976251638632e-06, + "loss": 0.4046, + "step": 1786 + }, + { + "epoch": 1.209066305818674, + "grad_norm": 1.0359624154039528, + "learning_rate": 7.455547646549179e-06, + "loss": 0.4247, + "step": 1787 + }, + { + "epoch": 1.2097428958051422, + "grad_norm": 1.0643263297091912, + "learning_rate": 7.452117519152542e-06, + "loss": 0.4119, + "step": 1788 + }, + { + "epoch": 1.2104194857916104, + "grad_norm": 1.0496257822970312, + "learning_rate": 7.448685871575213e-06, + "loss": 0.4312, + "step": 1789 + }, + { + "epoch": 1.2110960757780784, + "grad_norm": 1.0405282605598096, + "learning_rate": 7.445252705944632e-06, + "loss": 0.4171, + "step": 1790 + }, + { + "epoch": 1.2117726657645467, + "grad_norm": 1.0884311545690581, + "learning_rate": 7.441818024389173e-06, + "loss": 0.4194, + "step": 1791 + }, + { + "epoch": 1.212449255751015, + "grad_norm": 1.0476343136961743, + "learning_rate": 7.438381829038157e-06, + "loss": 0.4167, + "step": 1792 + }, + { + "epoch": 1.2131258457374832, + "grad_norm": 1.0717160931695264, + "learning_rate": 7.434944122021837e-06, + "loss": 0.4238, + "step": 1793 + }, + { + "epoch": 1.2138024357239512, + "grad_norm": 1.1084054339649736, + "learning_rate": 7.431504905471407e-06, + "loss": 0.4451, + "step": 1794 + }, + { + "epoch": 1.2144790257104194, + "grad_norm": 1.0729490077307373, + "learning_rate": 7.428064181518997e-06, + "loss": 0.4462, + "step": 1795 + }, + { + "epoch": 1.2151556156968877, + "grad_norm": 0.9860342582386903, + "learning_rate": 7.424621952297668e-06, + "loss": 0.4068, + "step": 1796 + }, + { + "epoch": 1.215832205683356, + "grad_norm": 1.0684842057776966, + "learning_rate": 7.4211782199414204e-06, + "loss": 0.4214, + "step": 1797 + }, + { + "epoch": 1.2165087956698242, + "grad_norm": 1.0366597281113505, + "learning_rate": 7.417732986585179e-06, + "loss": 0.4157, + "step": 1798 + }, + { + "epoch": 1.2171853856562922, + "grad_norm": 1.0329970051374666, + "learning_rate": 7.414286254364804e-06, + "loss": 0.4174, + "step": 1799 + }, + { + "epoch": 1.2178619756427604, + "grad_norm": 1.0586898711198538, + "learning_rate": 7.410838025417083e-06, + "loss": 0.4304, + "step": 1800 + }, + { + "epoch": 1.2185385656292287, + "grad_norm": 1.0280475621624243, + "learning_rate": 7.407388301879735e-06, + "loss": 0.4366, + "step": 1801 + }, + { + "epoch": 1.219215155615697, + "grad_norm": 1.0674000428420767, + "learning_rate": 7.403937085891397e-06, + "loss": 0.4259, + "step": 1802 + }, + { + "epoch": 1.2198917456021652, + "grad_norm": 1.0588603505389673, + "learning_rate": 7.400484379591644e-06, + "loss": 0.4378, + "step": 1803 + }, + { + "epoch": 1.2205683355886332, + "grad_norm": 1.0690086552832514, + "learning_rate": 7.397030185120962e-06, + "loss": 0.4307, + "step": 1804 + }, + { + "epoch": 1.2212449255751014, + "grad_norm": 1.0446017776878338, + "learning_rate": 7.393574504620767e-06, + "loss": 0.4356, + "step": 1805 + }, + { + "epoch": 1.2219215155615697, + "grad_norm": 1.0879163576888138, + "learning_rate": 7.390117340233396e-06, + "loss": 0.4295, + "step": 1806 + }, + { + "epoch": 1.222598105548038, + "grad_norm": 1.064772020945927, + "learning_rate": 7.386658694102103e-06, + "loss": 0.4466, + "step": 1807 + }, + { + "epoch": 1.2232746955345062, + "grad_norm": 1.0774821775397263, + "learning_rate": 7.383198568371064e-06, + "loss": 0.439, + "step": 1808 + }, + { + "epoch": 1.2239512855209742, + "grad_norm": 1.0307840381048303, + "learning_rate": 7.379736965185369e-06, + "loss": 0.4259, + "step": 1809 + }, + { + "epoch": 1.2246278755074425, + "grad_norm": 1.0893265648975483, + "learning_rate": 7.376273886691024e-06, + "loss": 0.4326, + "step": 1810 + }, + { + "epoch": 1.2253044654939107, + "grad_norm": 1.0586550592993633, + "learning_rate": 7.372809335034955e-06, + "loss": 0.4247, + "step": 1811 + }, + { + "epoch": 1.225981055480379, + "grad_norm": 0.9868054895328412, + "learning_rate": 7.369343312364994e-06, + "loss": 0.4075, + "step": 1812 + }, + { + "epoch": 1.226657645466847, + "grad_norm": 1.0251115832629527, + "learning_rate": 7.365875820829889e-06, + "loss": 0.4123, + "step": 1813 + }, + { + "epoch": 1.2273342354533152, + "grad_norm": 1.0494949178161048, + "learning_rate": 7.362406862579299e-06, + "loss": 0.4309, + "step": 1814 + }, + { + "epoch": 1.2280108254397835, + "grad_norm": 1.0830705838191654, + "learning_rate": 7.358936439763789e-06, + "loss": 0.4304, + "step": 1815 + }, + { + "epoch": 1.2286874154262517, + "grad_norm": 1.0482339849082631, + "learning_rate": 7.355464554534837e-06, + "loss": 0.4159, + "step": 1816 + }, + { + "epoch": 1.22936400541272, + "grad_norm": 1.0525305578933541, + "learning_rate": 7.351991209044822e-06, + "loss": 0.4203, + "step": 1817 + }, + { + "epoch": 1.230040595399188, + "grad_norm": 1.1684816170354542, + "learning_rate": 7.348516405447031e-06, + "loss": 0.4507, + "step": 1818 + }, + { + "epoch": 1.2307171853856562, + "grad_norm": 1.0527470120746363, + "learning_rate": 7.345040145895656e-06, + "loss": 0.4132, + "step": 1819 + }, + { + "epoch": 1.2313937753721245, + "grad_norm": 1.011165560475297, + "learning_rate": 7.341562432545793e-06, + "loss": 0.4105, + "step": 1820 + }, + { + "epoch": 1.2320703653585927, + "grad_norm": 1.0646474186484702, + "learning_rate": 7.338083267553433e-06, + "loss": 0.4406, + "step": 1821 + }, + { + "epoch": 1.232746955345061, + "grad_norm": 1.0489389597724974, + "learning_rate": 7.334602653075471e-06, + "loss": 0.4213, + "step": 1822 + }, + { + "epoch": 1.233423545331529, + "grad_norm": 1.0561877797503911, + "learning_rate": 7.331120591269701e-06, + "loss": 0.4302, + "step": 1823 + }, + { + "epoch": 1.2341001353179972, + "grad_norm": 1.0340380529270852, + "learning_rate": 7.327637084294818e-06, + "loss": 0.4167, + "step": 1824 + }, + { + "epoch": 1.2347767253044655, + "grad_norm": 1.006360752229811, + "learning_rate": 7.324152134310401e-06, + "loss": 0.4014, + "step": 1825 + }, + { + "epoch": 1.2354533152909337, + "grad_norm": 1.0498725723777145, + "learning_rate": 7.3206657434769354e-06, + "loss": 0.4218, + "step": 1826 + }, + { + "epoch": 1.236129905277402, + "grad_norm": 1.1197102651242337, + "learning_rate": 7.317177913955795e-06, + "loss": 0.4561, + "step": 1827 + }, + { + "epoch": 1.23680649526387, + "grad_norm": 1.047845814323881, + "learning_rate": 7.313688647909245e-06, + "loss": 0.4319, + "step": 1828 + }, + { + "epoch": 1.2374830852503382, + "grad_norm": 1.0133934913396878, + "learning_rate": 7.310197947500446e-06, + "loss": 0.3936, + "step": 1829 + }, + { + "epoch": 1.2381596752368065, + "grad_norm": 1.0610646071502368, + "learning_rate": 7.30670581489344e-06, + "loss": 0.4312, + "step": 1830 + }, + { + "epoch": 1.2388362652232747, + "grad_norm": 1.0550038089887133, + "learning_rate": 7.303212252253163e-06, + "loss": 0.4241, + "step": 1831 + }, + { + "epoch": 1.239512855209743, + "grad_norm": 1.0078787601100263, + "learning_rate": 7.2997172617454335e-06, + "loss": 0.4201, + "step": 1832 + }, + { + "epoch": 1.240189445196211, + "grad_norm": 1.040439043416705, + "learning_rate": 7.29622084553696e-06, + "loss": 0.4112, + "step": 1833 + }, + { + "epoch": 1.2408660351826792, + "grad_norm": 1.0851214055242746, + "learning_rate": 7.29272300579533e-06, + "loss": 0.434, + "step": 1834 + }, + { + "epoch": 1.2415426251691475, + "grad_norm": 1.0205069422957933, + "learning_rate": 7.289223744689018e-06, + "loss": 0.4144, + "step": 1835 + }, + { + "epoch": 1.2422192151556157, + "grad_norm": 1.040925371820256, + "learning_rate": 7.285723064387373e-06, + "loss": 0.4371, + "step": 1836 + }, + { + "epoch": 1.242895805142084, + "grad_norm": 1.0483087504178972, + "learning_rate": 7.282220967060634e-06, + "loss": 0.4329, + "step": 1837 + }, + { + "epoch": 1.243572395128552, + "grad_norm": 1.0723707821472495, + "learning_rate": 7.278717454879907e-06, + "loss": 0.4402, + "step": 1838 + }, + { + "epoch": 1.2442489851150202, + "grad_norm": 1.0259857378494874, + "learning_rate": 7.2752125300171835e-06, + "loss": 0.4124, + "step": 1839 + }, + { + "epoch": 1.2449255751014885, + "grad_norm": 1.056909604175791, + "learning_rate": 7.271706194645327e-06, + "loss": 0.4258, + "step": 1840 + }, + { + "epoch": 1.2456021650879567, + "grad_norm": 1.0202072520898602, + "learning_rate": 7.26819845093808e-06, + "loss": 0.4073, + "step": 1841 + }, + { + "epoch": 1.246278755074425, + "grad_norm": 1.033461438488984, + "learning_rate": 7.264689301070048e-06, + "loss": 0.4291, + "step": 1842 + }, + { + "epoch": 1.246955345060893, + "grad_norm": 1.0372665916116917, + "learning_rate": 7.2611787472167194e-06, + "loss": 0.4124, + "step": 1843 + }, + { + "epoch": 1.2476319350473613, + "grad_norm": 1.0272576712115402, + "learning_rate": 7.257666791554448e-06, + "loss": 0.4129, + "step": 1844 + }, + { + "epoch": 1.2483085250338295, + "grad_norm": 1.0549832243545578, + "learning_rate": 7.254153436260456e-06, + "loss": 0.4024, + "step": 1845 + }, + { + "epoch": 1.2489851150202977, + "grad_norm": 1.0410383317623635, + "learning_rate": 7.250638683512833e-06, + "loss": 0.426, + "step": 1846 + }, + { + "epoch": 1.249661705006766, + "grad_norm": 1.0669834282377504, + "learning_rate": 7.247122535490539e-06, + "loss": 0.4314, + "step": 1847 + }, + { + "epoch": 1.250338294993234, + "grad_norm": 1.0692057782691242, + "learning_rate": 7.2436049943733955e-06, + "loss": 0.431, + "step": 1848 + }, + { + "epoch": 1.2510148849797023, + "grad_norm": 1.064160593240285, + "learning_rate": 7.240086062342087e-06, + "loss": 0.4137, + "step": 1849 + }, + { + "epoch": 1.2516914749661705, + "grad_norm": 1.0382964704263993, + "learning_rate": 7.236565741578163e-06, + "loss": 0.4306, + "step": 1850 + }, + { + "epoch": 1.2523680649526387, + "grad_norm": 1.046704426082285, + "learning_rate": 7.233044034264034e-06, + "loss": 0.4324, + "step": 1851 + }, + { + "epoch": 1.253044654939107, + "grad_norm": 1.0918029011354768, + "learning_rate": 7.229520942582965e-06, + "loss": 0.4241, + "step": 1852 + }, + { + "epoch": 1.253721244925575, + "grad_norm": 1.0507343263088116, + "learning_rate": 7.2259964687190855e-06, + "loss": 0.4131, + "step": 1853 + }, + { + "epoch": 1.2543978349120433, + "grad_norm": 1.014695676771801, + "learning_rate": 7.22247061485738e-06, + "loss": 0.4224, + "step": 1854 + }, + { + "epoch": 1.2550744248985115, + "grad_norm": 1.027518620809915, + "learning_rate": 7.218943383183684e-06, + "loss": 0.4279, + "step": 1855 + }, + { + "epoch": 1.2557510148849798, + "grad_norm": 1.052531461411292, + "learning_rate": 7.215414775884695e-06, + "loss": 0.4331, + "step": 1856 + }, + { + "epoch": 1.256427604871448, + "grad_norm": 1.0567183807459564, + "learning_rate": 7.211884795147958e-06, + "loss": 0.4187, + "step": 1857 + }, + { + "epoch": 1.257104194857916, + "grad_norm": 0.9995176408166365, + "learning_rate": 7.208353443161871e-06, + "loss": 0.4206, + "step": 1858 + }, + { + "epoch": 1.2577807848443843, + "grad_norm": 1.0076902801485583, + "learning_rate": 7.204820722115681e-06, + "loss": 0.4188, + "step": 1859 + }, + { + "epoch": 1.2584573748308525, + "grad_norm": 1.0663423805198837, + "learning_rate": 7.201286634199484e-06, + "loss": 0.4268, + "step": 1860 + }, + { + "epoch": 1.2591339648173208, + "grad_norm": 1.0446200390665148, + "learning_rate": 7.197751181604228e-06, + "loss": 0.4322, + "step": 1861 + }, + { + "epoch": 1.259810554803789, + "grad_norm": 1.0921478581993227, + "learning_rate": 7.194214366521699e-06, + "loss": 0.4276, + "step": 1862 + }, + { + "epoch": 1.260487144790257, + "grad_norm": 1.0589696751775382, + "learning_rate": 7.190676191144532e-06, + "loss": 0.4037, + "step": 1863 + }, + { + "epoch": 1.2611637347767253, + "grad_norm": 1.0290308964919637, + "learning_rate": 7.187136657666208e-06, + "loss": 0.417, + "step": 1864 + }, + { + "epoch": 1.2618403247631935, + "grad_norm": 1.0470068438719795, + "learning_rate": 7.183595768281044e-06, + "loss": 0.4339, + "step": 1865 + }, + { + "epoch": 1.2625169147496618, + "grad_norm": 0.98901399837732, + "learning_rate": 7.180053525184202e-06, + "loss": 0.4198, + "step": 1866 + }, + { + "epoch": 1.26319350473613, + "grad_norm": 1.052338787970324, + "learning_rate": 7.176509930571682e-06, + "loss": 0.4084, + "step": 1867 + }, + { + "epoch": 1.263870094722598, + "grad_norm": 1.0619778656699166, + "learning_rate": 7.172964986640319e-06, + "loss": 0.4376, + "step": 1868 + }, + { + "epoch": 1.2645466847090663, + "grad_norm": 0.9921952317723164, + "learning_rate": 7.169418695587791e-06, + "loss": 0.4077, + "step": 1869 + }, + { + "epoch": 1.2652232746955345, + "grad_norm": 1.0602673593791327, + "learning_rate": 7.165871059612604e-06, + "loss": 0.4224, + "step": 1870 + }, + { + "epoch": 1.2658998646820028, + "grad_norm": 1.040367719037345, + "learning_rate": 7.162322080914106e-06, + "loss": 0.4229, + "step": 1871 + }, + { + "epoch": 1.266576454668471, + "grad_norm": 1.0672355169561336, + "learning_rate": 7.158771761692464e-06, + "loss": 0.4291, + "step": 1872 + }, + { + "epoch": 1.267253044654939, + "grad_norm": 1.0416062326184927, + "learning_rate": 7.155220104148694e-06, + "loss": 0.4217, + "step": 1873 + }, + { + "epoch": 1.2679296346414073, + "grad_norm": 1.036889202967758, + "learning_rate": 7.151667110484626e-06, + "loss": 0.4232, + "step": 1874 + }, + { + "epoch": 1.2686062246278755, + "grad_norm": 1.0191265847071933, + "learning_rate": 7.148112782902927e-06, + "loss": 0.4103, + "step": 1875 + }, + { + "epoch": 1.2692828146143438, + "grad_norm": 1.039796595773164, + "learning_rate": 7.144557123607087e-06, + "loss": 0.4279, + "step": 1876 + }, + { + "epoch": 1.269959404600812, + "grad_norm": 1.0191522117178033, + "learning_rate": 7.141000134801426e-06, + "loss": 0.4181, + "step": 1877 + }, + { + "epoch": 1.27063599458728, + "grad_norm": 1.0407018633602043, + "learning_rate": 7.137441818691081e-06, + "loss": 0.4352, + "step": 1878 + }, + { + "epoch": 1.2713125845737483, + "grad_norm": 1.0361978369363967, + "learning_rate": 7.133882177482019e-06, + "loss": 0.4121, + "step": 1879 + }, + { + "epoch": 1.2719891745602165, + "grad_norm": 1.05544765232057, + "learning_rate": 7.130321213381025e-06, + "loss": 0.4224, + "step": 1880 + }, + { + "epoch": 1.2726657645466848, + "grad_norm": 1.0429317514953815, + "learning_rate": 7.1267589285957075e-06, + "loss": 0.4173, + "step": 1881 + }, + { + "epoch": 1.273342354533153, + "grad_norm": 1.081130599728119, + "learning_rate": 7.123195325334486e-06, + "loss": 0.416, + "step": 1882 + }, + { + "epoch": 1.274018944519621, + "grad_norm": 1.0590471273414273, + "learning_rate": 7.119630405806607e-06, + "loss": 0.4196, + "step": 1883 + }, + { + "epoch": 1.2746955345060893, + "grad_norm": 1.027365914525238, + "learning_rate": 7.1160641722221255e-06, + "loss": 0.4017, + "step": 1884 + }, + { + "epoch": 1.2753721244925575, + "grad_norm": 1.0036207664540902, + "learning_rate": 7.112496626791915e-06, + "loss": 0.4074, + "step": 1885 + }, + { + "epoch": 1.2760487144790258, + "grad_norm": 1.102698366148571, + "learning_rate": 7.108927771727661e-06, + "loss": 0.4271, + "step": 1886 + }, + { + "epoch": 1.276725304465494, + "grad_norm": 1.0286285818381042, + "learning_rate": 7.105357609241863e-06, + "loss": 0.4157, + "step": 1887 + }, + { + "epoch": 1.277401894451962, + "grad_norm": 1.0715158955133592, + "learning_rate": 7.101786141547829e-06, + "loss": 0.4138, + "step": 1888 + }, + { + "epoch": 1.2780784844384303, + "grad_norm": 1.0377719757570174, + "learning_rate": 7.098213370859673e-06, + "loss": 0.4187, + "step": 1889 + }, + { + "epoch": 1.2787550744248986, + "grad_norm": 1.0309289124148049, + "learning_rate": 7.094639299392324e-06, + "loss": 0.4242, + "step": 1890 + }, + { + "epoch": 1.2794316644113666, + "grad_norm": 1.0905463430934894, + "learning_rate": 7.0910639293615125e-06, + "loss": 0.4291, + "step": 1891 + }, + { + "epoch": 1.280108254397835, + "grad_norm": 1.061943255649957, + "learning_rate": 7.087487262983776e-06, + "loss": 0.4369, + "step": 1892 + }, + { + "epoch": 1.280784844384303, + "grad_norm": 1.005229810215619, + "learning_rate": 7.083909302476453e-06, + "loss": 0.4151, + "step": 1893 + }, + { + "epoch": 1.2814614343707713, + "grad_norm": 1.0222024289145817, + "learning_rate": 7.080330050057687e-06, + "loss": 0.4155, + "step": 1894 + }, + { + "epoch": 1.2821380243572396, + "grad_norm": 1.0359849066352849, + "learning_rate": 7.076749507946422e-06, + "loss": 0.4209, + "step": 1895 + }, + { + "epoch": 1.2828146143437076, + "grad_norm": 0.9993712113084391, + "learning_rate": 7.0731676783624015e-06, + "loss": 0.4053, + "step": 1896 + }, + { + "epoch": 1.283491204330176, + "grad_norm": 1.0754978857289135, + "learning_rate": 7.069584563526166e-06, + "loss": 0.4193, + "step": 1897 + }, + { + "epoch": 1.284167794316644, + "grad_norm": 1.0682295147426433, + "learning_rate": 7.066000165659054e-06, + "loss": 0.4218, + "step": 1898 + }, + { + "epoch": 1.2848443843031123, + "grad_norm": 1.137024056549916, + "learning_rate": 7.062414486983197e-06, + "loss": 0.422, + "step": 1899 + }, + { + "epoch": 1.2855209742895806, + "grad_norm": 0.9893047815127538, + "learning_rate": 7.058827529721526e-06, + "loss": 0.407, + "step": 1900 + }, + { + "epoch": 1.2861975642760486, + "grad_norm": 1.093759495759805, + "learning_rate": 7.055239296097758e-06, + "loss": 0.4206, + "step": 1901 + }, + { + "epoch": 1.2868741542625168, + "grad_norm": 1.0389243522476432, + "learning_rate": 7.051649788336405e-06, + "loss": 0.4351, + "step": 1902 + }, + { + "epoch": 1.287550744248985, + "grad_norm": 1.031624389267756, + "learning_rate": 7.048059008662772e-06, + "loss": 0.4292, + "step": 1903 + }, + { + "epoch": 1.2882273342354533, + "grad_norm": 1.0127401227745771, + "learning_rate": 7.044466959302945e-06, + "loss": 0.4075, + "step": 1904 + }, + { + "epoch": 1.2889039242219216, + "grad_norm": 1.094989203921133, + "learning_rate": 7.040873642483801e-06, + "loss": 0.4302, + "step": 1905 + }, + { + "epoch": 1.2895805142083896, + "grad_norm": 1.0731525895979097, + "learning_rate": 7.037279060433004e-06, + "loss": 0.4233, + "step": 1906 + }, + { + "epoch": 1.2902571041948578, + "grad_norm": 1.092679228067818, + "learning_rate": 7.033683215379002e-06, + "loss": 0.4248, + "step": 1907 + }, + { + "epoch": 1.290933694181326, + "grad_norm": 1.0633464006107094, + "learning_rate": 7.030086109551023e-06, + "loss": 0.4207, + "step": 1908 + }, + { + "epoch": 1.2916102841677943, + "grad_norm": 1.0655938041658124, + "learning_rate": 7.02648774517908e-06, + "loss": 0.4223, + "step": 1909 + }, + { + "epoch": 1.2922868741542626, + "grad_norm": 1.002809005330905, + "learning_rate": 7.022888124493964e-06, + "loss": 0.4162, + "step": 1910 + }, + { + "epoch": 1.2929634641407306, + "grad_norm": 0.9667935377624202, + "learning_rate": 7.019287249727248e-06, + "loss": 0.3918, + "step": 1911 + }, + { + "epoch": 1.2936400541271988, + "grad_norm": 1.011605643629267, + "learning_rate": 7.015685123111276e-06, + "loss": 0.4224, + "step": 1912 + }, + { + "epoch": 1.294316644113667, + "grad_norm": 1.0589780284119585, + "learning_rate": 7.012081746879178e-06, + "loss": 0.4371, + "step": 1913 + }, + { + "epoch": 1.2949932341001353, + "grad_norm": 1.0241759976541929, + "learning_rate": 7.008477123264849e-06, + "loss": 0.4193, + "step": 1914 + }, + { + "epoch": 1.2956698240866036, + "grad_norm": 1.0397577085592005, + "learning_rate": 7.004871254502962e-06, + "loss": 0.4232, + "step": 1915 + }, + { + "epoch": 1.2963464140730716, + "grad_norm": 1.0363618531551906, + "learning_rate": 7.001264142828961e-06, + "loss": 0.4176, + "step": 1916 + }, + { + "epoch": 1.2970230040595399, + "grad_norm": 1.0314756599160502, + "learning_rate": 6.997655790479062e-06, + "loss": 0.44, + "step": 1917 + }, + { + "epoch": 1.297699594046008, + "grad_norm": 1.3284490053441678, + "learning_rate": 6.9940461996902495e-06, + "loss": 0.4167, + "step": 1918 + }, + { + "epoch": 1.2983761840324763, + "grad_norm": 1.0562678507252015, + "learning_rate": 6.990435372700273e-06, + "loss": 0.4316, + "step": 1919 + }, + { + "epoch": 1.2990527740189446, + "grad_norm": 1.042199906868355, + "learning_rate": 6.986823311747652e-06, + "loss": 0.4307, + "step": 1920 + }, + { + "epoch": 1.2997293640054126, + "grad_norm": 0.9947084159324686, + "learning_rate": 6.983210019071671e-06, + "loss": 0.4116, + "step": 1921 + }, + { + "epoch": 1.3004059539918809, + "grad_norm": 1.0684694784407172, + "learning_rate": 6.979595496912374e-06, + "loss": 0.4314, + "step": 1922 + }, + { + "epoch": 1.301082543978349, + "grad_norm": 1.0900857885218591, + "learning_rate": 6.97597974751057e-06, + "loss": 0.4322, + "step": 1923 + }, + { + "epoch": 1.3017591339648173, + "grad_norm": 1.0342858420227146, + "learning_rate": 6.972362773107832e-06, + "loss": 0.4217, + "step": 1924 + }, + { + "epoch": 1.3024357239512856, + "grad_norm": 1.0308843397983698, + "learning_rate": 6.968744575946484e-06, + "loss": 0.4242, + "step": 1925 + }, + { + "epoch": 1.3031123139377536, + "grad_norm": 1.0443067751322757, + "learning_rate": 6.965125158269619e-06, + "loss": 0.4258, + "step": 1926 + }, + { + "epoch": 1.3037889039242219, + "grad_norm": 1.0145026800131371, + "learning_rate": 6.961504522321077e-06, + "loss": 0.4162, + "step": 1927 + }, + { + "epoch": 1.30446549391069, + "grad_norm": 1.0363141094777273, + "learning_rate": 6.957882670345458e-06, + "loss": 0.4012, + "step": 1928 + }, + { + "epoch": 1.3051420838971584, + "grad_norm": 1.0071780713629512, + "learning_rate": 6.954259604588114e-06, + "loss": 0.4048, + "step": 1929 + }, + { + "epoch": 1.3058186738836266, + "grad_norm": 1.0243348488897355, + "learning_rate": 6.950635327295154e-06, + "loss": 0.4206, + "step": 1930 + }, + { + "epoch": 1.3064952638700946, + "grad_norm": 1.0591372180408505, + "learning_rate": 6.94700984071343e-06, + "loss": 0.4257, + "step": 1931 + }, + { + "epoch": 1.3071718538565629, + "grad_norm": 1.0298745842766885, + "learning_rate": 6.943383147090552e-06, + "loss": 0.4098, + "step": 1932 + }, + { + "epoch": 1.3078484438430311, + "grad_norm": 1.032539916463029, + "learning_rate": 6.939755248674872e-06, + "loss": 0.4161, + "step": 1933 + }, + { + "epoch": 1.3085250338294994, + "grad_norm": 1.025676896237079, + "learning_rate": 6.936126147715494e-06, + "loss": 0.4305, + "step": 1934 + }, + { + "epoch": 1.3092016238159676, + "grad_norm": 1.038434492928235, + "learning_rate": 6.932495846462262e-06, + "loss": 0.4011, + "step": 1935 + }, + { + "epoch": 1.3098782138024356, + "grad_norm": 1.0325297415759267, + "learning_rate": 6.928864347165769e-06, + "loss": 0.4156, + "step": 1936 + }, + { + "epoch": 1.3105548037889039, + "grad_norm": 1.045956006199918, + "learning_rate": 6.925231652077349e-06, + "loss": 0.4222, + "step": 1937 + }, + { + "epoch": 1.3112313937753721, + "grad_norm": 1.00841192315478, + "learning_rate": 6.921597763449075e-06, + "loss": 0.4145, + "step": 1938 + }, + { + "epoch": 1.3119079837618404, + "grad_norm": 1.0257453293856218, + "learning_rate": 6.917962683533765e-06, + "loss": 0.4056, + "step": 1939 + }, + { + "epoch": 1.3125845737483086, + "grad_norm": 1.0965061832624963, + "learning_rate": 6.914326414584971e-06, + "loss": 0.4286, + "step": 1940 + }, + { + "epoch": 1.3132611637347766, + "grad_norm": 1.1030288462358044, + "learning_rate": 6.9106889588569845e-06, + "loss": 0.4273, + "step": 1941 + }, + { + "epoch": 1.3139377537212449, + "grad_norm": 1.0400203867263067, + "learning_rate": 6.907050318604831e-06, + "loss": 0.436, + "step": 1942 + }, + { + "epoch": 1.3146143437077131, + "grad_norm": 1.0485655153728435, + "learning_rate": 6.903410496084272e-06, + "loss": 0.4172, + "step": 1943 + }, + { + "epoch": 1.3152909336941814, + "grad_norm": 1.03732143615595, + "learning_rate": 6.8997694935518e-06, + "loss": 0.4174, + "step": 1944 + }, + { + "epoch": 1.3159675236806496, + "grad_norm": 1.0543855252616179, + "learning_rate": 6.896127313264643e-06, + "loss": 0.4362, + "step": 1945 + }, + { + "epoch": 1.3166441136671176, + "grad_norm": 1.0945303569147031, + "learning_rate": 6.892483957480754e-06, + "loss": 0.441, + "step": 1946 + }, + { + "epoch": 1.317320703653586, + "grad_norm": 1.0308613405126186, + "learning_rate": 6.888839428458819e-06, + "loss": 0.4259, + "step": 1947 + }, + { + "epoch": 1.3179972936400541, + "grad_norm": 1.0053459570484582, + "learning_rate": 6.885193728458247e-06, + "loss": 0.42, + "step": 1948 + }, + { + "epoch": 1.3186738836265224, + "grad_norm": 1.032777221954674, + "learning_rate": 6.8815468597391785e-06, + "loss": 0.4127, + "step": 1949 + }, + { + "epoch": 1.3193504736129906, + "grad_norm": 1.0933368075310617, + "learning_rate": 6.877898824562472e-06, + "loss": 0.4295, + "step": 1950 + }, + { + "epoch": 1.3200270635994586, + "grad_norm": 1.039978254256892, + "learning_rate": 6.8742496251897185e-06, + "loss": 0.4408, + "step": 1951 + }, + { + "epoch": 1.320703653585927, + "grad_norm": 1.0321175085385472, + "learning_rate": 6.8705992638832185e-06, + "loss": 0.4315, + "step": 1952 + }, + { + "epoch": 1.3213802435723951, + "grad_norm": 1.0411565722494915, + "learning_rate": 6.8669477429060026e-06, + "loss": 0.4295, + "step": 1953 + }, + { + "epoch": 1.3220568335588634, + "grad_norm": 1.0913366053147617, + "learning_rate": 6.863295064521816e-06, + "loss": 0.4329, + "step": 1954 + }, + { + "epoch": 1.3227334235453316, + "grad_norm": 0.9443365970442889, + "learning_rate": 6.859641230995123e-06, + "loss": 0.3914, + "step": 1955 + }, + { + "epoch": 1.3234100135317997, + "grad_norm": 1.039597800151262, + "learning_rate": 6.855986244591104e-06, + "loss": 0.4338, + "step": 1956 + }, + { + "epoch": 1.324086603518268, + "grad_norm": 1.0747903357364792, + "learning_rate": 6.852330107575653e-06, + "loss": 0.4458, + "step": 1957 + }, + { + "epoch": 1.3247631935047361, + "grad_norm": 1.004428701653835, + "learning_rate": 6.848672822215378e-06, + "loss": 0.4238, + "step": 1958 + }, + { + "epoch": 1.3254397834912044, + "grad_norm": 1.0512313876126667, + "learning_rate": 6.845014390777595e-06, + "loss": 0.4378, + "step": 1959 + }, + { + "epoch": 1.3261163734776726, + "grad_norm": 1.0011327011312865, + "learning_rate": 6.841354815530341e-06, + "loss": 0.4019, + "step": 1960 + }, + { + "epoch": 1.3267929634641407, + "grad_norm": 0.9870214563568758, + "learning_rate": 6.8376940987423526e-06, + "loss": 0.4074, + "step": 1961 + }, + { + "epoch": 1.327469553450609, + "grad_norm": 1.0191409671026699, + "learning_rate": 6.834032242683075e-06, + "loss": 0.4155, + "step": 1962 + }, + { + "epoch": 1.3281461434370772, + "grad_norm": 1.0503029826899775, + "learning_rate": 6.830369249622663e-06, + "loss": 0.418, + "step": 1963 + }, + { + "epoch": 1.3288227334235454, + "grad_norm": 1.0715770347345146, + "learning_rate": 6.8267051218319766e-06, + "loss": 0.4395, + "step": 1964 + }, + { + "epoch": 1.3294993234100136, + "grad_norm": 1.029799173753757, + "learning_rate": 6.823039861582574e-06, + "loss": 0.4058, + "step": 1965 + }, + { + "epoch": 1.3301759133964817, + "grad_norm": 1.039409884365538, + "learning_rate": 6.819373471146722e-06, + "loss": 0.4178, + "step": 1966 + }, + { + "epoch": 1.33085250338295, + "grad_norm": 1.1011530152736475, + "learning_rate": 6.815705952797383e-06, + "loss": 0.4235, + "step": 1967 + }, + { + "epoch": 1.3315290933694182, + "grad_norm": 1.0272643488408197, + "learning_rate": 6.8120373088082215e-06, + "loss": 0.4036, + "step": 1968 + }, + { + "epoch": 1.3322056833558864, + "grad_norm": 1.0734396966690711, + "learning_rate": 6.808367541453599e-06, + "loss": 0.4242, + "step": 1969 + }, + { + "epoch": 1.3328822733423547, + "grad_norm": 1.0440675115802303, + "learning_rate": 6.804696653008574e-06, + "loss": 0.4169, + "step": 1970 + }, + { + "epoch": 1.3335588633288227, + "grad_norm": 1.0527102213457342, + "learning_rate": 6.801024645748899e-06, + "loss": 0.419, + "step": 1971 + }, + { + "epoch": 1.334235453315291, + "grad_norm": 1.0403169623255601, + "learning_rate": 6.797351521951021e-06, + "loss": 0.4256, + "step": 1972 + }, + { + "epoch": 1.3349120433017592, + "grad_norm": 1.0058598851333849, + "learning_rate": 6.793677283892077e-06, + "loss": 0.4106, + "step": 1973 + }, + { + "epoch": 1.3355886332882274, + "grad_norm": 0.9864499034412528, + "learning_rate": 6.7900019338499005e-06, + "loss": 0.4105, + "step": 1974 + }, + { + "epoch": 1.3362652232746957, + "grad_norm": 1.0851498101661252, + "learning_rate": 6.786325474103006e-06, + "loss": 0.4278, + "step": 1975 + }, + { + "epoch": 1.3369418132611637, + "grad_norm": 1.0427232147668426, + "learning_rate": 6.782647906930602e-06, + "loss": 0.4123, + "step": 1976 + }, + { + "epoch": 1.337618403247632, + "grad_norm": 1.008879779853957, + "learning_rate": 6.778969234612583e-06, + "loss": 0.3981, + "step": 1977 + }, + { + "epoch": 1.3382949932341002, + "grad_norm": 1.0195763958236552, + "learning_rate": 6.775289459429526e-06, + "loss": 0.4169, + "step": 1978 + }, + { + "epoch": 1.3389715832205684, + "grad_norm": 1.030043921102332, + "learning_rate": 6.771608583662694e-06, + "loss": 0.4169, + "step": 1979 + }, + { + "epoch": 1.3396481732070367, + "grad_norm": 1.04060394170562, + "learning_rate": 6.767926609594032e-06, + "loss": 0.4166, + "step": 1980 + }, + { + "epoch": 1.3403247631935047, + "grad_norm": 1.0596763439742476, + "learning_rate": 6.764243539506166e-06, + "loss": 0.4275, + "step": 1981 + }, + { + "epoch": 1.341001353179973, + "grad_norm": 1.0121725599137021, + "learning_rate": 6.760559375682398e-06, + "loss": 0.427, + "step": 1982 + }, + { + "epoch": 1.3416779431664412, + "grad_norm": 1.0288082176213642, + "learning_rate": 6.7568741204067145e-06, + "loss": 0.4204, + "step": 1983 + }, + { + "epoch": 1.3423545331529092, + "grad_norm": 1.0499843130824649, + "learning_rate": 6.753187775963773e-06, + "loss": 0.4253, + "step": 1984 + }, + { + "epoch": 1.3430311231393777, + "grad_norm": 1.0174389797105612, + "learning_rate": 6.749500344638908e-06, + "loss": 0.4063, + "step": 1985 + }, + { + "epoch": 1.3437077131258457, + "grad_norm": 1.0812254539748511, + "learning_rate": 6.74581182871813e-06, + "loss": 0.451, + "step": 1986 + }, + { + "epoch": 1.344384303112314, + "grad_norm": 1.0487451208277823, + "learning_rate": 6.7421222304881194e-06, + "loss": 0.439, + "step": 1987 + }, + { + "epoch": 1.3450608930987822, + "grad_norm": 1.0063093729936436, + "learning_rate": 6.738431552236228e-06, + "loss": 0.396, + "step": 1988 + }, + { + "epoch": 1.3457374830852502, + "grad_norm": 1.0503976905174983, + "learning_rate": 6.734739796250477e-06, + "loss": 0.4331, + "step": 1989 + }, + { + "epoch": 1.3464140730717187, + "grad_norm": 1.097386145909681, + "learning_rate": 6.731046964819555e-06, + "loss": 0.4431, + "step": 1990 + }, + { + "epoch": 1.3470906630581867, + "grad_norm": 1.0106047494897548, + "learning_rate": 6.727353060232822e-06, + "loss": 0.4049, + "step": 1991 + }, + { + "epoch": 1.347767253044655, + "grad_norm": 1.029135946008643, + "learning_rate": 6.723658084780297e-06, + "loss": 0.4341, + "step": 1992 + }, + { + "epoch": 1.3484438430311232, + "grad_norm": 0.9863741857138194, + "learning_rate": 6.719962040752665e-06, + "loss": 0.4093, + "step": 1993 + }, + { + "epoch": 1.3491204330175912, + "grad_norm": 1.044214163248404, + "learning_rate": 6.716264930441279e-06, + "loss": 0.4127, + "step": 1994 + }, + { + "epoch": 1.3497970230040595, + "grad_norm": 0.9859722455064147, + "learning_rate": 6.712566756138142e-06, + "loss": 0.3925, + "step": 1995 + }, + { + "epoch": 1.3504736129905277, + "grad_norm": 1.0171853848379828, + "learning_rate": 6.708867520135924e-06, + "loss": 0.4066, + "step": 1996 + }, + { + "epoch": 1.351150202976996, + "grad_norm": 1.136459883768459, + "learning_rate": 6.705167224727956e-06, + "loss": 0.439, + "step": 1997 + }, + { + "epoch": 1.3518267929634642, + "grad_norm": 1.0696783687374793, + "learning_rate": 6.701465872208216e-06, + "loss": 0.4246, + "step": 1998 + }, + { + "epoch": 1.3525033829499322, + "grad_norm": 1.0319016617433867, + "learning_rate": 6.697763464871346e-06, + "loss": 0.4214, + "step": 1999 + }, + { + "epoch": 1.3531799729364005, + "grad_norm": 1.0524230751022488, + "learning_rate": 6.694060005012642e-06, + "loss": 0.4408, + "step": 2000 + }, + { + "epoch": 1.3538565629228687, + "grad_norm": 1.0122123524630355, + "learning_rate": 6.690355494928043e-06, + "loss": 0.416, + "step": 2001 + }, + { + "epoch": 1.354533152909337, + "grad_norm": 1.0570408088248036, + "learning_rate": 6.686649936914151e-06, + "loss": 0.422, + "step": 2002 + }, + { + "epoch": 1.3552097428958052, + "grad_norm": 0.973013215879346, + "learning_rate": 6.682943333268208e-06, + "loss": 0.3978, + "step": 2003 + }, + { + "epoch": 1.3558863328822732, + "grad_norm": 1.0308636855480167, + "learning_rate": 6.6792356862881144e-06, + "loss": 0.4413, + "step": 2004 + }, + { + "epoch": 1.3565629228687415, + "grad_norm": 1.0410125908595642, + "learning_rate": 6.675526998272405e-06, + "loss": 0.4312, + "step": 2005 + }, + { + "epoch": 1.3572395128552097, + "grad_norm": 1.0297870259383395, + "learning_rate": 6.671817271520269e-06, + "loss": 0.4096, + "step": 2006 + }, + { + "epoch": 1.357916102841678, + "grad_norm": 1.0118324274652348, + "learning_rate": 6.668106508331539e-06, + "loss": 0.4151, + "step": 2007 + }, + { + "epoch": 1.3585926928281462, + "grad_norm": 0.987166650559704, + "learning_rate": 6.664394711006684e-06, + "loss": 0.4148, + "step": 2008 + }, + { + "epoch": 1.3592692828146142, + "grad_norm": 0.9710453550802868, + "learning_rate": 6.660681881846822e-06, + "loss": 0.3888, + "step": 2009 + }, + { + "epoch": 1.3599458728010825, + "grad_norm": 0.9792754179223846, + "learning_rate": 6.656968023153706e-06, + "loss": 0.4149, + "step": 2010 + }, + { + "epoch": 1.3606224627875507, + "grad_norm": 1.0259782184965631, + "learning_rate": 6.653253137229727e-06, + "loss": 0.3937, + "step": 2011 + }, + { + "epoch": 1.361299052774019, + "grad_norm": 1.0024310585481508, + "learning_rate": 6.6495372263779145e-06, + "loss": 0.4013, + "step": 2012 + }, + { + "epoch": 1.3619756427604872, + "grad_norm": 1.0481091780226433, + "learning_rate": 6.6458202929019345e-06, + "loss": 0.4241, + "step": 2013 + }, + { + "epoch": 1.3626522327469552, + "grad_norm": 1.0117337707229543, + "learning_rate": 6.6421023391060845e-06, + "loss": 0.4019, + "step": 2014 + }, + { + "epoch": 1.3633288227334235, + "grad_norm": 1.0499358074064378, + "learning_rate": 6.6383833672952945e-06, + "loss": 0.4141, + "step": 2015 + }, + { + "epoch": 1.3640054127198917, + "grad_norm": 1.0432854536830993, + "learning_rate": 6.634663379775126e-06, + "loss": 0.4162, + "step": 2016 + }, + { + "epoch": 1.36468200270636, + "grad_norm": 1.0471834865487055, + "learning_rate": 6.630942378851774e-06, + "loss": 0.4214, + "step": 2017 + }, + { + "epoch": 1.3653585926928282, + "grad_norm": 1.0215669999280084, + "learning_rate": 6.627220366832056e-06, + "loss": 0.4304, + "step": 2018 + }, + { + "epoch": 1.3660351826792962, + "grad_norm": 1.0546946843995326, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.4275, + "step": 2019 + }, + { + "epoch": 1.3667117726657645, + "grad_norm": 1.011735647399513, + "learning_rate": 6.619773318733934e-06, + "loss": 0.4216, + "step": 2020 + }, + { + "epoch": 1.3673883626522327, + "grad_norm": 0.9877857356075881, + "learning_rate": 6.616048287272301e-06, + "loss": 0.4086, + "step": 2021 + }, + { + "epoch": 1.368064952638701, + "grad_norm": 0.9817834055860715, + "learning_rate": 6.612322253947836e-06, + "loss": 0.3926, + "step": 2022 + }, + { + "epoch": 1.3687415426251692, + "grad_norm": 1.0050196414953598, + "learning_rate": 6.608595221070478e-06, + "loss": 0.4195, + "step": 2023 + }, + { + "epoch": 1.3694181326116373, + "grad_norm": 0.9867981999368625, + "learning_rate": 6.60486719095079e-06, + "loss": 0.4187, + "step": 2024 + }, + { + "epoch": 1.3700947225981055, + "grad_norm": 1.0113170727426537, + "learning_rate": 6.601138165899945e-06, + "loss": 0.4138, + "step": 2025 + }, + { + "epoch": 1.3707713125845737, + "grad_norm": 1.0229877977928143, + "learning_rate": 6.597408148229742e-06, + "loss": 0.427, + "step": 2026 + }, + { + "epoch": 1.371447902571042, + "grad_norm": 1.0297031727093853, + "learning_rate": 6.5936771402525875e-06, + "loss": 0.4225, + "step": 2027 + }, + { + "epoch": 1.3721244925575102, + "grad_norm": 1.0542911396541623, + "learning_rate": 6.589945144281508e-06, + "loss": 0.4312, + "step": 2028 + }, + { + "epoch": 1.3728010825439783, + "grad_norm": 1.0548128227567142, + "learning_rate": 6.586212162630137e-06, + "loss": 0.4139, + "step": 2029 + }, + { + "epoch": 1.3734776725304465, + "grad_norm": 1.040140995830836, + "learning_rate": 6.582478197612725e-06, + "loss": 0.4271, + "step": 2030 + }, + { + "epoch": 1.3741542625169147, + "grad_norm": 1.0320816500314305, + "learning_rate": 6.578743251544128e-06, + "loss": 0.4332, + "step": 2031 + }, + { + "epoch": 1.374830852503383, + "grad_norm": 1.0135099972477304, + "learning_rate": 6.57500732673981e-06, + "loss": 0.414, + "step": 2032 + }, + { + "epoch": 1.3755074424898512, + "grad_norm": 1.0028503153709398, + "learning_rate": 6.571270425515843e-06, + "loss": 0.4115, + "step": 2033 + }, + { + "epoch": 1.3761840324763193, + "grad_norm": 1.0493627329600907, + "learning_rate": 6.567532550188908e-06, + "loss": 0.4242, + "step": 2034 + }, + { + "epoch": 1.3768606224627875, + "grad_norm": 1.0858798690797127, + "learning_rate": 6.56379370307628e-06, + "loss": 0.4403, + "step": 2035 + }, + { + "epoch": 1.3775372124492558, + "grad_norm": 1.0232116076183813, + "learning_rate": 6.560053886495847e-06, + "loss": 0.3924, + "step": 2036 + }, + { + "epoch": 1.378213802435724, + "grad_norm": 0.940722554882363, + "learning_rate": 6.556313102766094e-06, + "loss": 0.3973, + "step": 2037 + }, + { + "epoch": 1.3788903924221922, + "grad_norm": 0.9745813657743075, + "learning_rate": 6.552571354206104e-06, + "loss": 0.4095, + "step": 2038 + }, + { + "epoch": 1.3795669824086603, + "grad_norm": 1.0675000533588468, + "learning_rate": 6.548828643135559e-06, + "loss": 0.422, + "step": 2039 + }, + { + "epoch": 1.3802435723951285, + "grad_norm": 1.0638864029128485, + "learning_rate": 6.545084971874738e-06, + "loss": 0.4185, + "step": 2040 + }, + { + "epoch": 1.3809201623815968, + "grad_norm": 1.060855984837904, + "learning_rate": 6.541340342744517e-06, + "loss": 0.4373, + "step": 2041 + }, + { + "epoch": 1.381596752368065, + "grad_norm": 0.966572894794586, + "learning_rate": 6.537594758066362e-06, + "loss": 0.3806, + "step": 2042 + }, + { + "epoch": 1.3822733423545333, + "grad_norm": 1.0389503133408153, + "learning_rate": 6.533848220162336e-06, + "loss": 0.4157, + "step": 2043 + }, + { + "epoch": 1.3829499323410013, + "grad_norm": 1.0053055566917941, + "learning_rate": 6.530100731355089e-06, + "loss": 0.4088, + "step": 2044 + }, + { + "epoch": 1.3836265223274695, + "grad_norm": 1.0341325677032138, + "learning_rate": 6.5263522939678626e-06, + "loss": 0.4213, + "step": 2045 + }, + { + "epoch": 1.3843031123139378, + "grad_norm": 1.0285726317859492, + "learning_rate": 6.5226029103244846e-06, + "loss": 0.4221, + "step": 2046 + }, + { + "epoch": 1.384979702300406, + "grad_norm": 1.071233932449683, + "learning_rate": 6.518852582749373e-06, + "loss": 0.4259, + "step": 2047 + }, + { + "epoch": 1.3856562922868743, + "grad_norm": 1.0102528084469493, + "learning_rate": 6.515101313567529e-06, + "loss": 0.4098, + "step": 2048 + }, + { + "epoch": 1.3863328822733423, + "grad_norm": 0.9843297015291131, + "learning_rate": 6.511349105104534e-06, + "loss": 0.4036, + "step": 2049 + }, + { + "epoch": 1.3870094722598105, + "grad_norm": 1.0064849296435698, + "learning_rate": 6.507595959686558e-06, + "loss": 0.4153, + "step": 2050 + }, + { + "epoch": 1.3876860622462788, + "grad_norm": 1.0630195299277263, + "learning_rate": 6.503841879640349e-06, + "loss": 0.4071, + "step": 2051 + }, + { + "epoch": 1.388362652232747, + "grad_norm": 1.032856512191563, + "learning_rate": 6.500086867293231e-06, + "loss": 0.4201, + "step": 2052 + }, + { + "epoch": 1.3890392422192153, + "grad_norm": 1.0589845753660267, + "learning_rate": 6.496330924973112e-06, + "loss": 0.4328, + "step": 2053 + }, + { + "epoch": 1.3897158322056833, + "grad_norm": 1.0496781629534784, + "learning_rate": 6.492574055008474e-06, + "loss": 0.4292, + "step": 2054 + }, + { + "epoch": 1.3903924221921515, + "grad_norm": 1.0373155937489198, + "learning_rate": 6.488816259728372e-06, + "loss": 0.396, + "step": 2055 + }, + { + "epoch": 1.3910690121786198, + "grad_norm": 1.0082194646039746, + "learning_rate": 6.4850575414624385e-06, + "loss": 0.408, + "step": 2056 + }, + { + "epoch": 1.391745602165088, + "grad_norm": 1.0112868305447484, + "learning_rate": 6.481297902540875e-06, + "loss": 0.4075, + "step": 2057 + }, + { + "epoch": 1.3924221921515563, + "grad_norm": 0.9777946393058407, + "learning_rate": 6.477537345294455e-06, + "loss": 0.4149, + "step": 2058 + }, + { + "epoch": 1.3930987821380243, + "grad_norm": 1.0429932967518827, + "learning_rate": 6.473775872054522e-06, + "loss": 0.4277, + "step": 2059 + }, + { + "epoch": 1.3937753721244925, + "grad_norm": 1.0203817930711212, + "learning_rate": 6.4700134851529864e-06, + "loss": 0.4085, + "step": 2060 + }, + { + "epoch": 1.3944519621109608, + "grad_norm": 0.9904306968624756, + "learning_rate": 6.466250186922325e-06, + "loss": 0.4212, + "step": 2061 + }, + { + "epoch": 1.395128552097429, + "grad_norm": 1.0431618615213187, + "learning_rate": 6.46248597969558e-06, + "loss": 0.4472, + "step": 2062 + }, + { + "epoch": 1.3958051420838973, + "grad_norm": 1.0176240279001862, + "learning_rate": 6.458720865806356e-06, + "loss": 0.4092, + "step": 2063 + }, + { + "epoch": 1.3964817320703653, + "grad_norm": 1.039550103362902, + "learning_rate": 6.454954847588824e-06, + "loss": 0.4401, + "step": 2064 + }, + { + "epoch": 1.3971583220568335, + "grad_norm": 1.0041541238833982, + "learning_rate": 6.4511879273777065e-06, + "loss": 0.4055, + "step": 2065 + }, + { + "epoch": 1.3978349120433018, + "grad_norm": 1.0145696013341563, + "learning_rate": 6.447420107508297e-06, + "loss": 0.4194, + "step": 2066 + }, + { + "epoch": 1.39851150202977, + "grad_norm": 0.986030628475444, + "learning_rate": 6.443651390316438e-06, + "loss": 0.4078, + "step": 2067 + }, + { + "epoch": 1.3991880920162383, + "grad_norm": 1.0336096121951381, + "learning_rate": 6.439881778138531e-06, + "loss": 0.4244, + "step": 2068 + }, + { + "epoch": 1.3998646820027063, + "grad_norm": 0.9870058474041503, + "learning_rate": 6.436111273311533e-06, + "loss": 0.406, + "step": 2069 + }, + { + "epoch": 1.4005412719891746, + "grad_norm": 1.0670008226310135, + "learning_rate": 6.4323398781729525e-06, + "loss": 0.4317, + "step": 2070 + }, + { + "epoch": 1.4012178619756428, + "grad_norm": 0.9814484086507456, + "learning_rate": 6.428567595060853e-06, + "loss": 0.3929, + "step": 2071 + }, + { + "epoch": 1.401894451962111, + "grad_norm": 1.0117139748524482, + "learning_rate": 6.424794426313845e-06, + "loss": 0.4199, + "step": 2072 + }, + { + "epoch": 1.4025710419485793, + "grad_norm": 1.0161994901612237, + "learning_rate": 6.42102037427109e-06, + "loss": 0.4111, + "step": 2073 + }, + { + "epoch": 1.4032476319350473, + "grad_norm": 1.0304585972871476, + "learning_rate": 6.417245441272299e-06, + "loss": 0.4104, + "step": 2074 + }, + { + "epoch": 1.4039242219215156, + "grad_norm": 1.0314546611378963, + "learning_rate": 6.413469629657724e-06, + "loss": 0.4224, + "step": 2075 + }, + { + "epoch": 1.4046008119079838, + "grad_norm": 1.0168116234374232, + "learning_rate": 6.409692941768166e-06, + "loss": 0.4173, + "step": 2076 + }, + { + "epoch": 1.4052774018944518, + "grad_norm": 1.0033202876518077, + "learning_rate": 6.405915379944967e-06, + "loss": 0.3835, + "step": 2077 + }, + { + "epoch": 1.4059539918809203, + "grad_norm": 1.035970558795831, + "learning_rate": 6.402136946530014e-06, + "loss": 0.4165, + "step": 2078 + }, + { + "epoch": 1.4066305818673883, + "grad_norm": 1.0186604980458462, + "learning_rate": 6.398357643865731e-06, + "loss": 0.399, + "step": 2079 + }, + { + "epoch": 1.4073071718538566, + "grad_norm": 1.0214763885041875, + "learning_rate": 6.394577474295081e-06, + "loss": 0.4055, + "step": 2080 + }, + { + "epoch": 1.4079837618403248, + "grad_norm": 1.021359429184758, + "learning_rate": 6.390796440161566e-06, + "loss": 0.4162, + "step": 2081 + }, + { + "epoch": 1.4086603518267928, + "grad_norm": 1.025956336559153, + "learning_rate": 6.387014543809224e-06, + "loss": 0.4123, + "step": 2082 + }, + { + "epoch": 1.4093369418132613, + "grad_norm": 0.9847594412419571, + "learning_rate": 6.383231787582625e-06, + "loss": 0.4145, + "step": 2083 + }, + { + "epoch": 1.4100135317997293, + "grad_norm": 1.0159414826010698, + "learning_rate": 6.3794481738268765e-06, + "loss": 0.4151, + "step": 2084 + }, + { + "epoch": 1.4106901217861976, + "grad_norm": 0.9925542054758073, + "learning_rate": 6.375663704887614e-06, + "loss": 0.4114, + "step": 2085 + }, + { + "epoch": 1.4113667117726658, + "grad_norm": 0.9996896795037346, + "learning_rate": 6.371878383111002e-06, + "loss": 0.4165, + "step": 2086 + }, + { + "epoch": 1.4120433017591338, + "grad_norm": 1.004250347530124, + "learning_rate": 6.368092210843739e-06, + "loss": 0.4146, + "step": 2087 + }, + { + "epoch": 1.412719891745602, + "grad_norm": 0.9784966367414293, + "learning_rate": 6.364305190433049e-06, + "loss": 0.3999, + "step": 2088 + }, + { + "epoch": 1.4133964817320703, + "grad_norm": 1.0620579280721916, + "learning_rate": 6.360517324226676e-06, + "loss": 0.404, + "step": 2089 + }, + { + "epoch": 1.4140730717185386, + "grad_norm": 1.0224848940212419, + "learning_rate": 6.3567286145728944e-06, + "loss": 0.4223, + "step": 2090 + }, + { + "epoch": 1.4147496617050068, + "grad_norm": 1.0376396714818348, + "learning_rate": 6.3529390638205036e-06, + "loss": 0.4346, + "step": 2091 + }, + { + "epoch": 1.4154262516914748, + "grad_norm": 1.059024918556977, + "learning_rate": 6.349148674318816e-06, + "loss": 0.4127, + "step": 2092 + }, + { + "epoch": 1.416102841677943, + "grad_norm": 1.007584639816123, + "learning_rate": 6.34535744841767e-06, + "loss": 0.3986, + "step": 2093 + }, + { + "epoch": 1.4167794316644113, + "grad_norm": 0.9533261973399811, + "learning_rate": 6.341565388467425e-06, + "loss": 0.3835, + "step": 2094 + }, + { + "epoch": 1.4174560216508796, + "grad_norm": 1.0029995368131168, + "learning_rate": 6.3377724968189494e-06, + "loss": 0.397, + "step": 2095 + }, + { + "epoch": 1.4181326116373478, + "grad_norm": 1.043716743652025, + "learning_rate": 6.3339787758236316e-06, + "loss": 0.4079, + "step": 2096 + }, + { + "epoch": 1.4188092016238159, + "grad_norm": 1.0160555984925257, + "learning_rate": 6.330184227833376e-06, + "loss": 0.4151, + "step": 2097 + }, + { + "epoch": 1.419485791610284, + "grad_norm": 1.0329018979428226, + "learning_rate": 6.326388855200598e-06, + "loss": 0.4101, + "step": 2098 + }, + { + "epoch": 1.4201623815967523, + "grad_norm": 0.9657804031861481, + "learning_rate": 6.322592660278223e-06, + "loss": 0.4059, + "step": 2099 + }, + { + "epoch": 1.4208389715832206, + "grad_norm": 1.0086465820944381, + "learning_rate": 6.3187956454196885e-06, + "loss": 0.3947, + "step": 2100 + }, + { + "epoch": 1.4215155615696888, + "grad_norm": 1.052454309985347, + "learning_rate": 6.314997812978938e-06, + "loss": 0.432, + "step": 2101 + }, + { + "epoch": 1.4221921515561569, + "grad_norm": 1.0484186597494867, + "learning_rate": 6.311199165310422e-06, + "loss": 0.4273, + "step": 2102 + }, + { + "epoch": 1.422868741542625, + "grad_norm": 0.9937917427499688, + "learning_rate": 6.3073997047691e-06, + "loss": 0.4187, + "step": 2103 + }, + { + "epoch": 1.4235453315290933, + "grad_norm": 1.0071429853112834, + "learning_rate": 6.30359943371043e-06, + "loss": 0.408, + "step": 2104 + }, + { + "epoch": 1.4242219215155616, + "grad_norm": 0.9953451190094351, + "learning_rate": 6.299798354490376e-06, + "loss": 0.4115, + "step": 2105 + }, + { + "epoch": 1.4248985115020298, + "grad_norm": 0.9670336945990794, + "learning_rate": 6.295996469465404e-06, + "loss": 0.3935, + "step": 2106 + }, + { + "epoch": 1.4255751014884979, + "grad_norm": 0.9823171512966387, + "learning_rate": 6.292193780992475e-06, + "loss": 0.3895, + "step": 2107 + }, + { + "epoch": 1.426251691474966, + "grad_norm": 1.0153987229933188, + "learning_rate": 6.288390291429054e-06, + "loss": 0.3911, + "step": 2108 + }, + { + "epoch": 1.4269282814614344, + "grad_norm": 1.035840492614243, + "learning_rate": 6.284586003133096e-06, + "loss": 0.4104, + "step": 2109 + }, + { + "epoch": 1.4276048714479026, + "grad_norm": 1.0085552883909579, + "learning_rate": 6.280780918463057e-06, + "loss": 0.4161, + "step": 2110 + }, + { + "epoch": 1.4282814614343708, + "grad_norm": 0.9956474044663148, + "learning_rate": 6.276975039777885e-06, + "loss": 0.3851, + "step": 2111 + }, + { + "epoch": 1.4289580514208389, + "grad_norm": 1.0342229865985904, + "learning_rate": 6.2731683694370185e-06, + "loss": 0.4256, + "step": 2112 + }, + { + "epoch": 1.4296346414073071, + "grad_norm": 1.0160702632344145, + "learning_rate": 6.269360909800386e-06, + "loss": 0.4168, + "step": 2113 + }, + { + "epoch": 1.4303112313937754, + "grad_norm": 0.9608203581897241, + "learning_rate": 6.265552663228411e-06, + "loss": 0.3755, + "step": 2114 + }, + { + "epoch": 1.4309878213802436, + "grad_norm": 1.0125062904723066, + "learning_rate": 6.261743632081998e-06, + "loss": 0.4154, + "step": 2115 + }, + { + "epoch": 1.4316644113667119, + "grad_norm": 0.9670174920797765, + "learning_rate": 6.257933818722544e-06, + "loss": 0.3824, + "step": 2116 + }, + { + "epoch": 1.4323410013531799, + "grad_norm": 1.0672483252699188, + "learning_rate": 6.254123225511924e-06, + "loss": 0.4192, + "step": 2117 + }, + { + "epoch": 1.4330175913396481, + "grad_norm": 1.0042193693383927, + "learning_rate": 6.250311854812504e-06, + "loss": 0.4081, + "step": 2118 + }, + { + "epoch": 1.4336941813261164, + "grad_norm": 1.0040947846388422, + "learning_rate": 6.246499708987127e-06, + "loss": 0.4172, + "step": 2119 + }, + { + "epoch": 1.4343707713125846, + "grad_norm": 1.0531594718451485, + "learning_rate": 6.242686790399117e-06, + "loss": 0.41, + "step": 2120 + }, + { + "epoch": 1.4350473612990529, + "grad_norm": 1.035884758642596, + "learning_rate": 6.238873101412282e-06, + "loss": 0.4059, + "step": 2121 + }, + { + "epoch": 1.4357239512855209, + "grad_norm": 1.0148740218596994, + "learning_rate": 6.2350586443908965e-06, + "loss": 0.4029, + "step": 2122 + }, + { + "epoch": 1.4364005412719891, + "grad_norm": 0.9839871761940735, + "learning_rate": 6.231243421699725e-06, + "loss": 0.4098, + "step": 2123 + }, + { + "epoch": 1.4370771312584574, + "grad_norm": 0.9675074879941716, + "learning_rate": 6.227427435703997e-06, + "loss": 0.4171, + "step": 2124 + }, + { + "epoch": 1.4377537212449256, + "grad_norm": 1.011359475891578, + "learning_rate": 6.223610688769418e-06, + "loss": 0.4101, + "step": 2125 + }, + { + "epoch": 1.4384303112313939, + "grad_norm": 0.9841656612063509, + "learning_rate": 6.219793183262165e-06, + "loss": 0.3997, + "step": 2126 + }, + { + "epoch": 1.439106901217862, + "grad_norm": 1.0397277570030254, + "learning_rate": 6.215974921548888e-06, + "loss": 0.4202, + "step": 2127 + }, + { + "epoch": 1.4397834912043301, + "grad_norm": 0.9854370592211071, + "learning_rate": 6.2121559059966995e-06, + "loss": 0.4138, + "step": 2128 + }, + { + "epoch": 1.4404600811907984, + "grad_norm": 1.0094625634095216, + "learning_rate": 6.2083361389731874e-06, + "loss": 0.3963, + "step": 2129 + }, + { + "epoch": 1.4411366711772666, + "grad_norm": 1.0825200264477999, + "learning_rate": 6.204515622846399e-06, + "loss": 0.4205, + "step": 2130 + }, + { + "epoch": 1.4418132611637349, + "grad_norm": 0.9902815540778717, + "learning_rate": 6.200694359984849e-06, + "loss": 0.3992, + "step": 2131 + }, + { + "epoch": 1.442489851150203, + "grad_norm": 1.0698463297234384, + "learning_rate": 6.1968723527575155e-06, + "loss": 0.4092, + "step": 2132 + }, + { + "epoch": 1.4431664411366711, + "grad_norm": 1.0377798757102696, + "learning_rate": 6.193049603533835e-06, + "loss": 0.4216, + "step": 2133 + }, + { + "epoch": 1.4438430311231394, + "grad_norm": 1.0075304616902754, + "learning_rate": 6.189226114683708e-06, + "loss": 0.4117, + "step": 2134 + }, + { + "epoch": 1.4445196211096076, + "grad_norm": 0.9797038194536343, + "learning_rate": 6.185401888577488e-06, + "loss": 0.3993, + "step": 2135 + }, + { + "epoch": 1.4451962110960759, + "grad_norm": 0.9999885464307031, + "learning_rate": 6.181576927585993e-06, + "loss": 0.403, + "step": 2136 + }, + { + "epoch": 1.445872801082544, + "grad_norm": 1.0130171281322573, + "learning_rate": 6.177751234080491e-06, + "loss": 0.4189, + "step": 2137 + }, + { + "epoch": 1.4465493910690121, + "grad_norm": 1.068396234897941, + "learning_rate": 6.173924810432705e-06, + "loss": 0.4254, + "step": 2138 + }, + { + "epoch": 1.4472259810554804, + "grad_norm": 1.030850482764561, + "learning_rate": 6.170097659014812e-06, + "loss": 0.4076, + "step": 2139 + }, + { + "epoch": 1.4479025710419486, + "grad_norm": 1.0792286942112153, + "learning_rate": 6.166269782199441e-06, + "loss": 0.4173, + "step": 2140 + }, + { + "epoch": 1.4485791610284169, + "grad_norm": 1.0148254137711399, + "learning_rate": 6.162441182359667e-06, + "loss": 0.408, + "step": 2141 + }, + { + "epoch": 1.449255751014885, + "grad_norm": 1.0562316910922944, + "learning_rate": 6.158611861869018e-06, + "loss": 0.4346, + "step": 2142 + }, + { + "epoch": 1.4499323410013532, + "grad_norm": 1.0185731414358785, + "learning_rate": 6.154781823101463e-06, + "loss": 0.4117, + "step": 2143 + }, + { + "epoch": 1.4506089309878214, + "grad_norm": 1.0430597801857664, + "learning_rate": 6.150951068431424e-06, + "loss": 0.4493, + "step": 2144 + }, + { + "epoch": 1.4512855209742896, + "grad_norm": 1.084614047825812, + "learning_rate": 6.147119600233758e-06, + "loss": 0.4253, + "step": 2145 + }, + { + "epoch": 1.451962110960758, + "grad_norm": 1.0144796122155888, + "learning_rate": 6.143287420883772e-06, + "loss": 0.4051, + "step": 2146 + }, + { + "epoch": 1.452638700947226, + "grad_norm": 1.0024725767130016, + "learning_rate": 6.1394545327572086e-06, + "loss": 0.421, + "step": 2147 + }, + { + "epoch": 1.4533152909336942, + "grad_norm": 1.0032027162996657, + "learning_rate": 6.135620938230254e-06, + "loss": 0.4253, + "step": 2148 + }, + { + "epoch": 1.4539918809201624, + "grad_norm": 1.0336166024199123, + "learning_rate": 6.131786639679527e-06, + "loss": 0.4356, + "step": 2149 + }, + { + "epoch": 1.4546684709066307, + "grad_norm": 0.9555994545271854, + "learning_rate": 6.127951639482088e-06, + "loss": 0.4114, + "step": 2150 + }, + { + "epoch": 1.455345060893099, + "grad_norm": 1.0132960523516634, + "learning_rate": 6.1241159400154306e-06, + "loss": 0.4209, + "step": 2151 + }, + { + "epoch": 1.456021650879567, + "grad_norm": 1.0214122607983387, + "learning_rate": 6.12027954365748e-06, + "loss": 0.4061, + "step": 2152 + }, + { + "epoch": 1.4566982408660352, + "grad_norm": 1.0266615457272228, + "learning_rate": 6.116442452786599e-06, + "loss": 0.4309, + "step": 2153 + }, + { + "epoch": 1.4573748308525034, + "grad_norm": 1.011781840536705, + "learning_rate": 6.112604669781572e-06, + "loss": 0.3876, + "step": 2154 + }, + { + "epoch": 1.4580514208389717, + "grad_norm": 1.0360269028968327, + "learning_rate": 6.108766197021623e-06, + "loss": 0.4088, + "step": 2155 + }, + { + "epoch": 1.45872801082544, + "grad_norm": 0.9855891213130734, + "learning_rate": 6.104927036886392e-06, + "loss": 0.3855, + "step": 2156 + }, + { + "epoch": 1.459404600811908, + "grad_norm": 1.0539991329465321, + "learning_rate": 6.101087191755958e-06, + "loss": 0.4295, + "step": 2157 + }, + { + "epoch": 1.4600811907983762, + "grad_norm": 0.9684348580901866, + "learning_rate": 6.097246664010813e-06, + "loss": 0.3954, + "step": 2158 + }, + { + "epoch": 1.4607577807848444, + "grad_norm": 0.9983027694439996, + "learning_rate": 6.09340545603188e-06, + "loss": 0.4062, + "step": 2159 + }, + { + "epoch": 1.4614343707713127, + "grad_norm": 1.0069651354132814, + "learning_rate": 6.0895635702004985e-06, + "loss": 0.4052, + "step": 2160 + }, + { + "epoch": 1.462110960757781, + "grad_norm": 0.9881751226110508, + "learning_rate": 6.085721008898434e-06, + "loss": 0.409, + "step": 2161 + }, + { + "epoch": 1.462787550744249, + "grad_norm": 0.9881654970084861, + "learning_rate": 6.081877774507864e-06, + "loss": 0.4092, + "step": 2162 + }, + { + "epoch": 1.4634641407307172, + "grad_norm": 1.045557510597242, + "learning_rate": 6.078033869411389e-06, + "loss": 0.4258, + "step": 2163 + }, + { + "epoch": 1.4641407307171854, + "grad_norm": 0.976336614296186, + "learning_rate": 6.0741892959920205e-06, + "loss": 0.3912, + "step": 2164 + }, + { + "epoch": 1.4648173207036537, + "grad_norm": 1.000923515659377, + "learning_rate": 6.070344056633189e-06, + "loss": 0.402, + "step": 2165 + }, + { + "epoch": 1.465493910690122, + "grad_norm": 1.0262794468116898, + "learning_rate": 6.066498153718735e-06, + "loss": 0.4124, + "step": 2166 + }, + { + "epoch": 1.46617050067659, + "grad_norm": 0.9873685270438126, + "learning_rate": 6.062651589632911e-06, + "loss": 0.4041, + "step": 2167 + }, + { + "epoch": 1.4668470906630582, + "grad_norm": 0.9844673104985412, + "learning_rate": 6.05880436676038e-06, + "loss": 0.4114, + "step": 2168 + }, + { + "epoch": 1.4675236806495264, + "grad_norm": 0.9890288008350204, + "learning_rate": 6.054956487486212e-06, + "loss": 0.3869, + "step": 2169 + }, + { + "epoch": 1.4682002706359945, + "grad_norm": 1.0490782938832361, + "learning_rate": 6.0511079541958825e-06, + "loss": 0.4363, + "step": 2170 + }, + { + "epoch": 1.468876860622463, + "grad_norm": 1.0553022937329775, + "learning_rate": 6.04725876927528e-06, + "loss": 0.4331, + "step": 2171 + }, + { + "epoch": 1.469553450608931, + "grad_norm": 1.0339233337705305, + "learning_rate": 6.043408935110688e-06, + "loss": 0.4149, + "step": 2172 + }, + { + "epoch": 1.4702300405953992, + "grad_norm": 1.0117611069346526, + "learning_rate": 6.039558454088796e-06, + "loss": 0.4212, + "step": 2173 + }, + { + "epoch": 1.4709066305818674, + "grad_norm": 1.0203444894179419, + "learning_rate": 6.035707328596698e-06, + "loss": 0.4112, + "step": 2174 + }, + { + "epoch": 1.4715832205683355, + "grad_norm": 0.9874046098207819, + "learning_rate": 6.0318555610218796e-06, + "loss": 0.4023, + "step": 2175 + }, + { + "epoch": 1.472259810554804, + "grad_norm": 1.0204399045804808, + "learning_rate": 6.0280031537522335e-06, + "loss": 0.4057, + "step": 2176 + }, + { + "epoch": 1.472936400541272, + "grad_norm": 1.0045546043147813, + "learning_rate": 6.02415010917604e-06, + "loss": 0.4088, + "step": 2177 + }, + { + "epoch": 1.4736129905277402, + "grad_norm": 0.9858879316465735, + "learning_rate": 6.020296429681985e-06, + "loss": 0.3895, + "step": 2178 + }, + { + "epoch": 1.4742895805142084, + "grad_norm": 1.0639995087095757, + "learning_rate": 6.016442117659135e-06, + "loss": 0.4374, + "step": 2179 + }, + { + "epoch": 1.4749661705006765, + "grad_norm": 1.030599144176719, + "learning_rate": 6.0125871754969614e-06, + "loss": 0.4104, + "step": 2180 + }, + { + "epoch": 1.4756427604871447, + "grad_norm": 1.0499180208111727, + "learning_rate": 6.0087316055853175e-06, + "loss": 0.4295, + "step": 2181 + }, + { + "epoch": 1.476319350473613, + "grad_norm": 1.0024112069476467, + "learning_rate": 6.00487541031445e-06, + "loss": 0.4093, + "step": 2182 + }, + { + "epoch": 1.4769959404600812, + "grad_norm": 1.0045344325247096, + "learning_rate": 6.001018592074991e-06, + "loss": 0.3969, + "step": 2183 + }, + { + "epoch": 1.4776725304465494, + "grad_norm": 1.0484222432402965, + "learning_rate": 5.997161153257963e-06, + "loss": 0.4168, + "step": 2184 + }, + { + "epoch": 1.4783491204330175, + "grad_norm": 0.9747271322636657, + "learning_rate": 5.9933030962547656e-06, + "loss": 0.3912, + "step": 2185 + }, + { + "epoch": 1.4790257104194857, + "grad_norm": 0.9710713217965347, + "learning_rate": 5.989444423457189e-06, + "loss": 0.4005, + "step": 2186 + }, + { + "epoch": 1.479702300405954, + "grad_norm": 1.0312150264264268, + "learning_rate": 5.985585137257401e-06, + "loss": 0.4015, + "step": 2187 + }, + { + "epoch": 1.4803788903924222, + "grad_norm": 0.9633740731424717, + "learning_rate": 5.981725240047954e-06, + "loss": 0.3908, + "step": 2188 + }, + { + "epoch": 1.4810554803788905, + "grad_norm": 0.9531478331831216, + "learning_rate": 5.977864734221773e-06, + "loss": 0.3845, + "step": 2189 + }, + { + "epoch": 1.4817320703653585, + "grad_norm": 1.0021867670006501, + "learning_rate": 5.974003622172167e-06, + "loss": 0.4056, + "step": 2190 + }, + { + "epoch": 1.4824086603518267, + "grad_norm": 0.971336601001469, + "learning_rate": 5.9701419062928125e-06, + "loss": 0.3924, + "step": 2191 + }, + { + "epoch": 1.483085250338295, + "grad_norm": 0.9917086008158201, + "learning_rate": 5.9662795889777666e-06, + "loss": 0.4317, + "step": 2192 + }, + { + "epoch": 1.4837618403247632, + "grad_norm": 1.0056804736424774, + "learning_rate": 5.962416672621461e-06, + "loss": 0.4243, + "step": 2193 + }, + { + "epoch": 1.4844384303112315, + "grad_norm": 1.0147683891138681, + "learning_rate": 5.958553159618693e-06, + "loss": 0.4154, + "step": 2194 + }, + { + "epoch": 1.4851150202976995, + "grad_norm": 1.0097165966172468, + "learning_rate": 5.954689052364633e-06, + "loss": 0.404, + "step": 2195 + }, + { + "epoch": 1.4857916102841677, + "grad_norm": 1.0387173804317473, + "learning_rate": 5.950824353254818e-06, + "loss": 0.4005, + "step": 2196 + }, + { + "epoch": 1.486468200270636, + "grad_norm": 1.043193783611487, + "learning_rate": 5.946959064685156e-06, + "loss": 0.413, + "step": 2197 + }, + { + "epoch": 1.4871447902571042, + "grad_norm": 1.0233606891645106, + "learning_rate": 5.943093189051916e-06, + "loss": 0.4177, + "step": 2198 + }, + { + "epoch": 1.4878213802435725, + "grad_norm": 1.0477094318719762, + "learning_rate": 5.939226728751733e-06, + "loss": 0.4386, + "step": 2199 + }, + { + "epoch": 1.4884979702300405, + "grad_norm": 0.983423855482106, + "learning_rate": 5.9353596861816e-06, + "loss": 0.3966, + "step": 2200 + }, + { + "epoch": 1.4891745602165087, + "grad_norm": 0.9964883532351336, + "learning_rate": 5.931492063738882e-06, + "loss": 0.4394, + "step": 2201 + }, + { + "epoch": 1.489851150202977, + "grad_norm": 0.978593472732646, + "learning_rate": 5.92762386382129e-06, + "loss": 0.4026, + "step": 2202 + }, + { + "epoch": 1.4905277401894452, + "grad_norm": 1.0112570921479211, + "learning_rate": 5.9237550888269045e-06, + "loss": 0.408, + "step": 2203 + }, + { + "epoch": 1.4912043301759135, + "grad_norm": 0.9935296823955936, + "learning_rate": 5.919885741154155e-06, + "loss": 0.4171, + "step": 2204 + }, + { + "epoch": 1.4918809201623815, + "grad_norm": 1.033162613222971, + "learning_rate": 5.916015823201827e-06, + "loss": 0.4321, + "step": 2205 + }, + { + "epoch": 1.4925575101488497, + "grad_norm": 0.9962948137010584, + "learning_rate": 5.912145337369064e-06, + "loss": 0.3992, + "step": 2206 + }, + { + "epoch": 1.493234100135318, + "grad_norm": 0.9590877277172909, + "learning_rate": 5.908274286055358e-06, + "loss": 0.3957, + "step": 2207 + }, + { + "epoch": 1.4939106901217862, + "grad_norm": 0.995820507945258, + "learning_rate": 5.904402671660551e-06, + "loss": 0.3919, + "step": 2208 + }, + { + "epoch": 1.4945872801082545, + "grad_norm": 1.0116404554341, + "learning_rate": 5.900530496584834e-06, + "loss": 0.41, + "step": 2209 + }, + { + "epoch": 1.4952638700947225, + "grad_norm": 1.070836313357937, + "learning_rate": 5.8966577632287506e-06, + "loss": 0.4464, + "step": 2210 + }, + { + "epoch": 1.4959404600811907, + "grad_norm": 1.0142241367544296, + "learning_rate": 5.892784473993184e-06, + "loss": 0.4204, + "step": 2211 + }, + { + "epoch": 1.496617050067659, + "grad_norm": 0.9766416633150202, + "learning_rate": 5.888910631279366e-06, + "loss": 0.4072, + "step": 2212 + }, + { + "epoch": 1.4972936400541272, + "grad_norm": 1.0323348362597207, + "learning_rate": 5.885036237488868e-06, + "loss": 0.4228, + "step": 2213 + }, + { + "epoch": 1.4979702300405955, + "grad_norm": 0.9965720514178534, + "learning_rate": 5.88116129502361e-06, + "loss": 0.424, + "step": 2214 + }, + { + "epoch": 1.4986468200270635, + "grad_norm": 0.97259341698905, + "learning_rate": 5.8772858062858414e-06, + "loss": 0.406, + "step": 2215 + }, + { + "epoch": 1.4993234100135318, + "grad_norm": 0.9522869853187673, + "learning_rate": 5.873409773678163e-06, + "loss": 0.3924, + "step": 2216 + }, + { + "epoch": 1.5, + "grad_norm": 1.009839991700367, + "learning_rate": 5.869533199603498e-06, + "loss": 0.4014, + "step": 2217 + }, + { + "epoch": 1.5006765899864682, + "grad_norm": 0.9636034088064258, + "learning_rate": 5.8656560864651225e-06, + "loss": 0.3903, + "step": 2218 + }, + { + "epoch": 1.5013531799729365, + "grad_norm": 1.0144118187504318, + "learning_rate": 5.861778436666631e-06, + "loss": 0.425, + "step": 2219 + }, + { + "epoch": 1.5020297699594045, + "grad_norm": 0.9916576777743628, + "learning_rate": 5.857900252611959e-06, + "loss": 0.4061, + "step": 2220 + }, + { + "epoch": 1.5027063599458728, + "grad_norm": 0.9706289075843019, + "learning_rate": 5.854021536705373e-06, + "loss": 0.3867, + "step": 2221 + }, + { + "epoch": 1.503382949932341, + "grad_norm": 0.971966039458518, + "learning_rate": 5.8501422913514665e-06, + "loss": 0.3951, + "step": 2222 + }, + { + "epoch": 1.5040595399188093, + "grad_norm": 0.985326801428694, + "learning_rate": 5.846262518955163e-06, + "loss": 0.4032, + "step": 2223 + }, + { + "epoch": 1.5047361299052775, + "grad_norm": 0.9551996737468442, + "learning_rate": 5.842382221921711e-06, + "loss": 0.3962, + "step": 2224 + }, + { + "epoch": 1.5054127198917455, + "grad_norm": 0.9561646383114321, + "learning_rate": 5.838501402656688e-06, + "loss": 0.3946, + "step": 2225 + }, + { + "epoch": 1.5060893098782138, + "grad_norm": 1.0144061776251505, + "learning_rate": 5.83462006356599e-06, + "loss": 0.4134, + "step": 2226 + }, + { + "epoch": 1.506765899864682, + "grad_norm": 1.005883555201239, + "learning_rate": 5.830738207055841e-06, + "loss": 0.421, + "step": 2227 + }, + { + "epoch": 1.5074424898511503, + "grad_norm": 1.0269198057962325, + "learning_rate": 5.8268558355327795e-06, + "loss": 0.4198, + "step": 2228 + }, + { + "epoch": 1.5081190798376185, + "grad_norm": 1.0346316593454372, + "learning_rate": 5.82297295140367e-06, + "loss": 0.4206, + "step": 2229 + }, + { + "epoch": 1.5087956698240865, + "grad_norm": 1.000171251123499, + "learning_rate": 5.819089557075689e-06, + "loss": 0.4125, + "step": 2230 + }, + { + "epoch": 1.5094722598105548, + "grad_norm": 0.9753200371400269, + "learning_rate": 5.815205654956333e-06, + "loss": 0.401, + "step": 2231 + }, + { + "epoch": 1.510148849797023, + "grad_norm": 1.0139267001233923, + "learning_rate": 5.811321247453409e-06, + "loss": 0.4191, + "step": 2232 + }, + { + "epoch": 1.510825439783491, + "grad_norm": 1.0389399982637497, + "learning_rate": 5.807436336975045e-06, + "loss": 0.4456, + "step": 2233 + }, + { + "epoch": 1.5115020297699595, + "grad_norm": 0.9659952802787669, + "learning_rate": 5.803550925929673e-06, + "loss": 0.3862, + "step": 2234 + }, + { + "epoch": 1.5121786197564275, + "grad_norm": 1.0134302993890432, + "learning_rate": 5.799665016726039e-06, + "loss": 0.4042, + "step": 2235 + }, + { + "epoch": 1.5128552097428958, + "grad_norm": 0.976441311394339, + "learning_rate": 5.795778611773197e-06, + "loss": 0.4035, + "step": 2236 + }, + { + "epoch": 1.513531799729364, + "grad_norm": 1.0074819417472705, + "learning_rate": 5.791891713480509e-06, + "loss": 0.4151, + "step": 2237 + }, + { + "epoch": 1.514208389715832, + "grad_norm": 0.9959722043595142, + "learning_rate": 5.788004324257643e-06, + "loss": 0.4117, + "step": 2238 + }, + { + "epoch": 1.5148849797023005, + "grad_norm": 1.01420060485514, + "learning_rate": 5.784116446514571e-06, + "loss": 0.4131, + "step": 2239 + }, + { + "epoch": 1.5155615696887685, + "grad_norm": 0.9829139365052306, + "learning_rate": 5.780228082661564e-06, + "loss": 0.4041, + "step": 2240 + }, + { + "epoch": 1.5162381596752368, + "grad_norm": 1.0053876242969089, + "learning_rate": 5.776339235109203e-06, + "loss": 0.422, + "step": 2241 + }, + { + "epoch": 1.516914749661705, + "grad_norm": 1.013534384685476, + "learning_rate": 5.772449906268362e-06, + "loss": 0.4136, + "step": 2242 + }, + { + "epoch": 1.517591339648173, + "grad_norm": 1.0148785727587901, + "learning_rate": 5.768560098550213e-06, + "loss": 0.4031, + "step": 2243 + }, + { + "epoch": 1.5182679296346415, + "grad_norm": 1.0474190558815468, + "learning_rate": 5.764669814366231e-06, + "loss": 0.4234, + "step": 2244 + }, + { + "epoch": 1.5189445196211095, + "grad_norm": 1.0018100450636278, + "learning_rate": 5.760779056128178e-06, + "loss": 0.4046, + "step": 2245 + }, + { + "epoch": 1.5196211096075778, + "grad_norm": 1.0116566315373592, + "learning_rate": 5.756887826248118e-06, + "loss": 0.4063, + "step": 2246 + }, + { + "epoch": 1.520297699594046, + "grad_norm": 0.9556860738614604, + "learning_rate": 5.752996127138404e-06, + "loss": 0.382, + "step": 2247 + }, + { + "epoch": 1.520974289580514, + "grad_norm": 1.0173269939376488, + "learning_rate": 5.749103961211679e-06, + "loss": 0.4043, + "step": 2248 + }, + { + "epoch": 1.5216508795669825, + "grad_norm": 0.9942473878490836, + "learning_rate": 5.745211330880872e-06, + "loss": 0.4094, + "step": 2249 + }, + { + "epoch": 1.5223274695534506, + "grad_norm": 0.9556359663419925, + "learning_rate": 5.74131823855921e-06, + "loss": 0.3828, + "step": 2250 + }, + { + "epoch": 1.5230040595399188, + "grad_norm": 1.0265181287659098, + "learning_rate": 5.737424686660198e-06, + "loss": 0.4019, + "step": 2251 + }, + { + "epoch": 1.523680649526387, + "grad_norm": 0.9726488479080468, + "learning_rate": 5.733530677597627e-06, + "loss": 0.3878, + "step": 2252 + }, + { + "epoch": 1.524357239512855, + "grad_norm": 1.0499864864766668, + "learning_rate": 5.729636213785574e-06, + "loss": 0.423, + "step": 2253 + }, + { + "epoch": 1.5250338294993235, + "grad_norm": 1.0068708560616513, + "learning_rate": 5.725741297638399e-06, + "loss": 0.4278, + "step": 2254 + }, + { + "epoch": 1.5257104194857916, + "grad_norm": 1.034615105276963, + "learning_rate": 5.721845931570734e-06, + "loss": 0.4387, + "step": 2255 + }, + { + "epoch": 1.5263870094722598, + "grad_norm": 1.0290506331977352, + "learning_rate": 5.717950117997502e-06, + "loss": 0.433, + "step": 2256 + }, + { + "epoch": 1.527063599458728, + "grad_norm": 0.9811439512559603, + "learning_rate": 5.714053859333893e-06, + "loss": 0.4013, + "step": 2257 + }, + { + "epoch": 1.527740189445196, + "grad_norm": 0.9810755960910839, + "learning_rate": 5.710157157995382e-06, + "loss": 0.4209, + "step": 2258 + }, + { + "epoch": 1.5284167794316645, + "grad_norm": 1.0003383439623459, + "learning_rate": 5.70626001639771e-06, + "loss": 0.4003, + "step": 2259 + }, + { + "epoch": 1.5290933694181326, + "grad_norm": 1.0134598468978735, + "learning_rate": 5.702362436956895e-06, + "loss": 0.4277, + "step": 2260 + }, + { + "epoch": 1.5297699594046008, + "grad_norm": 0.9768564362949052, + "learning_rate": 5.6984644220892295e-06, + "loss": 0.3834, + "step": 2261 + }, + { + "epoch": 1.530446549391069, + "grad_norm": 1.0157926527820418, + "learning_rate": 5.694565974211267e-06, + "loss": 0.4137, + "step": 2262 + }, + { + "epoch": 1.531123139377537, + "grad_norm": 1.0021830220183225, + "learning_rate": 5.69066709573984e-06, + "loss": 0.4025, + "step": 2263 + }, + { + "epoch": 1.5317997293640055, + "grad_norm": 0.9950282307580286, + "learning_rate": 5.686767789092041e-06, + "loss": 0.4057, + "step": 2264 + }, + { + "epoch": 1.5324763193504736, + "grad_norm": 0.9449335611367016, + "learning_rate": 5.6828680566852314e-06, + "loss": 0.3674, + "step": 2265 + }, + { + "epoch": 1.5331529093369418, + "grad_norm": 1.0421155842733352, + "learning_rate": 5.678967900937032e-06, + "loss": 0.4137, + "step": 2266 + }, + { + "epoch": 1.53382949932341, + "grad_norm": 0.9408262218022814, + "learning_rate": 5.675067324265332e-06, + "loss": 0.3802, + "step": 2267 + }, + { + "epoch": 1.534506089309878, + "grad_norm": 0.996290281295342, + "learning_rate": 5.671166329088278e-06, + "loss": 0.4076, + "step": 2268 + }, + { + "epoch": 1.5351826792963466, + "grad_norm": 0.9779660773938046, + "learning_rate": 5.667264917824277e-06, + "loss": 0.3816, + "step": 2269 + }, + { + "epoch": 1.5358592692828146, + "grad_norm": 0.991178142562463, + "learning_rate": 5.663363092891991e-06, + "loss": 0.4073, + "step": 2270 + }, + { + "epoch": 1.5365358592692828, + "grad_norm": 0.9668804347055321, + "learning_rate": 5.659460856710346e-06, + "loss": 0.3883, + "step": 2271 + }, + { + "epoch": 1.537212449255751, + "grad_norm": 0.9683349351541414, + "learning_rate": 5.655558211698513e-06, + "loss": 0.3926, + "step": 2272 + }, + { + "epoch": 1.537889039242219, + "grad_norm": 1.017749415000904, + "learning_rate": 5.651655160275925e-06, + "loss": 0.3997, + "step": 2273 + }, + { + "epoch": 1.5385656292286876, + "grad_norm": 1.0059778627270217, + "learning_rate": 5.647751704862263e-06, + "loss": 0.4133, + "step": 2274 + }, + { + "epoch": 1.5392422192151556, + "grad_norm": 0.9583458928957137, + "learning_rate": 5.643847847877458e-06, + "loss": 0.3857, + "step": 2275 + }, + { + "epoch": 1.5399188092016238, + "grad_norm": 0.9918021848342551, + "learning_rate": 5.639943591741691e-06, + "loss": 0.4095, + "step": 2276 + }, + { + "epoch": 1.540595399188092, + "grad_norm": 1.039030873789592, + "learning_rate": 5.636038938875391e-06, + "loss": 0.4143, + "step": 2277 + }, + { + "epoch": 1.54127198917456, + "grad_norm": 0.9961728205433525, + "learning_rate": 5.632133891699232e-06, + "loss": 0.4002, + "step": 2278 + }, + { + "epoch": 1.5419485791610286, + "grad_norm": 1.0206552817038608, + "learning_rate": 5.628228452634132e-06, + "loss": 0.4271, + "step": 2279 + }, + { + "epoch": 1.5426251691474966, + "grad_norm": 0.9854776972135874, + "learning_rate": 5.624322624101255e-06, + "loss": 0.4144, + "step": 2280 + }, + { + "epoch": 1.5433017591339648, + "grad_norm": 0.9846204437039179, + "learning_rate": 5.620416408522002e-06, + "loss": 0.3846, + "step": 2281 + }, + { + "epoch": 1.543978349120433, + "grad_norm": 0.9734074897864438, + "learning_rate": 5.616509808318017e-06, + "loss": 0.3956, + "step": 2282 + }, + { + "epoch": 1.544654939106901, + "grad_norm": 0.9593990971851657, + "learning_rate": 5.612602825911179e-06, + "loss": 0.3977, + "step": 2283 + }, + { + "epoch": 1.5453315290933696, + "grad_norm": 1.011829251442061, + "learning_rate": 5.608695463723614e-06, + "loss": 0.3878, + "step": 2284 + }, + { + "epoch": 1.5460081190798376, + "grad_norm": 0.9979774715964842, + "learning_rate": 5.604787724177666e-06, + "loss": 0.4093, + "step": 2285 + }, + { + "epoch": 1.5466847090663058, + "grad_norm": 0.9995402562129577, + "learning_rate": 5.600879609695929e-06, + "loss": 0.4019, + "step": 2286 + }, + { + "epoch": 1.547361299052774, + "grad_norm": 0.9885585941891722, + "learning_rate": 5.596971122701221e-06, + "loss": 0.3953, + "step": 2287 + }, + { + "epoch": 1.548037889039242, + "grad_norm": 1.0157358178499796, + "learning_rate": 5.593062265616598e-06, + "loss": 0.4108, + "step": 2288 + }, + { + "epoch": 1.5487144790257106, + "grad_norm": 1.0101678433679804, + "learning_rate": 5.589153040865333e-06, + "loss": 0.3982, + "step": 2289 + }, + { + "epoch": 1.5493910690121786, + "grad_norm": 0.9783707287552068, + "learning_rate": 5.585243450870941e-06, + "loss": 0.3984, + "step": 2290 + }, + { + "epoch": 1.5500676589986468, + "grad_norm": 1.0180500062920153, + "learning_rate": 5.581333498057153e-06, + "loss": 0.4109, + "step": 2291 + }, + { + "epoch": 1.550744248985115, + "grad_norm": 0.9728140985192564, + "learning_rate": 5.577423184847932e-06, + "loss": 0.3989, + "step": 2292 + }, + { + "epoch": 1.5514208389715831, + "grad_norm": 1.0024004656794963, + "learning_rate": 5.573512513667459e-06, + "loss": 0.4061, + "step": 2293 + }, + { + "epoch": 1.5520974289580516, + "grad_norm": 1.0317757167651191, + "learning_rate": 5.56960148694014e-06, + "loss": 0.4211, + "step": 2294 + }, + { + "epoch": 1.5527740189445196, + "grad_norm": 1.041495961186371, + "learning_rate": 5.565690107090603e-06, + "loss": 0.4235, + "step": 2295 + }, + { + "epoch": 1.5534506089309879, + "grad_norm": 0.9670002460128947, + "learning_rate": 5.5617783765436894e-06, + "loss": 0.4011, + "step": 2296 + }, + { + "epoch": 1.554127198917456, + "grad_norm": 1.0031794362545174, + "learning_rate": 5.557866297724462e-06, + "loss": 0.4095, + "step": 2297 + }, + { + "epoch": 1.5548037889039241, + "grad_norm": 1.035520247341661, + "learning_rate": 5.553953873058201e-06, + "loss": 0.4095, + "step": 2298 + }, + { + "epoch": 1.5554803788903924, + "grad_norm": 1.028372688486636, + "learning_rate": 5.550041104970398e-06, + "loss": 0.4029, + "step": 2299 + }, + { + "epoch": 1.5561569688768606, + "grad_norm": 1.0468296714090302, + "learning_rate": 5.5461279958867556e-06, + "loss": 0.4129, + "step": 2300 + }, + { + "epoch": 1.5568335588633289, + "grad_norm": 0.9934287194979324, + "learning_rate": 5.542214548233195e-06, + "loss": 0.4019, + "step": 2301 + }, + { + "epoch": 1.557510148849797, + "grad_norm": 0.9697645355800891, + "learning_rate": 5.538300764435838e-06, + "loss": 0.3932, + "step": 2302 + }, + { + "epoch": 1.5581867388362651, + "grad_norm": 0.960511373094424, + "learning_rate": 5.534386646921023e-06, + "loss": 0.3925, + "step": 2303 + }, + { + "epoch": 1.5588633288227334, + "grad_norm": 0.9698412020165607, + "learning_rate": 5.530472198115291e-06, + "loss": 0.4018, + "step": 2304 + }, + { + "epoch": 1.5595399188092016, + "grad_norm": 1.0075412021710797, + "learning_rate": 5.52655742044539e-06, + "loss": 0.4014, + "step": 2305 + }, + { + "epoch": 1.5602165087956699, + "grad_norm": 0.9531646833014361, + "learning_rate": 5.522642316338268e-06, + "loss": 0.3907, + "step": 2306 + }, + { + "epoch": 1.560893098782138, + "grad_norm": 0.975687940759243, + "learning_rate": 5.518726888221082e-06, + "loss": 0.4209, + "step": 2307 + }, + { + "epoch": 1.5615696887686061, + "grad_norm": 0.9912853978664564, + "learning_rate": 5.514811138521186e-06, + "loss": 0.4105, + "step": 2308 + }, + { + "epoch": 1.5622462787550744, + "grad_norm": 1.0186549000050666, + "learning_rate": 5.510895069666132e-06, + "loss": 0.4078, + "step": 2309 + }, + { + "epoch": 1.5629228687415426, + "grad_norm": 0.958316811831813, + "learning_rate": 5.506978684083672e-06, + "loss": 0.3905, + "step": 2310 + }, + { + "epoch": 1.5635994587280109, + "grad_norm": 0.9774153785639785, + "learning_rate": 5.503061984201755e-06, + "loss": 0.399, + "step": 2311 + }, + { + "epoch": 1.5642760487144791, + "grad_norm": 0.955794405103259, + "learning_rate": 5.499144972448525e-06, + "loss": 0.3931, + "step": 2312 + }, + { + "epoch": 1.5649526387009471, + "grad_norm": 0.9774132951159862, + "learning_rate": 5.495227651252315e-06, + "loss": 0.3832, + "step": 2313 + }, + { + "epoch": 1.5656292286874154, + "grad_norm": 0.9987220192751699, + "learning_rate": 5.4913100230416536e-06, + "loss": 0.4018, + "step": 2314 + }, + { + "epoch": 1.5663058186738836, + "grad_norm": 1.0118246470256507, + "learning_rate": 5.48739209024526e-06, + "loss": 0.3949, + "step": 2315 + }, + { + "epoch": 1.5669824086603519, + "grad_norm": 1.0111098350208763, + "learning_rate": 5.483473855292043e-06, + "loss": 0.4113, + "step": 2316 + }, + { + "epoch": 1.5676589986468201, + "grad_norm": 1.011694614551445, + "learning_rate": 5.479555320611094e-06, + "loss": 0.4105, + "step": 2317 + }, + { + "epoch": 1.5683355886332881, + "grad_norm": 0.9944630703558041, + "learning_rate": 5.475636488631697e-06, + "loss": 0.4006, + "step": 2318 + }, + { + "epoch": 1.5690121786197564, + "grad_norm": 0.9691193619216525, + "learning_rate": 5.471717361783312e-06, + "loss": 0.3984, + "step": 2319 + }, + { + "epoch": 1.5696887686062246, + "grad_norm": 0.9681395025541236, + "learning_rate": 5.46779794249559e-06, + "loss": 0.402, + "step": 2320 + }, + { + "epoch": 1.5703653585926927, + "grad_norm": 1.0003940204844282, + "learning_rate": 5.463878233198358e-06, + "loss": 0.4187, + "step": 2321 + }, + { + "epoch": 1.5710419485791611, + "grad_norm": 0.9839529228017467, + "learning_rate": 5.459958236321625e-06, + "loss": 0.4055, + "step": 2322 + }, + { + "epoch": 1.5717185385656292, + "grad_norm": 0.9743159591644858, + "learning_rate": 5.4560379542955766e-06, + "loss": 0.4036, + "step": 2323 + }, + { + "epoch": 1.5723951285520974, + "grad_norm": 0.9310310633257152, + "learning_rate": 5.45211738955058e-06, + "loss": 0.3816, + "step": 2324 + }, + { + "epoch": 1.5730717185385656, + "grad_norm": 1.0129131101388067, + "learning_rate": 5.448196544517168e-06, + "loss": 0.4107, + "step": 2325 + }, + { + "epoch": 1.5737483085250337, + "grad_norm": 0.9800142033118875, + "learning_rate": 5.444275421626058e-06, + "loss": 0.3877, + "step": 2326 + }, + { + "epoch": 1.5744248985115021, + "grad_norm": 1.0316613940545272, + "learning_rate": 5.440354023308134e-06, + "loss": 0.4144, + "step": 2327 + }, + { + "epoch": 1.5751014884979702, + "grad_norm": 0.9644863429417769, + "learning_rate": 5.436432351994452e-06, + "loss": 0.3902, + "step": 2328 + }, + { + "epoch": 1.5757780784844384, + "grad_norm": 0.9648545223376626, + "learning_rate": 5.4325104101162345e-06, + "loss": 0.3985, + "step": 2329 + }, + { + "epoch": 1.5764546684709067, + "grad_norm": 1.003378886490529, + "learning_rate": 5.428588200104875e-06, + "loss": 0.4017, + "step": 2330 + }, + { + "epoch": 1.5771312584573747, + "grad_norm": 0.9680578123630208, + "learning_rate": 5.4246657243919345e-06, + "loss": 0.4041, + "step": 2331 + }, + { + "epoch": 1.5778078484438431, + "grad_norm": 1.0128271350654896, + "learning_rate": 5.420742985409132e-06, + "loss": 0.4245, + "step": 2332 + }, + { + "epoch": 1.5784844384303112, + "grad_norm": 0.9864696145754461, + "learning_rate": 5.41681998558836e-06, + "loss": 0.3989, + "step": 2333 + }, + { + "epoch": 1.5791610284167794, + "grad_norm": 1.0153830884177073, + "learning_rate": 5.412896727361663e-06, + "loss": 0.4159, + "step": 2334 + }, + { + "epoch": 1.5798376184032477, + "grad_norm": 1.0047043718136448, + "learning_rate": 5.408973213161251e-06, + "loss": 0.4168, + "step": 2335 + }, + { + "epoch": 1.5805142083897157, + "grad_norm": 0.9560154987677202, + "learning_rate": 5.405049445419488e-06, + "loss": 0.3977, + "step": 2336 + }, + { + "epoch": 1.5811907983761841, + "grad_norm": 1.0028043875909254, + "learning_rate": 5.401125426568904e-06, + "loss": 0.4032, + "step": 2337 + }, + { + "epoch": 1.5818673883626522, + "grad_norm": 1.0007929350169738, + "learning_rate": 5.397201159042176e-06, + "loss": 0.4088, + "step": 2338 + }, + { + "epoch": 1.5825439783491204, + "grad_norm": 0.9727404241204132, + "learning_rate": 5.393276645272139e-06, + "loss": 0.4074, + "step": 2339 + }, + { + "epoch": 1.5832205683355887, + "grad_norm": 1.0159029139278153, + "learning_rate": 5.3893518876917795e-06, + "loss": 0.4188, + "step": 2340 + }, + { + "epoch": 1.5838971583220567, + "grad_norm": 0.9950006384350888, + "learning_rate": 5.385426888734237e-06, + "loss": 0.4278, + "step": 2341 + }, + { + "epoch": 1.5845737483085252, + "grad_norm": 0.9691195495194651, + "learning_rate": 5.381501650832798e-06, + "loss": 0.4054, + "step": 2342 + }, + { + "epoch": 1.5852503382949932, + "grad_norm": 0.9892340362703014, + "learning_rate": 5.377576176420899e-06, + "loss": 0.3977, + "step": 2343 + }, + { + "epoch": 1.5859269282814614, + "grad_norm": 0.9905294500933729, + "learning_rate": 5.373650467932122e-06, + "loss": 0.4087, + "step": 2344 + }, + { + "epoch": 1.5866035182679297, + "grad_norm": 0.9884003949708602, + "learning_rate": 5.3697245278001956e-06, + "loss": 0.3924, + "step": 2345 + }, + { + "epoch": 1.5872801082543977, + "grad_norm": 1.0255615096231483, + "learning_rate": 5.365798358458989e-06, + "loss": 0.4116, + "step": 2346 + }, + { + "epoch": 1.5879566982408662, + "grad_norm": 0.9813091714942518, + "learning_rate": 5.361871962342519e-06, + "loss": 0.3929, + "step": 2347 + }, + { + "epoch": 1.5886332882273342, + "grad_norm": 0.9838680036315577, + "learning_rate": 5.357945341884936e-06, + "loss": 0.3949, + "step": 2348 + }, + { + "epoch": 1.5893098782138024, + "grad_norm": 0.9736474740941714, + "learning_rate": 5.354018499520536e-06, + "loss": 0.4057, + "step": 2349 + }, + { + "epoch": 1.5899864682002707, + "grad_norm": 0.9952129739574762, + "learning_rate": 5.350091437683746e-06, + "loss": 0.3984, + "step": 2350 + }, + { + "epoch": 1.5906630581867387, + "grad_norm": 1.0008006940252432, + "learning_rate": 5.346164158809136e-06, + "loss": 0.4107, + "step": 2351 + }, + { + "epoch": 1.5913396481732072, + "grad_norm": 0.9824612424974478, + "learning_rate": 5.342236665331407e-06, + "loss": 0.3954, + "step": 2352 + }, + { + "epoch": 1.5920162381596752, + "grad_norm": 0.9938453964896405, + "learning_rate": 5.338308959685391e-06, + "loss": 0.3972, + "step": 2353 + }, + { + "epoch": 1.5926928281461434, + "grad_norm": 1.0569164944507126, + "learning_rate": 5.334381044306057e-06, + "loss": 0.4054, + "step": 2354 + }, + { + "epoch": 1.5933694181326117, + "grad_norm": 0.990150980524875, + "learning_rate": 5.3304529216284974e-06, + "loss": 0.405, + "step": 2355 + }, + { + "epoch": 1.5940460081190797, + "grad_norm": 0.9685797819320492, + "learning_rate": 5.32652459408794e-06, + "loss": 0.3947, + "step": 2356 + }, + { + "epoch": 1.5947225981055482, + "grad_norm": 1.0024465568043461, + "learning_rate": 5.322596064119731e-06, + "loss": 0.4055, + "step": 2357 + }, + { + "epoch": 1.5953991880920162, + "grad_norm": 0.9920328432585775, + "learning_rate": 5.318667334159354e-06, + "loss": 0.4213, + "step": 2358 + }, + { + "epoch": 1.5960757780784844, + "grad_norm": 0.9765680870242228, + "learning_rate": 5.314738406642405e-06, + "loss": 0.3984, + "step": 2359 + }, + { + "epoch": 1.5967523680649527, + "grad_norm": 1.005283725203942, + "learning_rate": 5.310809284004608e-06, + "loss": 0.416, + "step": 2360 + }, + { + "epoch": 1.5974289580514207, + "grad_norm": 0.961858610443524, + "learning_rate": 5.306879968681808e-06, + "loss": 0.3816, + "step": 2361 + }, + { + "epoch": 1.5981055480378892, + "grad_norm": 0.9989041031269195, + "learning_rate": 5.30295046310997e-06, + "loss": 0.4014, + "step": 2362 + }, + { + "epoch": 1.5987821380243572, + "grad_norm": 0.9950318811336293, + "learning_rate": 5.299020769725172e-06, + "loss": 0.4052, + "step": 2363 + }, + { + "epoch": 1.5994587280108254, + "grad_norm": 0.9948212172466734, + "learning_rate": 5.2950908909636144e-06, + "loss": 0.3939, + "step": 2364 + }, + { + "epoch": 1.6001353179972937, + "grad_norm": 1.107328121830043, + "learning_rate": 5.2911608292616116e-06, + "loss": 0.4115, + "step": 2365 + }, + { + "epoch": 1.6008119079837617, + "grad_norm": 1.0294051869885203, + "learning_rate": 5.2872305870555874e-06, + "loss": 0.4012, + "step": 2366 + }, + { + "epoch": 1.6014884979702302, + "grad_norm": 0.9501992577493624, + "learning_rate": 5.2833001667820815e-06, + "loss": 0.4, + "step": 2367 + }, + { + "epoch": 1.6021650879566982, + "grad_norm": 0.9790989327468724, + "learning_rate": 5.279369570877742e-06, + "loss": 0.4052, + "step": 2368 + }, + { + "epoch": 1.6028416779431665, + "grad_norm": 1.0294988926251059, + "learning_rate": 5.275438801779328e-06, + "loss": 0.4221, + "step": 2369 + }, + { + "epoch": 1.6035182679296347, + "grad_norm": 0.9899006835399469, + "learning_rate": 5.271507861923701e-06, + "loss": 0.3871, + "step": 2370 + }, + { + "epoch": 1.6041948579161027, + "grad_norm": 0.9615138135448351, + "learning_rate": 5.267576753747839e-06, + "loss": 0.3903, + "step": 2371 + }, + { + "epoch": 1.6048714479025712, + "grad_norm": 1.0248485881243827, + "learning_rate": 5.263645479688807e-06, + "loss": 0.4108, + "step": 2372 + }, + { + "epoch": 1.6055480378890392, + "grad_norm": 1.0032178822921418, + "learning_rate": 5.2597140421837915e-06, + "loss": 0.3958, + "step": 2373 + }, + { + "epoch": 1.6062246278755075, + "grad_norm": 0.9727493284366812, + "learning_rate": 5.255782443670068e-06, + "loss": 0.3955, + "step": 2374 + }, + { + "epoch": 1.6069012178619757, + "grad_norm": 0.9653040391428355, + "learning_rate": 5.251850686585015e-06, + "loss": 0.3929, + "step": 2375 + }, + { + "epoch": 1.6075778078484437, + "grad_norm": 0.9697107515103446, + "learning_rate": 5.247918773366112e-06, + "loss": 0.3951, + "step": 2376 + }, + { + "epoch": 1.6082543978349122, + "grad_norm": 0.9399237390205499, + "learning_rate": 5.243986706450933e-06, + "loss": 0.3958, + "step": 2377 + }, + { + "epoch": 1.6089309878213802, + "grad_norm": 0.9319844417225434, + "learning_rate": 5.240054488277148e-06, + "loss": 0.3835, + "step": 2378 + }, + { + "epoch": 1.6096075778078485, + "grad_norm": 0.9830045208853183, + "learning_rate": 5.2361221212825175e-06, + "loss": 0.3961, + "step": 2379 + }, + { + "epoch": 1.6102841677943167, + "grad_norm": 0.9819929312852093, + "learning_rate": 5.2321896079048994e-06, + "loss": 0.3924, + "step": 2380 + }, + { + "epoch": 1.6109607577807847, + "grad_norm": 1.0094043944216367, + "learning_rate": 5.2282569505822414e-06, + "loss": 0.4044, + "step": 2381 + }, + { + "epoch": 1.6116373477672532, + "grad_norm": 1.02076523814372, + "learning_rate": 5.224324151752575e-06, + "loss": 0.4013, + "step": 2382 + }, + { + "epoch": 1.6123139377537212, + "grad_norm": 0.954164908647174, + "learning_rate": 5.220391213854028e-06, + "loss": 0.3812, + "step": 2383 + }, + { + "epoch": 1.6129905277401895, + "grad_norm": 0.986288816316368, + "learning_rate": 5.216458139324806e-06, + "loss": 0.3974, + "step": 2384 + }, + { + "epoch": 1.6136671177266577, + "grad_norm": 0.9596211458199466, + "learning_rate": 5.212524930603205e-06, + "loss": 0.3899, + "step": 2385 + }, + { + "epoch": 1.6143437077131257, + "grad_norm": 0.9468358695130175, + "learning_rate": 5.208591590127603e-06, + "loss": 0.3809, + "step": 2386 + }, + { + "epoch": 1.6150202976995942, + "grad_norm": 0.9676314826767896, + "learning_rate": 5.2046581203364585e-06, + "loss": 0.4009, + "step": 2387 + }, + { + "epoch": 1.6156968876860622, + "grad_norm": 0.9503631507105473, + "learning_rate": 5.200724523668311e-06, + "loss": 0.3966, + "step": 2388 + }, + { + "epoch": 1.6163734776725305, + "grad_norm": 0.995098593676996, + "learning_rate": 5.196790802561776e-06, + "loss": 0.4008, + "step": 2389 + }, + { + "epoch": 1.6170500676589987, + "grad_norm": 1.017968085182166, + "learning_rate": 5.192856959455552e-06, + "loss": 0.3968, + "step": 2390 + }, + { + "epoch": 1.6177266576454667, + "grad_norm": 0.9789732775986919, + "learning_rate": 5.188922996788409e-06, + "loss": 0.4187, + "step": 2391 + }, + { + "epoch": 1.618403247631935, + "grad_norm": 1.0108573019123042, + "learning_rate": 5.184988916999191e-06, + "loss": 0.4223, + "step": 2392 + }, + { + "epoch": 1.6190798376184032, + "grad_norm": 1.0106158369845477, + "learning_rate": 5.181054722526815e-06, + "loss": 0.4037, + "step": 2393 + }, + { + "epoch": 1.6197564276048715, + "grad_norm": 0.990290753929674, + "learning_rate": 5.177120415810271e-06, + "loss": 0.4034, + "step": 2394 + }, + { + "epoch": 1.6204330175913397, + "grad_norm": 1.0226528890275217, + "learning_rate": 5.173185999288615e-06, + "loss": 0.3997, + "step": 2395 + }, + { + "epoch": 1.6211096075778078, + "grad_norm": 0.9778349078234524, + "learning_rate": 5.1692514754009744e-06, + "loss": 0.3897, + "step": 2396 + }, + { + "epoch": 1.621786197564276, + "grad_norm": 1.007186403602729, + "learning_rate": 5.165316846586541e-06, + "loss": 0.4009, + "step": 2397 + }, + { + "epoch": 1.6224627875507442, + "grad_norm": 1.0023724528046685, + "learning_rate": 5.161382115284576e-06, + "loss": 0.3971, + "step": 2398 + }, + { + "epoch": 1.6231393775372125, + "grad_norm": 1.017661408717099, + "learning_rate": 5.1574472839343956e-06, + "loss": 0.4205, + "step": 2399 + }, + { + "epoch": 1.6238159675236807, + "grad_norm": 0.9487143093152202, + "learning_rate": 5.153512354975388e-06, + "loss": 0.3901, + "step": 2400 + }, + { + "epoch": 1.6244925575101488, + "grad_norm": 1.0116142038234044, + "learning_rate": 5.1495773308469935e-06, + "loss": 0.4142, + "step": 2401 + }, + { + "epoch": 1.625169147496617, + "grad_norm": 0.989352812414496, + "learning_rate": 5.145642213988716e-06, + "loss": 0.4017, + "step": 2402 + }, + { + "epoch": 1.6258457374830853, + "grad_norm": 1.0344613071561655, + "learning_rate": 5.1417070068401165e-06, + "loss": 0.4245, + "step": 2403 + }, + { + "epoch": 1.6265223274695535, + "grad_norm": 0.9608071894768904, + "learning_rate": 5.137771711840811e-06, + "loss": 0.4013, + "step": 2404 + }, + { + "epoch": 1.6271989174560217, + "grad_norm": 0.9400399139794705, + "learning_rate": 5.133836331430469e-06, + "loss": 0.3793, + "step": 2405 + }, + { + "epoch": 1.6278755074424898, + "grad_norm": 0.9709653980951414, + "learning_rate": 5.129900868048817e-06, + "loss": 0.3963, + "step": 2406 + }, + { + "epoch": 1.628552097428958, + "grad_norm": 0.9429233603810708, + "learning_rate": 5.1259653241356275e-06, + "loss": 0.3926, + "step": 2407 + }, + { + "epoch": 1.6292286874154263, + "grad_norm": 1.0217327433525651, + "learning_rate": 5.1220297021307275e-06, + "loss": 0.4026, + "step": 2408 + }, + { + "epoch": 1.6299052774018945, + "grad_norm": 1.0123619398831514, + "learning_rate": 5.11809400447399e-06, + "loss": 0.4031, + "step": 2409 + }, + { + "epoch": 1.6305818673883627, + "grad_norm": 1.0188862677375232, + "learning_rate": 5.114158233605334e-06, + "loss": 0.4104, + "step": 2410 + }, + { + "epoch": 1.6312584573748308, + "grad_norm": 1.0241358372537814, + "learning_rate": 5.110222391964728e-06, + "loss": 0.3973, + "step": 2411 + }, + { + "epoch": 1.631935047361299, + "grad_norm": 0.9663345385656488, + "learning_rate": 5.106286481992179e-06, + "loss": 0.3815, + "step": 2412 + }, + { + "epoch": 1.6326116373477673, + "grad_norm": 1.0022872276250057, + "learning_rate": 5.1023505061277405e-06, + "loss": 0.393, + "step": 2413 + }, + { + "epoch": 1.6332882273342353, + "grad_norm": 1.0059115627431798, + "learning_rate": 5.098414466811504e-06, + "loss": 0.418, + "step": 2414 + }, + { + "epoch": 1.6339648173207038, + "grad_norm": 0.9583990161145314, + "learning_rate": 5.094478366483604e-06, + "loss": 0.3999, + "step": 2415 + }, + { + "epoch": 1.6346414073071718, + "grad_norm": 0.9622835480998754, + "learning_rate": 5.090542207584207e-06, + "loss": 0.3834, + "step": 2416 + }, + { + "epoch": 1.63531799729364, + "grad_norm": 1.0064874711712921, + "learning_rate": 5.086605992553524e-06, + "loss": 0.4022, + "step": 2417 + }, + { + "epoch": 1.6359945872801083, + "grad_norm": 0.9908528598407368, + "learning_rate": 5.082669723831793e-06, + "loss": 0.3993, + "step": 2418 + }, + { + "epoch": 1.6366711772665763, + "grad_norm": 0.9615276986995829, + "learning_rate": 5.07873340385929e-06, + "loss": 0.3902, + "step": 2419 + }, + { + "epoch": 1.6373477672530448, + "grad_norm": 1.0495752425472613, + "learning_rate": 5.074797035076319e-06, + "loss": 0.4242, + "step": 2420 + }, + { + "epoch": 1.6380243572395128, + "grad_norm": 0.9585523749831784, + "learning_rate": 5.070860619923218e-06, + "loss": 0.3969, + "step": 2421 + }, + { + "epoch": 1.638700947225981, + "grad_norm": 1.015554940825568, + "learning_rate": 5.066924160840353e-06, + "loss": 0.4233, + "step": 2422 + }, + { + "epoch": 1.6393775372124493, + "grad_norm": 0.9467678571613855, + "learning_rate": 5.062987660268114e-06, + "loss": 0.388, + "step": 2423 + }, + { + "epoch": 1.6400541271989173, + "grad_norm": 1.000460880805651, + "learning_rate": 5.059051120646924e-06, + "loss": 0.4028, + "step": 2424 + }, + { + "epoch": 1.6407307171853858, + "grad_norm": 0.9797102196071796, + "learning_rate": 5.055114544417219e-06, + "loss": 0.4048, + "step": 2425 + }, + { + "epoch": 1.6414073071718538, + "grad_norm": 0.9702410011455744, + "learning_rate": 5.051177934019468e-06, + "loss": 0.3915, + "step": 2426 + }, + { + "epoch": 1.642083897158322, + "grad_norm": 0.9370772551216627, + "learning_rate": 5.047241291894156e-06, + "loss": 0.3731, + "step": 2427 + }, + { + "epoch": 1.6427604871447903, + "grad_norm": 1.026925123148177, + "learning_rate": 5.043304620481791e-06, + "loss": 0.407, + "step": 2428 + }, + { + "epoch": 1.6434370771312583, + "grad_norm": 0.9992844526014942, + "learning_rate": 5.039367922222894e-06, + "loss": 0.3979, + "step": 2429 + }, + { + "epoch": 1.6441136671177268, + "grad_norm": 1.0054139793882098, + "learning_rate": 5.035431199558008e-06, + "loss": 0.4016, + "step": 2430 + }, + { + "epoch": 1.6447902571041948, + "grad_norm": 0.9497452369957721, + "learning_rate": 5.031494454927688e-06, + "loss": 0.3975, + "step": 2431 + }, + { + "epoch": 1.645466847090663, + "grad_norm": 0.9291845580578972, + "learning_rate": 5.027557690772503e-06, + "loss": 0.3897, + "step": 2432 + }, + { + "epoch": 1.6461434370771313, + "grad_norm": 0.9440350987672868, + "learning_rate": 5.0236209095330344e-06, + "loss": 0.391, + "step": 2433 + }, + { + "epoch": 1.6468200270635993, + "grad_norm": 1.0391267060279628, + "learning_rate": 5.019684113649877e-06, + "loss": 0.4143, + "step": 2434 + }, + { + "epoch": 1.6474966170500678, + "grad_norm": 0.9543893791124238, + "learning_rate": 5.0157473055636285e-06, + "loss": 0.3975, + "step": 2435 + }, + { + "epoch": 1.6481732070365358, + "grad_norm": 1.0037319738186286, + "learning_rate": 5.011810487714901e-06, + "loss": 0.4081, + "step": 2436 + }, + { + "epoch": 1.648849797023004, + "grad_norm": 0.9959185049663246, + "learning_rate": 5.007873662544306e-06, + "loss": 0.3842, + "step": 2437 + }, + { + "epoch": 1.6495263870094723, + "grad_norm": 1.0285901605597476, + "learning_rate": 5.003936832492465e-06, + "loss": 0.4217, + "step": 2438 + }, + { + "epoch": 1.6502029769959403, + "grad_norm": 0.996720165050114, + "learning_rate": 5e-06, + "loss": 0.4069, + "step": 2439 + }, + { + "epoch": 1.6508795669824088, + "grad_norm": 1.0165325946286154, + "learning_rate": 4.9960631675075364e-06, + "loss": 0.4122, + "step": 2440 + }, + { + "epoch": 1.6515561569688768, + "grad_norm": 0.9838467350512484, + "learning_rate": 4.9921263374556946e-06, + "loss": 0.4112, + "step": 2441 + }, + { + "epoch": 1.652232746955345, + "grad_norm": 0.9358784159452869, + "learning_rate": 4.988189512285101e-06, + "loss": 0.3737, + "step": 2442 + }, + { + "epoch": 1.6529093369418133, + "grad_norm": 1.0590403923273386, + "learning_rate": 4.984252694436373e-06, + "loss": 0.4196, + "step": 2443 + }, + { + "epoch": 1.6535859269282813, + "grad_norm": 0.9860837940120097, + "learning_rate": 4.980315886350125e-06, + "loss": 0.4086, + "step": 2444 + }, + { + "epoch": 1.6542625169147498, + "grad_norm": 0.9410651720083746, + "learning_rate": 4.976379090466966e-06, + "loss": 0.3785, + "step": 2445 + }, + { + "epoch": 1.6549391069012178, + "grad_norm": 1.0073897065366546, + "learning_rate": 4.972442309227498e-06, + "loss": 0.4233, + "step": 2446 + }, + { + "epoch": 1.655615696887686, + "grad_norm": 0.9760170203175808, + "learning_rate": 4.968505545072314e-06, + "loss": 0.3953, + "step": 2447 + }, + { + "epoch": 1.6562922868741543, + "grad_norm": 0.9929780657948141, + "learning_rate": 4.964568800441993e-06, + "loss": 0.401, + "step": 2448 + }, + { + "epoch": 1.6569688768606223, + "grad_norm": 1.0440276577316752, + "learning_rate": 4.960632077777107e-06, + "loss": 0.4195, + "step": 2449 + }, + { + "epoch": 1.6576454668470908, + "grad_norm": 0.9456506570153204, + "learning_rate": 4.956695379518211e-06, + "loss": 0.3831, + "step": 2450 + }, + { + "epoch": 1.6583220568335588, + "grad_norm": 0.9965709761859097, + "learning_rate": 4.952758708105845e-06, + "loss": 0.4035, + "step": 2451 + }, + { + "epoch": 1.658998646820027, + "grad_norm": 1.0007676453145316, + "learning_rate": 4.948822065980533e-06, + "loss": 0.4133, + "step": 2452 + }, + { + "epoch": 1.6596752368064953, + "grad_norm": 1.018873735482474, + "learning_rate": 4.944885455582783e-06, + "loss": 0.4055, + "step": 2453 + }, + { + "epoch": 1.6603518267929633, + "grad_norm": 0.9989665958176519, + "learning_rate": 4.940948879353078e-06, + "loss": 0.4063, + "step": 2454 + }, + { + "epoch": 1.6610284167794318, + "grad_norm": 0.9575429430528978, + "learning_rate": 4.937012339731886e-06, + "loss": 0.3895, + "step": 2455 + }, + { + "epoch": 1.6617050067658998, + "grad_norm": 1.0094319061016028, + "learning_rate": 4.933075839159649e-06, + "loss": 0.414, + "step": 2456 + }, + { + "epoch": 1.662381596752368, + "grad_norm": 1.0013037553216197, + "learning_rate": 4.929139380076784e-06, + "loss": 0.4009, + "step": 2457 + }, + { + "epoch": 1.6630581867388363, + "grad_norm": 0.9701707451088951, + "learning_rate": 4.9252029649236835e-06, + "loss": 0.3904, + "step": 2458 + }, + { + "epoch": 1.6637347767253043, + "grad_norm": 1.1201475247092365, + "learning_rate": 4.921266596140712e-06, + "loss": 0.427, + "step": 2459 + }, + { + "epoch": 1.6644113667117728, + "grad_norm": 0.9840473255127528, + "learning_rate": 4.917330276168208e-06, + "loss": 0.4007, + "step": 2460 + }, + { + "epoch": 1.6650879566982408, + "grad_norm": 0.9613880804638744, + "learning_rate": 4.913394007446477e-06, + "loss": 0.4058, + "step": 2461 + }, + { + "epoch": 1.665764546684709, + "grad_norm": 0.9918636978939089, + "learning_rate": 4.909457792415793e-06, + "loss": 0.3991, + "step": 2462 + }, + { + "epoch": 1.6664411366711773, + "grad_norm": 0.9647445437940695, + "learning_rate": 4.905521633516399e-06, + "loss": 0.3828, + "step": 2463 + }, + { + "epoch": 1.6671177266576453, + "grad_norm": 1.052504476643506, + "learning_rate": 4.9015855331884984e-06, + "loss": 0.419, + "step": 2464 + }, + { + "epoch": 1.6677943166441138, + "grad_norm": 0.9525407604860707, + "learning_rate": 4.897649493872262e-06, + "loss": 0.3746, + "step": 2465 + }, + { + "epoch": 1.6684709066305818, + "grad_norm": 0.9277772490225074, + "learning_rate": 4.8937135180078236e-06, + "loss": 0.3884, + "step": 2466 + }, + { + "epoch": 1.66914749661705, + "grad_norm": 1.0117669002250895, + "learning_rate": 4.889777608035273e-06, + "loss": 0.4063, + "step": 2467 + }, + { + "epoch": 1.6698240866035183, + "grad_norm": 1.055597674608897, + "learning_rate": 4.8858417663946665e-06, + "loss": 0.4061, + "step": 2468 + }, + { + "epoch": 1.6705006765899864, + "grad_norm": 0.9758827935957363, + "learning_rate": 4.8819059955260105e-06, + "loss": 0.4125, + "step": 2469 + }, + { + "epoch": 1.6711772665764548, + "grad_norm": 1.0205766549635598, + "learning_rate": 4.877970297869273e-06, + "loss": 0.4306, + "step": 2470 + }, + { + "epoch": 1.6718538565629228, + "grad_norm": 0.9600860616578318, + "learning_rate": 4.874034675864373e-06, + "loss": 0.397, + "step": 2471 + }, + { + "epoch": 1.672530446549391, + "grad_norm": 0.974992711207263, + "learning_rate": 4.870099131951185e-06, + "loss": 0.3937, + "step": 2472 + }, + { + "epoch": 1.6732070365358593, + "grad_norm": 0.9862395000213485, + "learning_rate": 4.866163668569531e-06, + "loss": 0.415, + "step": 2473 + }, + { + "epoch": 1.6738836265223274, + "grad_norm": 0.9325508078781863, + "learning_rate": 4.862228288159191e-06, + "loss": 0.383, + "step": 2474 + }, + { + "epoch": 1.6745602165087958, + "grad_norm": 0.9730870854691235, + "learning_rate": 4.858292993159884e-06, + "loss": 0.4132, + "step": 2475 + }, + { + "epoch": 1.6752368064952639, + "grad_norm": 0.9273973957731927, + "learning_rate": 4.854357786011286e-06, + "loss": 0.3898, + "step": 2476 + }, + { + "epoch": 1.675913396481732, + "grad_norm": 0.9323393999201217, + "learning_rate": 4.850422669153009e-06, + "loss": 0.3856, + "step": 2477 + }, + { + "epoch": 1.6765899864682003, + "grad_norm": 0.9978979968034617, + "learning_rate": 4.846487645024614e-06, + "loss": 0.4136, + "step": 2478 + }, + { + "epoch": 1.6772665764546684, + "grad_norm": 0.9919189258668775, + "learning_rate": 4.842552716065605e-06, + "loss": 0.393, + "step": 2479 + }, + { + "epoch": 1.6779431664411368, + "grad_norm": 0.9860228586672177, + "learning_rate": 4.838617884715425e-06, + "loss": 0.3926, + "step": 2480 + }, + { + "epoch": 1.6786197564276049, + "grad_norm": 0.9497105596467433, + "learning_rate": 4.8346831534134595e-06, + "loss": 0.4007, + "step": 2481 + }, + { + "epoch": 1.679296346414073, + "grad_norm": 0.946837631080875, + "learning_rate": 4.830748524599026e-06, + "loss": 0.3793, + "step": 2482 + }, + { + "epoch": 1.6799729364005414, + "grad_norm": 0.9792036312094987, + "learning_rate": 4.826814000711388e-06, + "loss": 0.4066, + "step": 2483 + }, + { + "epoch": 1.6806495263870094, + "grad_norm": 0.9987975919217557, + "learning_rate": 4.822879584189732e-06, + "loss": 0.4023, + "step": 2484 + }, + { + "epoch": 1.6813261163734776, + "grad_norm": 0.974885705723936, + "learning_rate": 4.818945277473187e-06, + "loss": 0.3855, + "step": 2485 + }, + { + "epoch": 1.6820027063599459, + "grad_norm": 0.9920840388676263, + "learning_rate": 4.81501108300081e-06, + "loss": 0.4026, + "step": 2486 + }, + { + "epoch": 1.682679296346414, + "grad_norm": 0.9793117271477235, + "learning_rate": 4.811077003211592e-06, + "loss": 0.4056, + "step": 2487 + }, + { + "epoch": 1.6833558863328824, + "grad_norm": 0.9590240533571854, + "learning_rate": 4.807143040544448e-06, + "loss": 0.3866, + "step": 2488 + }, + { + "epoch": 1.6840324763193504, + "grad_norm": 0.9938264882104233, + "learning_rate": 4.803209197438224e-06, + "loss": 0.3956, + "step": 2489 + }, + { + "epoch": 1.6847090663058186, + "grad_norm": 0.9751500959166967, + "learning_rate": 4.799275476331692e-06, + "loss": 0.3935, + "step": 2490 + }, + { + "epoch": 1.6853856562922869, + "grad_norm": 0.9848046221330152, + "learning_rate": 4.795341879663543e-06, + "loss": 0.3991, + "step": 2491 + }, + { + "epoch": 1.6860622462787551, + "grad_norm": 0.9968903361134329, + "learning_rate": 4.791408409872398e-06, + "loss": 0.3877, + "step": 2492 + }, + { + "epoch": 1.6867388362652234, + "grad_norm": 0.9559870333735009, + "learning_rate": 4.787475069396796e-06, + "loss": 0.3815, + "step": 2493 + }, + { + "epoch": 1.6874154262516914, + "grad_norm": 1.0082635190519689, + "learning_rate": 4.783541860675195e-06, + "loss": 0.4023, + "step": 2494 + }, + { + "epoch": 1.6880920162381596, + "grad_norm": 0.9340656171911037, + "learning_rate": 4.779608786145974e-06, + "loss": 0.372, + "step": 2495 + }, + { + "epoch": 1.6887686062246279, + "grad_norm": 1.035107897292333, + "learning_rate": 4.775675848247427e-06, + "loss": 0.3865, + "step": 2496 + }, + { + "epoch": 1.6894451962110961, + "grad_norm": 0.9419311321149763, + "learning_rate": 4.771743049417761e-06, + "loss": 0.3835, + "step": 2497 + }, + { + "epoch": 1.6901217861975644, + "grad_norm": 0.9655304719839376, + "learning_rate": 4.767810392095102e-06, + "loss": 0.3812, + "step": 2498 + }, + { + "epoch": 1.6907983761840324, + "grad_norm": 1.0089002997344731, + "learning_rate": 4.763877878717484e-06, + "loss": 0.3771, + "step": 2499 + }, + { + "epoch": 1.6914749661705006, + "grad_norm": 1.0046405398410736, + "learning_rate": 4.759945511722854e-06, + "loss": 0.4076, + "step": 2500 + }, + { + "epoch": 1.6921515561569689, + "grad_norm": 1.0425561925430737, + "learning_rate": 4.756013293549067e-06, + "loss": 0.4041, + "step": 2501 + }, + { + "epoch": 1.6928281461434371, + "grad_norm": 1.0285633291185599, + "learning_rate": 4.752081226633888e-06, + "loss": 0.4041, + "step": 2502 + }, + { + "epoch": 1.6935047361299054, + "grad_norm": 0.9941045938081207, + "learning_rate": 4.748149313414987e-06, + "loss": 0.3897, + "step": 2503 + }, + { + "epoch": 1.6941813261163734, + "grad_norm": 0.9794331946229003, + "learning_rate": 4.744217556329935e-06, + "loss": 0.4013, + "step": 2504 + }, + { + "epoch": 1.6948579161028416, + "grad_norm": 0.9781149286705279, + "learning_rate": 4.740285957816211e-06, + "loss": 0.3984, + "step": 2505 + }, + { + "epoch": 1.69553450608931, + "grad_norm": 0.9883140881255817, + "learning_rate": 4.736354520311194e-06, + "loss": 0.4042, + "step": 2506 + }, + { + "epoch": 1.696211096075778, + "grad_norm": 0.9809521054755324, + "learning_rate": 4.732423246252164e-06, + "loss": 0.4019, + "step": 2507 + }, + { + "epoch": 1.6968876860622464, + "grad_norm": 0.9582374094988552, + "learning_rate": 4.728492138076299e-06, + "loss": 0.3904, + "step": 2508 + }, + { + "epoch": 1.6975642760487144, + "grad_norm": 1.0099402802386952, + "learning_rate": 4.724561198220672e-06, + "loss": 0.4152, + "step": 2509 + }, + { + "epoch": 1.6982408660351827, + "grad_norm": 0.9576961148741598, + "learning_rate": 4.7206304291222585e-06, + "loss": 0.412, + "step": 2510 + }, + { + "epoch": 1.698917456021651, + "grad_norm": 0.9957008712591706, + "learning_rate": 4.71669983321792e-06, + "loss": 0.3992, + "step": 2511 + }, + { + "epoch": 1.699594046008119, + "grad_norm": 0.9975015024573402, + "learning_rate": 4.712769412944413e-06, + "loss": 0.4093, + "step": 2512 + }, + { + "epoch": 1.7002706359945874, + "grad_norm": 1.0042184608801534, + "learning_rate": 4.70883917073839e-06, + "loss": 0.4211, + "step": 2513 + }, + { + "epoch": 1.7009472259810554, + "grad_norm": 0.9747676471139658, + "learning_rate": 4.704909109036387e-06, + "loss": 0.4074, + "step": 2514 + }, + { + "epoch": 1.7016238159675237, + "grad_norm": 1.0168200233189133, + "learning_rate": 4.700979230274829e-06, + "loss": 0.4067, + "step": 2515 + }, + { + "epoch": 1.702300405953992, + "grad_norm": 0.9894428805887173, + "learning_rate": 4.697049536890033e-06, + "loss": 0.3858, + "step": 2516 + }, + { + "epoch": 1.70297699594046, + "grad_norm": 0.963903563044763, + "learning_rate": 4.693120031318194e-06, + "loss": 0.4, + "step": 2517 + }, + { + "epoch": 1.7036535859269284, + "grad_norm": 0.9568426588036553, + "learning_rate": 4.6891907159953935e-06, + "loss": 0.4018, + "step": 2518 + }, + { + "epoch": 1.7043301759133964, + "grad_norm": 0.9947049613394754, + "learning_rate": 4.685261593357598e-06, + "loss": 0.406, + "step": 2519 + }, + { + "epoch": 1.7050067658998647, + "grad_norm": 1.0081997102706153, + "learning_rate": 4.681332665840647e-06, + "loss": 0.409, + "step": 2520 + }, + { + "epoch": 1.705683355886333, + "grad_norm": 0.949419793529757, + "learning_rate": 4.677403935880269e-06, + "loss": 0.3956, + "step": 2521 + }, + { + "epoch": 1.706359945872801, + "grad_norm": 1.0266615324466901, + "learning_rate": 4.673475405912061e-06, + "loss": 0.3994, + "step": 2522 + }, + { + "epoch": 1.7070365358592694, + "grad_norm": 0.9190420839204616, + "learning_rate": 4.669547078371503e-06, + "loss": 0.3902, + "step": 2523 + }, + { + "epoch": 1.7077131258457374, + "grad_norm": 0.9818924505177589, + "learning_rate": 4.6656189556939446e-06, + "loss": 0.3862, + "step": 2524 + }, + { + "epoch": 1.7083897158322057, + "grad_norm": 0.9706183912843495, + "learning_rate": 4.6616910403146095e-06, + "loss": 0.3762, + "step": 2525 + }, + { + "epoch": 1.709066305818674, + "grad_norm": 0.9735002002315447, + "learning_rate": 4.657763334668594e-06, + "loss": 0.3921, + "step": 2526 + }, + { + "epoch": 1.709742895805142, + "grad_norm": 0.9538228455638207, + "learning_rate": 4.653835841190865e-06, + "loss": 0.3955, + "step": 2527 + }, + { + "epoch": 1.7104194857916104, + "grad_norm": 0.9702689262931259, + "learning_rate": 4.649908562316255e-06, + "loss": 0.4019, + "step": 2528 + }, + { + "epoch": 1.7110960757780784, + "grad_norm": 0.9772188973398663, + "learning_rate": 4.645981500479466e-06, + "loss": 0.3902, + "step": 2529 + }, + { + "epoch": 1.7117726657645467, + "grad_norm": 0.9759152384890787, + "learning_rate": 4.6420546581150665e-06, + "loss": 0.4073, + "step": 2530 + }, + { + "epoch": 1.712449255751015, + "grad_norm": 0.9932995343832387, + "learning_rate": 4.6381280376574836e-06, + "loss": 0.4033, + "step": 2531 + }, + { + "epoch": 1.713125845737483, + "grad_norm": 1.0010008985759689, + "learning_rate": 4.634201641541013e-06, + "loss": 0.4068, + "step": 2532 + }, + { + "epoch": 1.7138024357239514, + "grad_norm": 0.961355633082657, + "learning_rate": 4.630275472199805e-06, + "loss": 0.3874, + "step": 2533 + }, + { + "epoch": 1.7144790257104194, + "grad_norm": 0.9217728068074276, + "learning_rate": 4.626349532067879e-06, + "loss": 0.3934, + "step": 2534 + }, + { + "epoch": 1.7151556156968877, + "grad_norm": 0.976942632753318, + "learning_rate": 4.622423823579102e-06, + "loss": 0.3887, + "step": 2535 + }, + { + "epoch": 1.715832205683356, + "grad_norm": 1.015293084944598, + "learning_rate": 4.618498349167204e-06, + "loss": 0.4073, + "step": 2536 + }, + { + "epoch": 1.716508795669824, + "grad_norm": 0.9932608942785961, + "learning_rate": 4.6145731112657644e-06, + "loss": 0.411, + "step": 2537 + }, + { + "epoch": 1.7171853856562924, + "grad_norm": 0.9641347696128801, + "learning_rate": 4.610648112308221e-06, + "loss": 0.4009, + "step": 2538 + }, + { + "epoch": 1.7178619756427604, + "grad_norm": 0.9671623185500506, + "learning_rate": 4.6067233547278614e-06, + "loss": 0.3955, + "step": 2539 + }, + { + "epoch": 1.7185385656292287, + "grad_norm": 0.9321717137763164, + "learning_rate": 4.602798840957825e-06, + "loss": 0.3892, + "step": 2540 + }, + { + "epoch": 1.719215155615697, + "grad_norm": 0.9263826797640947, + "learning_rate": 4.598874573431097e-06, + "loss": 0.3874, + "step": 2541 + }, + { + "epoch": 1.719891745602165, + "grad_norm": 0.970645935808104, + "learning_rate": 4.594950554580512e-06, + "loss": 0.4122, + "step": 2542 + }, + { + "epoch": 1.7205683355886334, + "grad_norm": 0.9428103978257253, + "learning_rate": 4.5910267868387525e-06, + "loss": 0.3962, + "step": 2543 + }, + { + "epoch": 1.7212449255751014, + "grad_norm": 0.9054889673155659, + "learning_rate": 4.587103272638339e-06, + "loss": 0.3775, + "step": 2544 + }, + { + "epoch": 1.7219215155615697, + "grad_norm": 0.9552481048147414, + "learning_rate": 4.583180014411642e-06, + "loss": 0.3932, + "step": 2545 + }, + { + "epoch": 1.722598105548038, + "grad_norm": 0.9731394076224001, + "learning_rate": 4.579257014590869e-06, + "loss": 0.3829, + "step": 2546 + }, + { + "epoch": 1.723274695534506, + "grad_norm": 0.9975616552805409, + "learning_rate": 4.575334275608067e-06, + "loss": 0.4022, + "step": 2547 + }, + { + "epoch": 1.7239512855209744, + "grad_norm": 0.9729225738080103, + "learning_rate": 4.571411799895126e-06, + "loss": 0.3901, + "step": 2548 + }, + { + "epoch": 1.7246278755074425, + "grad_norm": 0.9797995715333138, + "learning_rate": 4.567489589883766e-06, + "loss": 0.4007, + "step": 2549 + }, + { + "epoch": 1.7253044654939107, + "grad_norm": 0.9844888604879379, + "learning_rate": 4.563567648005551e-06, + "loss": 0.3951, + "step": 2550 + }, + { + "epoch": 1.725981055480379, + "grad_norm": 0.9689950884325625, + "learning_rate": 4.559645976691868e-06, + "loss": 0.369, + "step": 2551 + }, + { + "epoch": 1.726657645466847, + "grad_norm": 0.9984196359505926, + "learning_rate": 4.5557245783739425e-06, + "loss": 0.4027, + "step": 2552 + }, + { + "epoch": 1.7273342354533154, + "grad_norm": 0.9805503226918398, + "learning_rate": 4.551803455482833e-06, + "loss": 0.4208, + "step": 2553 + }, + { + "epoch": 1.7280108254397835, + "grad_norm": 1.0077826768104767, + "learning_rate": 4.5478826104494225e-06, + "loss": 0.4092, + "step": 2554 + }, + { + "epoch": 1.7286874154262517, + "grad_norm": 0.9510535599740801, + "learning_rate": 4.543962045704424e-06, + "loss": 0.3783, + "step": 2555 + }, + { + "epoch": 1.72936400541272, + "grad_norm": 0.9902279326458776, + "learning_rate": 4.540041763678377e-06, + "loss": 0.409, + "step": 2556 + }, + { + "epoch": 1.730040595399188, + "grad_norm": 0.9746722082380599, + "learning_rate": 4.536121766801645e-06, + "loss": 0.3987, + "step": 2557 + }, + { + "epoch": 1.7307171853856564, + "grad_norm": 0.9550626556260181, + "learning_rate": 4.532202057504412e-06, + "loss": 0.4047, + "step": 2558 + }, + { + "epoch": 1.7313937753721245, + "grad_norm": 0.9969625364693467, + "learning_rate": 4.528282638216689e-06, + "loss": 0.4134, + "step": 2559 + }, + { + "epoch": 1.7320703653585927, + "grad_norm": 0.9720163552146841, + "learning_rate": 4.524363511368304e-06, + "loss": 0.3854, + "step": 2560 + }, + { + "epoch": 1.732746955345061, + "grad_norm": 0.999014438323783, + "learning_rate": 4.520444679388906e-06, + "loss": 0.3974, + "step": 2561 + }, + { + "epoch": 1.733423545331529, + "grad_norm": 0.953774151841673, + "learning_rate": 4.516526144707957e-06, + "loss": 0.3794, + "step": 2562 + }, + { + "epoch": 1.7341001353179974, + "grad_norm": 0.9454426187396396, + "learning_rate": 4.512607909754741e-06, + "loss": 0.3867, + "step": 2563 + }, + { + "epoch": 1.7347767253044655, + "grad_norm": 0.9992784563132288, + "learning_rate": 4.508689976958348e-06, + "loss": 0.4067, + "step": 2564 + }, + { + "epoch": 1.7354533152909337, + "grad_norm": 0.958222621957998, + "learning_rate": 4.504772348747687e-06, + "loss": 0.3937, + "step": 2565 + }, + { + "epoch": 1.736129905277402, + "grad_norm": 1.0209496378822407, + "learning_rate": 4.500855027551477e-06, + "loss": 0.4102, + "step": 2566 + }, + { + "epoch": 1.73680649526387, + "grad_norm": 0.9895744674110816, + "learning_rate": 4.496938015798246e-06, + "loss": 0.3941, + "step": 2567 + }, + { + "epoch": 1.7374830852503385, + "grad_norm": 0.946289958972408, + "learning_rate": 4.493021315916328e-06, + "loss": 0.3786, + "step": 2568 + }, + { + "epoch": 1.7381596752368065, + "grad_norm": 0.9676733990694013, + "learning_rate": 4.48910493033387e-06, + "loss": 0.3799, + "step": 2569 + }, + { + "epoch": 1.7388362652232747, + "grad_norm": 0.904426391775157, + "learning_rate": 4.485188861478817e-06, + "loss": 0.3675, + "step": 2570 + }, + { + "epoch": 1.739512855209743, + "grad_norm": 0.9186497123107233, + "learning_rate": 4.481273111778919e-06, + "loss": 0.3873, + "step": 2571 + }, + { + "epoch": 1.740189445196211, + "grad_norm": 0.9458210866205528, + "learning_rate": 4.477357683661734e-06, + "loss": 0.4012, + "step": 2572 + }, + { + "epoch": 1.7408660351826795, + "grad_norm": 0.9594112673318085, + "learning_rate": 4.473442579554612e-06, + "loss": 0.3978, + "step": 2573 + }, + { + "epoch": 1.7415426251691475, + "grad_norm": 1.001722672912692, + "learning_rate": 4.46952780188471e-06, + "loss": 0.4078, + "step": 2574 + }, + { + "epoch": 1.7422192151556157, + "grad_norm": 0.9942070997910358, + "learning_rate": 4.465613353078978e-06, + "loss": 0.4187, + "step": 2575 + }, + { + "epoch": 1.742895805142084, + "grad_norm": 0.9257521647079752, + "learning_rate": 4.461699235564164e-06, + "loss": 0.3893, + "step": 2576 + }, + { + "epoch": 1.743572395128552, + "grad_norm": 0.9908152539375658, + "learning_rate": 4.457785451766808e-06, + "loss": 0.4008, + "step": 2577 + }, + { + "epoch": 1.7442489851150202, + "grad_norm": 1.0562975524693239, + "learning_rate": 4.453872004113247e-06, + "loss": 0.3936, + "step": 2578 + }, + { + "epoch": 1.7449255751014885, + "grad_norm": 0.9301518609938227, + "learning_rate": 4.449958895029604e-06, + "loss": 0.3727, + "step": 2579 + }, + { + "epoch": 1.7456021650879567, + "grad_norm": 0.9508606126679935, + "learning_rate": 4.446046126941801e-06, + "loss": 0.3863, + "step": 2580 + }, + { + "epoch": 1.746278755074425, + "grad_norm": 0.9848562022486078, + "learning_rate": 4.442133702275539e-06, + "loss": 0.4021, + "step": 2581 + }, + { + "epoch": 1.746955345060893, + "grad_norm": 0.9995111740140712, + "learning_rate": 4.438221623456312e-06, + "loss": 0.3918, + "step": 2582 + }, + { + "epoch": 1.7476319350473613, + "grad_norm": 1.0064645844396989, + "learning_rate": 4.4343098929094e-06, + "loss": 0.4127, + "step": 2583 + }, + { + "epoch": 1.7483085250338295, + "grad_norm": 0.9872433886775531, + "learning_rate": 4.4303985130598615e-06, + "loss": 0.3964, + "step": 2584 + }, + { + "epoch": 1.7489851150202977, + "grad_norm": 0.9802756173637774, + "learning_rate": 4.426487486332544e-06, + "loss": 0.4066, + "step": 2585 + }, + { + "epoch": 1.749661705006766, + "grad_norm": 0.9557454752543781, + "learning_rate": 4.42257681515207e-06, + "loss": 0.3957, + "step": 2586 + }, + { + "epoch": 1.750338294993234, + "grad_norm": 0.9189226504260087, + "learning_rate": 4.4186665019428485e-06, + "loss": 0.3829, + "step": 2587 + }, + { + "epoch": 1.7510148849797023, + "grad_norm": 0.9879276621095745, + "learning_rate": 4.41475654912906e-06, + "loss": 0.4164, + "step": 2588 + }, + { + "epoch": 1.7516914749661705, + "grad_norm": 0.9525601118399755, + "learning_rate": 4.410846959134667e-06, + "loss": 0.3764, + "step": 2589 + }, + { + "epoch": 1.7523680649526387, + "grad_norm": 1.0173347974012619, + "learning_rate": 4.406937734383405e-06, + "loss": 0.4016, + "step": 2590 + }, + { + "epoch": 1.753044654939107, + "grad_norm": 0.9863231506254914, + "learning_rate": 4.4030288772987795e-06, + "loss": 0.3904, + "step": 2591 + }, + { + "epoch": 1.753721244925575, + "grad_norm": 0.9614479440117635, + "learning_rate": 4.399120390304072e-06, + "loss": 0.3959, + "step": 2592 + }, + { + "epoch": 1.7543978349120433, + "grad_norm": 0.9263515045538261, + "learning_rate": 4.395212275822336e-06, + "loss": 0.3921, + "step": 2593 + }, + { + "epoch": 1.7550744248985115, + "grad_norm": 0.9319742115582491, + "learning_rate": 4.391304536276389e-06, + "loss": 0.3662, + "step": 2594 + }, + { + "epoch": 1.7557510148849798, + "grad_norm": 0.9971421392739953, + "learning_rate": 4.3873971740888205e-06, + "loss": 0.39, + "step": 2595 + }, + { + "epoch": 1.756427604871448, + "grad_norm": 0.9510498106724605, + "learning_rate": 4.383490191681985e-06, + "loss": 0.3749, + "step": 2596 + }, + { + "epoch": 1.757104194857916, + "grad_norm": 1.0002185766225264, + "learning_rate": 4.379583591477999e-06, + "loss": 0.4079, + "step": 2597 + }, + { + "epoch": 1.7577807848443843, + "grad_norm": 1.0015549185137866, + "learning_rate": 4.375677375898746e-06, + "loss": 0.4016, + "step": 2598 + }, + { + "epoch": 1.7584573748308525, + "grad_norm": 1.0650451438844017, + "learning_rate": 4.371771547365869e-06, + "loss": 0.4166, + "step": 2599 + }, + { + "epoch": 1.7591339648173205, + "grad_norm": 1.026298144196455, + "learning_rate": 4.367866108300769e-06, + "loss": 0.4091, + "step": 2600 + }, + { + "epoch": 1.759810554803789, + "grad_norm": 0.9603534848580563, + "learning_rate": 4.3639610611246106e-06, + "loss": 0.3858, + "step": 2601 + }, + { + "epoch": 1.760487144790257, + "grad_norm": 0.949781912855817, + "learning_rate": 4.36005640825831e-06, + "loss": 0.3845, + "step": 2602 + }, + { + "epoch": 1.7611637347767253, + "grad_norm": 0.9796734545820398, + "learning_rate": 4.3561521521225445e-06, + "loss": 0.388, + "step": 2603 + }, + { + "epoch": 1.7618403247631935, + "grad_norm": 0.966494254091051, + "learning_rate": 4.352248295137739e-06, + "loss": 0.4071, + "step": 2604 + }, + { + "epoch": 1.7625169147496615, + "grad_norm": 0.9506188669527146, + "learning_rate": 4.348344839724076e-06, + "loss": 0.4017, + "step": 2605 + }, + { + "epoch": 1.76319350473613, + "grad_norm": 0.94332338237239, + "learning_rate": 4.3444417883014885e-06, + "loss": 0.3781, + "step": 2606 + }, + { + "epoch": 1.763870094722598, + "grad_norm": 0.9593292089830917, + "learning_rate": 4.340539143289655e-06, + "loss": 0.3844, + "step": 2607 + }, + { + "epoch": 1.7645466847090663, + "grad_norm": 0.9654001859757949, + "learning_rate": 4.33663690710801e-06, + "loss": 0.3958, + "step": 2608 + }, + { + "epoch": 1.7652232746955345, + "grad_norm": 0.9567085243098365, + "learning_rate": 4.332735082175724e-06, + "loss": 0.3908, + "step": 2609 + }, + { + "epoch": 1.7658998646820026, + "grad_norm": 1.0020124880525525, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.4193, + "step": 2610 + }, + { + "epoch": 1.766576454668471, + "grad_norm": 0.9887964853614057, + "learning_rate": 4.32493267573467e-06, + "loss": 0.391, + "step": 2611 + }, + { + "epoch": 1.767253044654939, + "grad_norm": 0.985615480192359, + "learning_rate": 4.3210320990629696e-06, + "loss": 0.3904, + "step": 2612 + }, + { + "epoch": 1.7679296346414073, + "grad_norm": 0.9480404220784492, + "learning_rate": 4.31713194331477e-06, + "loss": 0.3927, + "step": 2613 + }, + { + "epoch": 1.7686062246278755, + "grad_norm": 0.9404850542944785, + "learning_rate": 4.313232210907959e-06, + "loss": 0.3786, + "step": 2614 + }, + { + "epoch": 1.7692828146143436, + "grad_norm": 0.9429814884624229, + "learning_rate": 4.30933290426016e-06, + "loss": 0.378, + "step": 2615 + }, + { + "epoch": 1.769959404600812, + "grad_norm": 0.9777368455535926, + "learning_rate": 4.305434025788735e-06, + "loss": 0.4096, + "step": 2616 + }, + { + "epoch": 1.77063599458728, + "grad_norm": 1.0115823585875652, + "learning_rate": 4.301535577910774e-06, + "loss": 0.4199, + "step": 2617 + }, + { + "epoch": 1.7713125845737483, + "grad_norm": 0.9776174108039173, + "learning_rate": 4.297637563043106e-06, + "loss": 0.4014, + "step": 2618 + }, + { + "epoch": 1.7719891745602165, + "grad_norm": 0.9286579482031964, + "learning_rate": 4.293739983602292e-06, + "loss": 0.3726, + "step": 2619 + }, + { + "epoch": 1.7726657645466846, + "grad_norm": 0.9537553847089697, + "learning_rate": 4.28984284200462e-06, + "loss": 0.3854, + "step": 2620 + }, + { + "epoch": 1.773342354533153, + "grad_norm": 0.9546262623135069, + "learning_rate": 4.285946140666107e-06, + "loss": 0.4035, + "step": 2621 + }, + { + "epoch": 1.774018944519621, + "grad_norm": 1.0004135073921654, + "learning_rate": 4.282049882002499e-06, + "loss": 0.3859, + "step": 2622 + }, + { + "epoch": 1.7746955345060893, + "grad_norm": 1.0216652599147376, + "learning_rate": 4.278154068429268e-06, + "loss": 0.4145, + "step": 2623 + }, + { + "epoch": 1.7753721244925575, + "grad_norm": 0.9577692175618054, + "learning_rate": 4.274258702361604e-06, + "loss": 0.3813, + "step": 2624 + }, + { + "epoch": 1.7760487144790256, + "grad_norm": 0.9802380852400819, + "learning_rate": 4.270363786214427e-06, + "loss": 0.4077, + "step": 2625 + }, + { + "epoch": 1.776725304465494, + "grad_norm": 0.9665671248211338, + "learning_rate": 4.266469322402374e-06, + "loss": 0.3959, + "step": 2626 + }, + { + "epoch": 1.777401894451962, + "grad_norm": 0.9309997043303668, + "learning_rate": 4.2625753133398036e-06, + "loss": 0.3792, + "step": 2627 + }, + { + "epoch": 1.7780784844384303, + "grad_norm": 0.9657798512939555, + "learning_rate": 4.25868176144079e-06, + "loss": 0.3952, + "step": 2628 + }, + { + "epoch": 1.7787550744248986, + "grad_norm": 0.9700576818111204, + "learning_rate": 4.254788669119127e-06, + "loss": 0.3886, + "step": 2629 + }, + { + "epoch": 1.7794316644113666, + "grad_norm": 1.03212159402125, + "learning_rate": 4.250896038788324e-06, + "loss": 0.4208, + "step": 2630 + }, + { + "epoch": 1.780108254397835, + "grad_norm": 0.9616286968417425, + "learning_rate": 4.247003872861598e-06, + "loss": 0.3997, + "step": 2631 + }, + { + "epoch": 1.780784844384303, + "grad_norm": 0.9258081970944867, + "learning_rate": 4.2431121737518824e-06, + "loss": 0.3705, + "step": 2632 + }, + { + "epoch": 1.7814614343707713, + "grad_norm": 0.9676995922370372, + "learning_rate": 4.239220943871823e-06, + "loss": 0.3885, + "step": 2633 + }, + { + "epoch": 1.7821380243572396, + "grad_norm": 0.9676763528747013, + "learning_rate": 4.23533018563377e-06, + "loss": 0.3937, + "step": 2634 + }, + { + "epoch": 1.7828146143437076, + "grad_norm": 0.9807034187447221, + "learning_rate": 4.231439901449788e-06, + "loss": 0.3983, + "step": 2635 + }, + { + "epoch": 1.783491204330176, + "grad_norm": 0.948920942161716, + "learning_rate": 4.227550093731641e-06, + "loss": 0.3936, + "step": 2636 + }, + { + "epoch": 1.784167794316644, + "grad_norm": 0.9647084658527924, + "learning_rate": 4.223660764890799e-06, + "loss": 0.3724, + "step": 2637 + }, + { + "epoch": 1.7848443843031123, + "grad_norm": 0.9664239389181164, + "learning_rate": 4.2197719173384374e-06, + "loss": 0.3955, + "step": 2638 + }, + { + "epoch": 1.7855209742895806, + "grad_norm": 0.9278347339777552, + "learning_rate": 4.215883553485431e-06, + "loss": 0.392, + "step": 2639 + }, + { + "epoch": 1.7861975642760486, + "grad_norm": 1.0060101602339717, + "learning_rate": 4.211995675742358e-06, + "loss": 0.4117, + "step": 2640 + }, + { + "epoch": 1.786874154262517, + "grad_norm": 1.0135331058750892, + "learning_rate": 4.208108286519491e-06, + "loss": 0.4075, + "step": 2641 + }, + { + "epoch": 1.787550744248985, + "grad_norm": 1.0396604216111962, + "learning_rate": 4.204221388226803e-06, + "loss": 0.4108, + "step": 2642 + }, + { + "epoch": 1.7882273342354533, + "grad_norm": 0.9397392360088378, + "learning_rate": 4.2003349832739624e-06, + "loss": 0.3936, + "step": 2643 + }, + { + "epoch": 1.7889039242219216, + "grad_norm": 0.9390935068668829, + "learning_rate": 4.196449074070329e-06, + "loss": 0.3768, + "step": 2644 + }, + { + "epoch": 1.7895805142083896, + "grad_norm": 0.961629240316789, + "learning_rate": 4.1925636630249565e-06, + "loss": 0.3868, + "step": 2645 + }, + { + "epoch": 1.790257104194858, + "grad_norm": 0.9693878022246628, + "learning_rate": 4.1886787525465914e-06, + "loss": 0.3923, + "step": 2646 + }, + { + "epoch": 1.790933694181326, + "grad_norm": 0.9788227794864665, + "learning_rate": 4.184794345043668e-06, + "loss": 0.3955, + "step": 2647 + }, + { + "epoch": 1.7916102841677943, + "grad_norm": 1.0285734374589028, + "learning_rate": 4.180910442924312e-06, + "loss": 0.4248, + "step": 2648 + }, + { + "epoch": 1.7922868741542626, + "grad_norm": 0.9516476002575738, + "learning_rate": 4.17702704859633e-06, + "loss": 0.3742, + "step": 2649 + }, + { + "epoch": 1.7929634641407306, + "grad_norm": 0.9234015190624797, + "learning_rate": 4.173144164467221e-06, + "loss": 0.3924, + "step": 2650 + }, + { + "epoch": 1.793640054127199, + "grad_norm": 1.006276307711603, + "learning_rate": 4.169261792944161e-06, + "loss": 0.4126, + "step": 2651 + }, + { + "epoch": 1.794316644113667, + "grad_norm": 0.9489648526139915, + "learning_rate": 4.165379936434011e-06, + "loss": 0.3786, + "step": 2652 + }, + { + "epoch": 1.7949932341001353, + "grad_norm": 0.9669686659997468, + "learning_rate": 4.161498597343313e-06, + "loss": 0.4056, + "step": 2653 + }, + { + "epoch": 1.7956698240866036, + "grad_norm": 0.9658512544044849, + "learning_rate": 4.15761777807829e-06, + "loss": 0.3727, + "step": 2654 + }, + { + "epoch": 1.7963464140730716, + "grad_norm": 0.9627151248354098, + "learning_rate": 4.153737481044838e-06, + "loss": 0.3811, + "step": 2655 + }, + { + "epoch": 1.79702300405954, + "grad_norm": 0.9926586492534942, + "learning_rate": 4.149857708648536e-06, + "loss": 0.4075, + "step": 2656 + }, + { + "epoch": 1.797699594046008, + "grad_norm": 0.9776992890715819, + "learning_rate": 4.1459784632946295e-06, + "loss": 0.3965, + "step": 2657 + }, + { + "epoch": 1.7983761840324763, + "grad_norm": 0.9615659408473649, + "learning_rate": 4.142099747388042e-06, + "loss": 0.4024, + "step": 2658 + }, + { + "epoch": 1.7990527740189446, + "grad_norm": 0.98622655383633, + "learning_rate": 4.138221563333371e-06, + "loss": 0.3889, + "step": 2659 + }, + { + "epoch": 1.7997293640054126, + "grad_norm": 0.9563767130211177, + "learning_rate": 4.134343913534879e-06, + "loss": 0.386, + "step": 2660 + }, + { + "epoch": 1.800405953991881, + "grad_norm": 0.9234774059435488, + "learning_rate": 4.1304668003965016e-06, + "loss": 0.3831, + "step": 2661 + }, + { + "epoch": 1.801082543978349, + "grad_norm": 1.0327608879236543, + "learning_rate": 4.126590226321838e-06, + "loss": 0.4124, + "step": 2662 + }, + { + "epoch": 1.8017591339648173, + "grad_norm": 0.9670260073711607, + "learning_rate": 4.12271419371416e-06, + "loss": 0.3965, + "step": 2663 + }, + { + "epoch": 1.8024357239512856, + "grad_norm": 0.9812009881153325, + "learning_rate": 4.118838704976392e-06, + "loss": 0.3781, + "step": 2664 + }, + { + "epoch": 1.8031123139377536, + "grad_norm": 0.9201699169620586, + "learning_rate": 4.114963762511134e-06, + "loss": 0.3799, + "step": 2665 + }, + { + "epoch": 1.803788903924222, + "grad_norm": 0.9953322972521901, + "learning_rate": 4.111089368720635e-06, + "loss": 0.4127, + "step": 2666 + }, + { + "epoch": 1.80446549391069, + "grad_norm": 0.9507164657537404, + "learning_rate": 4.107215526006818e-06, + "loss": 0.3826, + "step": 2667 + }, + { + "epoch": 1.8051420838971584, + "grad_norm": 0.9893140146578959, + "learning_rate": 4.10334223677125e-06, + "loss": 0.4072, + "step": 2668 + }, + { + "epoch": 1.8058186738836266, + "grad_norm": 0.9879521587844674, + "learning_rate": 4.099469503415167e-06, + "loss": 0.4058, + "step": 2669 + }, + { + "epoch": 1.8064952638700946, + "grad_norm": 0.9442173163374029, + "learning_rate": 4.0955973283394525e-06, + "loss": 0.3842, + "step": 2670 + }, + { + "epoch": 1.8071718538565629, + "grad_norm": 1.0009768172003903, + "learning_rate": 4.091725713944644e-06, + "loss": 0.4018, + "step": 2671 + }, + { + "epoch": 1.8078484438430311, + "grad_norm": 0.9331511419632416, + "learning_rate": 4.087854662630937e-06, + "loss": 0.3717, + "step": 2672 + }, + { + "epoch": 1.8085250338294994, + "grad_norm": 0.986564902343268, + "learning_rate": 4.083984176798175e-06, + "loss": 0.3869, + "step": 2673 + }, + { + "epoch": 1.8092016238159676, + "grad_norm": 0.9985087496075559, + "learning_rate": 4.080114258845846e-06, + "loss": 0.4135, + "step": 2674 + }, + { + "epoch": 1.8098782138024356, + "grad_norm": 0.9826142874991449, + "learning_rate": 4.076244911173097e-06, + "loss": 0.4002, + "step": 2675 + }, + { + "epoch": 1.8105548037889039, + "grad_norm": 1.0101246168508613, + "learning_rate": 4.072376136178712e-06, + "loss": 0.4208, + "step": 2676 + }, + { + "epoch": 1.8112313937753721, + "grad_norm": 0.982051469432822, + "learning_rate": 4.06850793626112e-06, + "loss": 0.3806, + "step": 2677 + }, + { + "epoch": 1.8119079837618404, + "grad_norm": 1.0006965108244374, + "learning_rate": 4.064640313818401e-06, + "loss": 0.4249, + "step": 2678 + }, + { + "epoch": 1.8125845737483086, + "grad_norm": 1.02113807045358, + "learning_rate": 4.06077327124827e-06, + "loss": 0.3953, + "step": 2679 + }, + { + "epoch": 1.8132611637347766, + "grad_norm": 0.9188533856662646, + "learning_rate": 4.056906810948086e-06, + "loss": 0.3844, + "step": 2680 + }, + { + "epoch": 1.8139377537212449, + "grad_norm": 0.9823063149835631, + "learning_rate": 4.053040935314845e-06, + "loss": 0.392, + "step": 2681 + }, + { + "epoch": 1.8146143437077131, + "grad_norm": 0.9586090895924663, + "learning_rate": 4.049175646745182e-06, + "loss": 0.4027, + "step": 2682 + }, + { + "epoch": 1.8152909336941814, + "grad_norm": 0.9210304149156828, + "learning_rate": 4.045310947635369e-06, + "loss": 0.3691, + "step": 2683 + }, + { + "epoch": 1.8159675236806496, + "grad_norm": 1.004792893723896, + "learning_rate": 4.041446840381309e-06, + "loss": 0.3948, + "step": 2684 + }, + { + "epoch": 1.8166441136671176, + "grad_norm": 0.9640420720464306, + "learning_rate": 4.03758332737854e-06, + "loss": 0.3885, + "step": 2685 + }, + { + "epoch": 1.817320703653586, + "grad_norm": 0.9761056105847574, + "learning_rate": 4.033720411022235e-06, + "loss": 0.4018, + "step": 2686 + }, + { + "epoch": 1.8179972936400541, + "grad_norm": 0.9940739219726901, + "learning_rate": 4.02985809370719e-06, + "loss": 0.399, + "step": 2687 + }, + { + "epoch": 1.8186738836265224, + "grad_norm": 0.9297981262839745, + "learning_rate": 4.025996377827836e-06, + "loss": 0.3831, + "step": 2688 + }, + { + "epoch": 1.8193504736129906, + "grad_norm": 0.9538154269143837, + "learning_rate": 4.022135265778226e-06, + "loss": 0.379, + "step": 2689 + }, + { + "epoch": 1.8200270635994586, + "grad_norm": 0.9263921989936219, + "learning_rate": 4.018274759952047e-06, + "loss": 0.4042, + "step": 2690 + }, + { + "epoch": 1.820703653585927, + "grad_norm": 0.977585672815, + "learning_rate": 4.0144148627426e-06, + "loss": 0.3972, + "step": 2691 + }, + { + "epoch": 1.8213802435723951, + "grad_norm": 0.9605042438747269, + "learning_rate": 4.010555576542812e-06, + "loss": 0.4013, + "step": 2692 + }, + { + "epoch": 1.8220568335588632, + "grad_norm": 0.9462577925101531, + "learning_rate": 4.006696903745236e-06, + "loss": 0.3864, + "step": 2693 + }, + { + "epoch": 1.8227334235453316, + "grad_norm": 0.9587643452720432, + "learning_rate": 4.002838846742039e-06, + "loss": 0.3914, + "step": 2694 + }, + { + "epoch": 1.8234100135317997, + "grad_norm": 0.9498816356715988, + "learning_rate": 3.998981407925009e-06, + "loss": 0.3879, + "step": 2695 + }, + { + "epoch": 1.824086603518268, + "grad_norm": 0.920748654494678, + "learning_rate": 3.995124589685552e-06, + "loss": 0.3731, + "step": 2696 + }, + { + "epoch": 1.8247631935047361, + "grad_norm": 0.9648858745318133, + "learning_rate": 3.991268394414685e-06, + "loss": 0.3941, + "step": 2697 + }, + { + "epoch": 1.8254397834912042, + "grad_norm": 0.9301512330105254, + "learning_rate": 3.987412824503041e-06, + "loss": 0.3734, + "step": 2698 + }, + { + "epoch": 1.8261163734776726, + "grad_norm": 0.9844036475247598, + "learning_rate": 3.983557882340866e-06, + "loss": 0.3926, + "step": 2699 + }, + { + "epoch": 1.8267929634641407, + "grad_norm": 0.9906461597560315, + "learning_rate": 3.979703570318017e-06, + "loss": 0.4173, + "step": 2700 + }, + { + "epoch": 1.827469553450609, + "grad_norm": 0.9505497224356612, + "learning_rate": 3.97584989082396e-06, + "loss": 0.3916, + "step": 2701 + }, + { + "epoch": 1.8281461434370772, + "grad_norm": 0.9359231694932668, + "learning_rate": 3.971996846247767e-06, + "loss": 0.3939, + "step": 2702 + }, + { + "epoch": 1.8288227334235452, + "grad_norm": 0.9399633161213121, + "learning_rate": 3.968144438978121e-06, + "loss": 0.3742, + "step": 2703 + }, + { + "epoch": 1.8294993234100136, + "grad_norm": 0.9549451444962479, + "learning_rate": 3.964292671403303e-06, + "loss": 0.4075, + "step": 2704 + }, + { + "epoch": 1.8301759133964817, + "grad_norm": 0.9618288042986526, + "learning_rate": 3.960441545911205e-06, + "loss": 0.3884, + "step": 2705 + }, + { + "epoch": 1.83085250338295, + "grad_norm": 0.9383453073080402, + "learning_rate": 3.956591064889313e-06, + "loss": 0.3832, + "step": 2706 + }, + { + "epoch": 1.8315290933694182, + "grad_norm": 0.9367145772648151, + "learning_rate": 3.952741230724721e-06, + "loss": 0.3816, + "step": 2707 + }, + { + "epoch": 1.8322056833558862, + "grad_norm": 0.9654028112278012, + "learning_rate": 3.948892045804117e-06, + "loss": 0.3713, + "step": 2708 + }, + { + "epoch": 1.8328822733423547, + "grad_norm": 0.9957421271193522, + "learning_rate": 3.94504351251379e-06, + "loss": 0.3944, + "step": 2709 + }, + { + "epoch": 1.8335588633288227, + "grad_norm": 0.9391522691304137, + "learning_rate": 3.9411956332396224e-06, + "loss": 0.3771, + "step": 2710 + }, + { + "epoch": 1.834235453315291, + "grad_norm": 0.9736197204389111, + "learning_rate": 3.937348410367091e-06, + "loss": 0.3816, + "step": 2711 + }, + { + "epoch": 1.8349120433017592, + "grad_norm": 1.0525202114093986, + "learning_rate": 3.9335018462812664e-06, + "loss": 0.4316, + "step": 2712 + }, + { + "epoch": 1.8355886332882272, + "grad_norm": 0.9756580087226838, + "learning_rate": 3.929655943366812e-06, + "loss": 0.3857, + "step": 2713 + }, + { + "epoch": 1.8362652232746957, + "grad_norm": 0.9336004222503018, + "learning_rate": 3.92581070400798e-06, + "loss": 0.3816, + "step": 2714 + }, + { + "epoch": 1.8369418132611637, + "grad_norm": 0.9880252843245524, + "learning_rate": 3.921966130588612e-06, + "loss": 0.3949, + "step": 2715 + }, + { + "epoch": 1.837618403247632, + "grad_norm": 0.950446373911663, + "learning_rate": 3.918122225492139e-06, + "loss": 0.3997, + "step": 2716 + }, + { + "epoch": 1.8382949932341002, + "grad_norm": 0.9621395005451292, + "learning_rate": 3.914278991101568e-06, + "loss": 0.393, + "step": 2717 + }, + { + "epoch": 1.8389715832205682, + "grad_norm": 0.9824936681078674, + "learning_rate": 3.910436429799503e-06, + "loss": 0.3995, + "step": 2718 + }, + { + "epoch": 1.8396481732070367, + "grad_norm": 0.9946873820346164, + "learning_rate": 3.906594543968122e-06, + "loss": 0.3998, + "step": 2719 + }, + { + "epoch": 1.8403247631935047, + "grad_norm": 0.9509829767988345, + "learning_rate": 3.902753335989188e-06, + "loss": 0.389, + "step": 2720 + }, + { + "epoch": 1.841001353179973, + "grad_norm": 0.9461778708114417, + "learning_rate": 3.898912808244043e-06, + "loss": 0.3846, + "step": 2721 + }, + { + "epoch": 1.8416779431664412, + "grad_norm": 0.9334507205472801, + "learning_rate": 3.895072963113607e-06, + "loss": 0.3867, + "step": 2722 + }, + { + "epoch": 1.8423545331529092, + "grad_norm": 0.9432282296290785, + "learning_rate": 3.89123380297838e-06, + "loss": 0.3915, + "step": 2723 + }, + { + "epoch": 1.8430311231393777, + "grad_norm": 0.9606854252233376, + "learning_rate": 3.887395330218429e-06, + "loss": 0.3812, + "step": 2724 + }, + { + "epoch": 1.8437077131258457, + "grad_norm": 0.9703721196340805, + "learning_rate": 3.883557547213404e-06, + "loss": 0.3897, + "step": 2725 + }, + { + "epoch": 1.844384303112314, + "grad_norm": 0.9642040443018232, + "learning_rate": 3.8797204563425215e-06, + "loss": 0.3937, + "step": 2726 + }, + { + "epoch": 1.8450608930987822, + "grad_norm": 0.9882315760969734, + "learning_rate": 3.875884059984571e-06, + "loss": 0.3948, + "step": 2727 + }, + { + "epoch": 1.8457374830852502, + "grad_norm": 0.9661486897118964, + "learning_rate": 3.872048360517914e-06, + "loss": 0.3937, + "step": 2728 + }, + { + "epoch": 1.8464140730717187, + "grad_norm": 0.9408857728057767, + "learning_rate": 3.868213360320474e-06, + "loss": 0.3589, + "step": 2729 + }, + { + "epoch": 1.8470906630581867, + "grad_norm": 0.9498115251523825, + "learning_rate": 3.864379061769749e-06, + "loss": 0.3772, + "step": 2730 + }, + { + "epoch": 1.847767253044655, + "grad_norm": 0.9391010920454833, + "learning_rate": 3.860545467242793e-06, + "loss": 0.3814, + "step": 2731 + }, + { + "epoch": 1.8484438430311232, + "grad_norm": 0.9602026878812513, + "learning_rate": 3.856712579116229e-06, + "loss": 0.3806, + "step": 2732 + }, + { + "epoch": 1.8491204330175912, + "grad_norm": 1.017718591836699, + "learning_rate": 3.852880399766243e-06, + "loss": 0.4049, + "step": 2733 + }, + { + "epoch": 1.8497970230040597, + "grad_norm": 0.9905362353007553, + "learning_rate": 3.8490489315685764e-06, + "loss": 0.3944, + "step": 2734 + }, + { + "epoch": 1.8504736129905277, + "grad_norm": 0.990721430207882, + "learning_rate": 3.845218176898537e-06, + "loss": 0.4101, + "step": 2735 + }, + { + "epoch": 1.851150202976996, + "grad_norm": 0.9000359867998304, + "learning_rate": 3.8413881381309845e-06, + "loss": 0.3726, + "step": 2736 + }, + { + "epoch": 1.8518267929634642, + "grad_norm": 0.9844985020788298, + "learning_rate": 3.837558817640334e-06, + "loss": 0.3995, + "step": 2737 + }, + { + "epoch": 1.8525033829499322, + "grad_norm": 0.896020116602737, + "learning_rate": 3.8337302178005605e-06, + "loss": 0.3756, + "step": 2738 + }, + { + "epoch": 1.8531799729364007, + "grad_norm": 0.9453510728604818, + "learning_rate": 3.829902340985189e-06, + "loss": 0.3845, + "step": 2739 + }, + { + "epoch": 1.8538565629228687, + "grad_norm": 1.0149339322894193, + "learning_rate": 3.826075189567296e-06, + "loss": 0.3954, + "step": 2740 + }, + { + "epoch": 1.854533152909337, + "grad_norm": 0.9369259685457049, + "learning_rate": 3.82224876591951e-06, + "loss": 0.3915, + "step": 2741 + }, + { + "epoch": 1.8552097428958052, + "grad_norm": 0.9924464907482013, + "learning_rate": 3.818423072414007e-06, + "loss": 0.4011, + "step": 2742 + }, + { + "epoch": 1.8558863328822732, + "grad_norm": 0.9844163643599041, + "learning_rate": 3.8145981114225135e-06, + "loss": 0.3947, + "step": 2743 + }, + { + "epoch": 1.8565629228687417, + "grad_norm": 0.990797930854464, + "learning_rate": 3.8107738853162953e-06, + "loss": 0.3878, + "step": 2744 + }, + { + "epoch": 1.8572395128552097, + "grad_norm": 0.9848806184867611, + "learning_rate": 3.8069503964661656e-06, + "loss": 0.3891, + "step": 2745 + }, + { + "epoch": 1.857916102841678, + "grad_norm": 0.9747597649510048, + "learning_rate": 3.803127647242486e-06, + "loss": 0.3919, + "step": 2746 + }, + { + "epoch": 1.8585926928281462, + "grad_norm": 0.9035277076684101, + "learning_rate": 3.7993056400151516e-06, + "loss": 0.3629, + "step": 2747 + }, + { + "epoch": 1.8592692828146142, + "grad_norm": 0.9818462370713226, + "learning_rate": 3.795484377153601e-06, + "loss": 0.3951, + "step": 2748 + }, + { + "epoch": 1.8599458728010827, + "grad_norm": 0.9477913833635639, + "learning_rate": 3.791663861026814e-06, + "loss": 0.3945, + "step": 2749 + }, + { + "epoch": 1.8606224627875507, + "grad_norm": 0.9145189752205615, + "learning_rate": 3.787844094003302e-06, + "loss": 0.3741, + "step": 2750 + }, + { + "epoch": 1.861299052774019, + "grad_norm": 0.9236608243752011, + "learning_rate": 3.7840250784511147e-06, + "loss": 0.387, + "step": 2751 + }, + { + "epoch": 1.8619756427604872, + "grad_norm": 0.9938022598784646, + "learning_rate": 3.780206816737837e-06, + "loss": 0.4062, + "step": 2752 + }, + { + "epoch": 1.8626522327469552, + "grad_norm": 0.9461125538937857, + "learning_rate": 3.776389311230584e-06, + "loss": 0.3759, + "step": 2753 + }, + { + "epoch": 1.8633288227334237, + "grad_norm": 0.9787522934889943, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.3842, + "step": 2754 + }, + { + "epoch": 1.8640054127198917, + "grad_norm": 0.9653692368265662, + "learning_rate": 3.7687565783002754e-06, + "loss": 0.3947, + "step": 2755 + }, + { + "epoch": 1.86468200270636, + "grad_norm": 1.0032055690109953, + "learning_rate": 3.7649413556091047e-06, + "loss": 0.3941, + "step": 2756 + }, + { + "epoch": 1.8653585926928282, + "grad_norm": 0.980333510274981, + "learning_rate": 3.7611268985877213e-06, + "loss": 0.3968, + "step": 2757 + }, + { + "epoch": 1.8660351826792962, + "grad_norm": 0.9941678857876616, + "learning_rate": 3.7573132096008843e-06, + "loss": 0.3887, + "step": 2758 + }, + { + "epoch": 1.8667117726657647, + "grad_norm": 0.9909540797694224, + "learning_rate": 3.753500291012874e-06, + "loss": 0.3804, + "step": 2759 + }, + { + "epoch": 1.8673883626522327, + "grad_norm": 0.9381334042245338, + "learning_rate": 3.749688145187497e-06, + "loss": 0.3924, + "step": 2760 + }, + { + "epoch": 1.868064952638701, + "grad_norm": 1.0047585975417583, + "learning_rate": 3.7458767744880763e-06, + "loss": 0.3909, + "step": 2761 + }, + { + "epoch": 1.8687415426251692, + "grad_norm": 0.9883657107026129, + "learning_rate": 3.7420661812774577e-06, + "loss": 0.3899, + "step": 2762 + }, + { + "epoch": 1.8694181326116373, + "grad_norm": 1.0175792405139148, + "learning_rate": 3.738256367918004e-06, + "loss": 0.3928, + "step": 2763 + }, + { + "epoch": 1.8700947225981055, + "grad_norm": 0.9411438913245966, + "learning_rate": 3.734447336771591e-06, + "loss": 0.3717, + "step": 2764 + }, + { + "epoch": 1.8707713125845737, + "grad_norm": 0.9939326799564947, + "learning_rate": 3.730639090199616e-06, + "loss": 0.3985, + "step": 2765 + }, + { + "epoch": 1.871447902571042, + "grad_norm": 1.0061801509678485, + "learning_rate": 3.7268316305629836e-06, + "loss": 0.3993, + "step": 2766 + }, + { + "epoch": 1.8721244925575102, + "grad_norm": 0.9492586830061073, + "learning_rate": 3.7230249602221163e-06, + "loss": 0.3708, + "step": 2767 + }, + { + "epoch": 1.8728010825439783, + "grad_norm": 0.9653781636902337, + "learning_rate": 3.719219081536942e-06, + "loss": 0.3783, + "step": 2768 + }, + { + "epoch": 1.8734776725304465, + "grad_norm": 0.9620951517242323, + "learning_rate": 3.7154139968669043e-06, + "loss": 0.4085, + "step": 2769 + }, + { + "epoch": 1.8741542625169147, + "grad_norm": 0.9590883410512174, + "learning_rate": 3.711609708570948e-06, + "loss": 0.3918, + "step": 2770 + }, + { + "epoch": 1.874830852503383, + "grad_norm": 0.9876900472948228, + "learning_rate": 3.7078062190075264e-06, + "loss": 0.4055, + "step": 2771 + }, + { + "epoch": 1.8755074424898512, + "grad_norm": 0.961043824549599, + "learning_rate": 3.704003530534597e-06, + "loss": 0.3919, + "step": 2772 + }, + { + "epoch": 1.8761840324763193, + "grad_norm": 0.9505863236459713, + "learning_rate": 3.7002016455096247e-06, + "loss": 0.3778, + "step": 2773 + }, + { + "epoch": 1.8768606224627875, + "grad_norm": 0.974330024617366, + "learning_rate": 3.696400566289571e-06, + "loss": 0.3891, + "step": 2774 + }, + { + "epoch": 1.8775372124492558, + "grad_norm": 0.9519792071742638, + "learning_rate": 3.6926002952309015e-06, + "loss": 0.3826, + "step": 2775 + }, + { + "epoch": 1.878213802435724, + "grad_norm": 0.9790413464789157, + "learning_rate": 3.6888008346895797e-06, + "loss": 0.3984, + "step": 2776 + }, + { + "epoch": 1.8788903924221922, + "grad_norm": 0.9446688806491784, + "learning_rate": 3.685002187021064e-06, + "loss": 0.3953, + "step": 2777 + }, + { + "epoch": 1.8795669824086603, + "grad_norm": 0.9755760917432024, + "learning_rate": 3.681204354580313e-06, + "loss": 0.4098, + "step": 2778 + }, + { + "epoch": 1.8802435723951285, + "grad_norm": 0.9818430663351374, + "learning_rate": 3.6774073397217786e-06, + "loss": 0.4019, + "step": 2779 + }, + { + "epoch": 1.8809201623815968, + "grad_norm": 0.9552511405776277, + "learning_rate": 3.6736111447994026e-06, + "loss": 0.4035, + "step": 2780 + }, + { + "epoch": 1.881596752368065, + "grad_norm": 0.9552607028054969, + "learning_rate": 3.669815772166625e-06, + "loss": 0.3849, + "step": 2781 + }, + { + "epoch": 1.8822733423545333, + "grad_norm": 0.9536352716432847, + "learning_rate": 3.6660212241763692e-06, + "loss": 0.3911, + "step": 2782 + }, + { + "epoch": 1.8829499323410013, + "grad_norm": 0.9921563560458461, + "learning_rate": 3.662227503181054e-06, + "loss": 0.4034, + "step": 2783 + }, + { + "epoch": 1.8836265223274695, + "grad_norm": 1.0185812571841373, + "learning_rate": 3.658434611532578e-06, + "loss": 0.3915, + "step": 2784 + }, + { + "epoch": 1.8843031123139378, + "grad_norm": 1.0241495689199167, + "learning_rate": 3.65464255158233e-06, + "loss": 0.4069, + "step": 2785 + }, + { + "epoch": 1.8849797023004058, + "grad_norm": 1.0183859549395882, + "learning_rate": 3.6508513256811856e-06, + "loss": 0.4062, + "step": 2786 + }, + { + "epoch": 1.8856562922868743, + "grad_norm": 0.982689482340641, + "learning_rate": 3.6470609361794972e-06, + "loss": 0.3803, + "step": 2787 + }, + { + "epoch": 1.8863328822733423, + "grad_norm": 0.9794251537636517, + "learning_rate": 3.643271385427105e-06, + "loss": 0.3851, + "step": 2788 + }, + { + "epoch": 1.8870094722598105, + "grad_norm": 0.9682502978651679, + "learning_rate": 3.639482675773324e-06, + "loss": 0.4056, + "step": 2789 + }, + { + "epoch": 1.8876860622462788, + "grad_norm": 0.937772382058869, + "learning_rate": 3.635694809566954e-06, + "loss": 0.3946, + "step": 2790 + }, + { + "epoch": 1.8883626522327468, + "grad_norm": 0.9495086893245787, + "learning_rate": 3.6319077891562616e-06, + "loss": 0.4001, + "step": 2791 + }, + { + "epoch": 1.8890392422192153, + "grad_norm": 1.0283747007480388, + "learning_rate": 3.6281216168889993e-06, + "loss": 0.412, + "step": 2792 + }, + { + "epoch": 1.8897158322056833, + "grad_norm": 0.9069792284601774, + "learning_rate": 3.624336295112388e-06, + "loss": 0.3674, + "step": 2793 + }, + { + "epoch": 1.8903924221921515, + "grad_norm": 0.9522981535658404, + "learning_rate": 3.6205518261731247e-06, + "loss": 0.387, + "step": 2794 + }, + { + "epoch": 1.8910690121786198, + "grad_norm": 0.9582503735982182, + "learning_rate": 3.616768212417375e-06, + "loss": 0.3877, + "step": 2795 + }, + { + "epoch": 1.8917456021650878, + "grad_norm": 0.957293919962844, + "learning_rate": 3.6129854561907786e-06, + "loss": 0.3827, + "step": 2796 + }, + { + "epoch": 1.8924221921515563, + "grad_norm": 0.946894126538796, + "learning_rate": 3.6092035598384356e-06, + "loss": 0.3799, + "step": 2797 + }, + { + "epoch": 1.8930987821380243, + "grad_norm": 0.9447382438618128, + "learning_rate": 3.6054225257049204e-06, + "loss": 0.3708, + "step": 2798 + }, + { + "epoch": 1.8937753721244925, + "grad_norm": 0.9500877877986935, + "learning_rate": 3.6016423561342707e-06, + "loss": 0.3773, + "step": 2799 + }, + { + "epoch": 1.8944519621109608, + "grad_norm": 0.9641975593944571, + "learning_rate": 3.5978630534699873e-06, + "loss": 0.3795, + "step": 2800 + }, + { + "epoch": 1.8951285520974288, + "grad_norm": 0.9453746819066781, + "learning_rate": 3.5940846200550327e-06, + "loss": 0.3957, + "step": 2801 + }, + { + "epoch": 1.8958051420838973, + "grad_norm": 0.9508531731708235, + "learning_rate": 3.5903070582318356e-06, + "loss": 0.3838, + "step": 2802 + }, + { + "epoch": 1.8964817320703653, + "grad_norm": 0.9608120145748116, + "learning_rate": 3.5865303703422794e-06, + "loss": 0.39, + "step": 2803 + }, + { + "epoch": 1.8971583220568335, + "grad_norm": 0.9692517729007015, + "learning_rate": 3.5827545587277033e-06, + "loss": 0.3946, + "step": 2804 + }, + { + "epoch": 1.8978349120433018, + "grad_norm": 0.9670414702909218, + "learning_rate": 3.5789796257289117e-06, + "loss": 0.3763, + "step": 2805 + }, + { + "epoch": 1.8985115020297698, + "grad_norm": 0.9630253511275844, + "learning_rate": 3.5752055736861567e-06, + "loss": 0.391, + "step": 2806 + }, + { + "epoch": 1.8991880920162383, + "grad_norm": 0.9201773183133836, + "learning_rate": 3.571432404939149e-06, + "loss": 0.368, + "step": 2807 + }, + { + "epoch": 1.8998646820027063, + "grad_norm": 1.0000228594068894, + "learning_rate": 3.567660121827048e-06, + "loss": 0.3884, + "step": 2808 + }, + { + "epoch": 1.9005412719891746, + "grad_norm": 1.001379977382614, + "learning_rate": 3.5638887266884682e-06, + "loss": 0.3985, + "step": 2809 + }, + { + "epoch": 1.9012178619756428, + "grad_norm": 1.0050413652915795, + "learning_rate": 3.5601182218614706e-06, + "loss": 0.4064, + "step": 2810 + }, + { + "epoch": 1.9018944519621108, + "grad_norm": 0.942357973561415, + "learning_rate": 3.5563486096835643e-06, + "loss": 0.3868, + "step": 2811 + }, + { + "epoch": 1.9025710419485793, + "grad_norm": 0.9452019830664797, + "learning_rate": 3.552579892491704e-06, + "loss": 0.3906, + "step": 2812 + }, + { + "epoch": 1.9032476319350473, + "grad_norm": 0.9744015353774492, + "learning_rate": 3.548812072622294e-06, + "loss": 0.381, + "step": 2813 + }, + { + "epoch": 1.9039242219215156, + "grad_norm": 0.971122503753358, + "learning_rate": 3.545045152411178e-06, + "loss": 0.3837, + "step": 2814 + }, + { + "epoch": 1.9046008119079838, + "grad_norm": 0.9527853306453381, + "learning_rate": 3.5412791341936446e-06, + "loss": 0.3846, + "step": 2815 + }, + { + "epoch": 1.9052774018944518, + "grad_norm": 0.9159435943064962, + "learning_rate": 3.5375140203044233e-06, + "loss": 0.3792, + "step": 2816 + }, + { + "epoch": 1.9059539918809203, + "grad_norm": 0.9578571255864694, + "learning_rate": 3.533749813077677e-06, + "loss": 0.3731, + "step": 2817 + }, + { + "epoch": 1.9066305818673883, + "grad_norm": 0.9605758862853653, + "learning_rate": 3.5299865148470157e-06, + "loss": 0.3922, + "step": 2818 + }, + { + "epoch": 1.9073071718538566, + "grad_norm": 1.0324707495534615, + "learning_rate": 3.526224127945479e-06, + "loss": 0.4137, + "step": 2819 + }, + { + "epoch": 1.9079837618403248, + "grad_norm": 0.9866128094125127, + "learning_rate": 3.5224626547055463e-06, + "loss": 0.387, + "step": 2820 + }, + { + "epoch": 1.9086603518267928, + "grad_norm": 0.9579418544230219, + "learning_rate": 3.518702097459126e-06, + "loss": 0.3788, + "step": 2821 + }, + { + "epoch": 1.9093369418132613, + "grad_norm": 1.0191306852982904, + "learning_rate": 3.5149424585375623e-06, + "loss": 0.4013, + "step": 2822 + }, + { + "epoch": 1.9100135317997293, + "grad_norm": 0.973425893268513, + "learning_rate": 3.5111837402716297e-06, + "loss": 0.3799, + "step": 2823 + }, + { + "epoch": 1.9106901217861976, + "grad_norm": 0.945748236263133, + "learning_rate": 3.507425944991529e-06, + "loss": 0.3794, + "step": 2824 + }, + { + "epoch": 1.9113667117726658, + "grad_norm": 0.9875665785501894, + "learning_rate": 3.5036690750268897e-06, + "loss": 0.3863, + "step": 2825 + }, + { + "epoch": 1.9120433017591338, + "grad_norm": 0.9574025280037418, + "learning_rate": 3.499913132706771e-06, + "loss": 0.3903, + "step": 2826 + }, + { + "epoch": 1.9127198917456023, + "grad_norm": 0.9934220298931912, + "learning_rate": 3.496158120359653e-06, + "loss": 0.374, + "step": 2827 + }, + { + "epoch": 1.9133964817320703, + "grad_norm": 0.9842609953846655, + "learning_rate": 3.492404040313443e-06, + "loss": 0.3896, + "step": 2828 + }, + { + "epoch": 1.9140730717185386, + "grad_norm": 0.9350775444620787, + "learning_rate": 3.4886508948954656e-06, + "loss": 0.3931, + "step": 2829 + }, + { + "epoch": 1.9147496617050068, + "grad_norm": 0.9529282501576434, + "learning_rate": 3.484898686432473e-06, + "loss": 0.378, + "step": 2830 + }, + { + "epoch": 1.9154262516914748, + "grad_norm": 0.9054300867056541, + "learning_rate": 3.4811474172506277e-06, + "loss": 0.3606, + "step": 2831 + }, + { + "epoch": 1.9161028416779433, + "grad_norm": 0.9730209052798792, + "learning_rate": 3.4773970896755167e-06, + "loss": 0.402, + "step": 2832 + }, + { + "epoch": 1.9167794316644113, + "grad_norm": 0.9711823319274516, + "learning_rate": 3.4736477060321387e-06, + "loss": 0.397, + "step": 2833 + }, + { + "epoch": 1.9174560216508796, + "grad_norm": 0.9724124107493505, + "learning_rate": 3.469899268644913e-06, + "loss": 0.3985, + "step": 2834 + }, + { + "epoch": 1.9181326116373478, + "grad_norm": 0.9626689911589725, + "learning_rate": 3.466151779837665e-06, + "loss": 0.3871, + "step": 2835 + }, + { + "epoch": 1.9188092016238159, + "grad_norm": 0.9288249770414444, + "learning_rate": 3.4624052419336395e-06, + "loss": 0.3974, + "step": 2836 + }, + { + "epoch": 1.9194857916102843, + "grad_norm": 0.9648098540021439, + "learning_rate": 3.458659657255486e-06, + "loss": 0.3914, + "step": 2837 + }, + { + "epoch": 1.9201623815967523, + "grad_norm": 1.0116173148389211, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.4055, + "step": 2838 + }, + { + "epoch": 1.9208389715832206, + "grad_norm": 0.9648934805064656, + "learning_rate": 3.4511713568644432e-06, + "loss": 0.4077, + "step": 2839 + }, + { + "epoch": 1.9215155615696888, + "grad_norm": 0.9505351658348619, + "learning_rate": 3.4474286457938976e-06, + "loss": 0.3908, + "step": 2840 + }, + { + "epoch": 1.9221921515561569, + "grad_norm": 0.9178824448310808, + "learning_rate": 3.4436868972339073e-06, + "loss": 0.378, + "step": 2841 + }, + { + "epoch": 1.9228687415426253, + "grad_norm": 0.934040816987798, + "learning_rate": 3.4399461135041525e-06, + "loss": 0.3667, + "step": 2842 + }, + { + "epoch": 1.9235453315290933, + "grad_norm": 0.9379242534960136, + "learning_rate": 3.4362062969237227e-06, + "loss": 0.3941, + "step": 2843 + }, + { + "epoch": 1.9242219215155616, + "grad_norm": 0.9514297065230649, + "learning_rate": 3.4324674498110956e-06, + "loss": 0.3855, + "step": 2844 + }, + { + "epoch": 1.9248985115020298, + "grad_norm": 0.9103341771896453, + "learning_rate": 3.4287295744841588e-06, + "loss": 0.374, + "step": 2845 + }, + { + "epoch": 1.9255751014884979, + "grad_norm": 0.9761131062101809, + "learning_rate": 3.4249926732601914e-06, + "loss": 0.3738, + "step": 2846 + }, + { + "epoch": 1.9262516914749663, + "grad_norm": 0.9696485524449143, + "learning_rate": 3.4212567484558735e-06, + "loss": 0.3919, + "step": 2847 + }, + { + "epoch": 1.9269282814614344, + "grad_norm": 0.9538678927713922, + "learning_rate": 3.4175218023872753e-06, + "loss": 0.3911, + "step": 2848 + }, + { + "epoch": 1.9276048714479026, + "grad_norm": 0.9389077060372707, + "learning_rate": 3.413787837369863e-06, + "loss": 0.3895, + "step": 2849 + }, + { + "epoch": 1.9282814614343708, + "grad_norm": 0.9972066459954332, + "learning_rate": 3.4100548557184944e-06, + "loss": 0.4015, + "step": 2850 + }, + { + "epoch": 1.9289580514208389, + "grad_norm": 0.9216920339702025, + "learning_rate": 3.4063228597474133e-06, + "loss": 0.3687, + "step": 2851 + }, + { + "epoch": 1.9296346414073073, + "grad_norm": 0.9738682835054276, + "learning_rate": 3.40259185177026e-06, + "loss": 0.3874, + "step": 2852 + }, + { + "epoch": 1.9303112313937754, + "grad_norm": 0.9608396408178609, + "learning_rate": 3.3988618341000566e-06, + "loss": 0.3761, + "step": 2853 + }, + { + "epoch": 1.9309878213802436, + "grad_norm": 1.010039770864873, + "learning_rate": 3.395132809049212e-06, + "loss": 0.3998, + "step": 2854 + }, + { + "epoch": 1.9316644113667119, + "grad_norm": 0.9708243237418063, + "learning_rate": 3.391404778929523e-06, + "loss": 0.3907, + "step": 2855 + }, + { + "epoch": 1.9323410013531799, + "grad_norm": 0.9047366472542193, + "learning_rate": 3.3876777460521647e-06, + "loss": 0.3585, + "step": 2856 + }, + { + "epoch": 1.9330175913396481, + "grad_norm": 0.9734212954300306, + "learning_rate": 3.383951712727701e-06, + "loss": 0.3929, + "step": 2857 + }, + { + "epoch": 1.9336941813261164, + "grad_norm": 0.9324070495018218, + "learning_rate": 3.3802266812660674e-06, + "loss": 0.3788, + "step": 2858 + }, + { + "epoch": 1.9343707713125846, + "grad_norm": 0.947309948772134, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.3821, + "step": 2859 + }, + { + "epoch": 1.9350473612990529, + "grad_norm": 0.9021359226034089, + "learning_rate": 3.372779633167946e-06, + "loss": 0.3718, + "step": 2860 + }, + { + "epoch": 1.9357239512855209, + "grad_norm": 0.9935978264460849, + "learning_rate": 3.369057621148227e-06, + "loss": 0.3821, + "step": 2861 + }, + { + "epoch": 1.9364005412719891, + "grad_norm": 0.9164059370888998, + "learning_rate": 3.3653366202248738e-06, + "loss": 0.3814, + "step": 2862 + }, + { + "epoch": 1.9370771312584574, + "grad_norm": 0.9674431295211486, + "learning_rate": 3.3616166327047084e-06, + "loss": 0.3935, + "step": 2863 + }, + { + "epoch": 1.9377537212449256, + "grad_norm": 0.915352582882395, + "learning_rate": 3.3578976608939184e-06, + "loss": 0.3819, + "step": 2864 + }, + { + "epoch": 1.9384303112313939, + "grad_norm": 0.9567493191761282, + "learning_rate": 3.3541797070980663e-06, + "loss": 0.3864, + "step": 2865 + }, + { + "epoch": 1.939106901217862, + "grad_norm": 0.91384633516482, + "learning_rate": 3.3504627736220863e-06, + "loss": 0.3624, + "step": 2866 + }, + { + "epoch": 1.9397834912043301, + "grad_norm": 0.931640281843564, + "learning_rate": 3.3467468627702736e-06, + "loss": 0.3748, + "step": 2867 + }, + { + "epoch": 1.9404600811907984, + "grad_norm": 0.9584167372969533, + "learning_rate": 3.3430319768462956e-06, + "loss": 0.3855, + "step": 2868 + }, + { + "epoch": 1.9411366711772666, + "grad_norm": 0.9405848069357464, + "learning_rate": 3.3393181181531785e-06, + "loss": 0.3758, + "step": 2869 + }, + { + "epoch": 1.9418132611637349, + "grad_norm": 1.0128862798860447, + "learning_rate": 3.3356052889933177e-06, + "loss": 0.3944, + "step": 2870 + }, + { + "epoch": 1.942489851150203, + "grad_norm": 0.9654630010379878, + "learning_rate": 3.331893491668464e-06, + "loss": 0.3848, + "step": 2871 + }, + { + "epoch": 1.9431664411366711, + "grad_norm": 0.9471093982546952, + "learning_rate": 3.3281827284797317e-06, + "loss": 0.401, + "step": 2872 + }, + { + "epoch": 1.9438430311231394, + "grad_norm": 0.9104591170027402, + "learning_rate": 3.3244730017275974e-06, + "loss": 0.3756, + "step": 2873 + }, + { + "epoch": 1.9445196211096076, + "grad_norm": 0.9585672602525391, + "learning_rate": 3.3207643137118872e-06, + "loss": 0.3787, + "step": 2874 + }, + { + "epoch": 1.9451962110960759, + "grad_norm": 0.9051463480050137, + "learning_rate": 3.3170566667317917e-06, + "loss": 0.3672, + "step": 2875 + }, + { + "epoch": 1.945872801082544, + "grad_norm": 0.9602883465334724, + "learning_rate": 3.3133500630858507e-06, + "loss": 0.3868, + "step": 2876 + }, + { + "epoch": 1.9465493910690121, + "grad_norm": 0.8848662794491753, + "learning_rate": 3.309644505071959e-06, + "loss": 0.3656, + "step": 2877 + }, + { + "epoch": 1.9472259810554804, + "grad_norm": 0.8776402634641406, + "learning_rate": 3.3059399949873605e-06, + "loss": 0.3613, + "step": 2878 + }, + { + "epoch": 1.9479025710419484, + "grad_norm": 0.9278448142055523, + "learning_rate": 3.3022365351286545e-06, + "loss": 0.3688, + "step": 2879 + }, + { + "epoch": 1.9485791610284169, + "grad_norm": 0.9318795655942258, + "learning_rate": 3.298534127791785e-06, + "loss": 0.3845, + "step": 2880 + }, + { + "epoch": 1.949255751014885, + "grad_norm": 0.9702668985938906, + "learning_rate": 3.2948327752720464e-06, + "loss": 0.3986, + "step": 2881 + }, + { + "epoch": 1.9499323410013532, + "grad_norm": 0.9421255263990418, + "learning_rate": 3.2911324798640764e-06, + "loss": 0.396, + "step": 2882 + }, + { + "epoch": 1.9506089309878214, + "grad_norm": 0.9636564263109836, + "learning_rate": 3.2874332438618607e-06, + "loss": 0.3935, + "step": 2883 + }, + { + "epoch": 1.9512855209742894, + "grad_norm": 0.9357187148746272, + "learning_rate": 3.2837350695587237e-06, + "loss": 0.3829, + "step": 2884 + }, + { + "epoch": 1.951962110960758, + "grad_norm": 0.9276320040386208, + "learning_rate": 3.280037959247336e-06, + "loss": 0.3847, + "step": 2885 + }, + { + "epoch": 1.952638700947226, + "grad_norm": 0.9247564963846984, + "learning_rate": 3.276341915219704e-06, + "loss": 0.3819, + "step": 2886 + }, + { + "epoch": 1.9533152909336942, + "grad_norm": 0.9518951118579614, + "learning_rate": 3.2726469397671797e-06, + "loss": 0.3871, + "step": 2887 + }, + { + "epoch": 1.9539918809201624, + "grad_norm": 0.9502773764867274, + "learning_rate": 3.268953035180445e-06, + "loss": 0.3835, + "step": 2888 + }, + { + "epoch": 1.9546684709066304, + "grad_norm": 0.9853605160786062, + "learning_rate": 3.2652602037495247e-06, + "loss": 0.388, + "step": 2889 + }, + { + "epoch": 1.955345060893099, + "grad_norm": 0.9808710233742232, + "learning_rate": 3.261568447763775e-06, + "loss": 0.4045, + "step": 2890 + }, + { + "epoch": 1.956021650879567, + "grad_norm": 0.9350573077338281, + "learning_rate": 3.2578777695118822e-06, + "loss": 0.3916, + "step": 2891 + }, + { + "epoch": 1.9566982408660352, + "grad_norm": 0.9990549083597429, + "learning_rate": 3.254188171281871e-06, + "loss": 0.3927, + "step": 2892 + }, + { + "epoch": 1.9573748308525034, + "grad_norm": 0.9300835015869638, + "learning_rate": 3.2504996553610924e-06, + "loss": 0.3829, + "step": 2893 + }, + { + "epoch": 1.9580514208389714, + "grad_norm": 0.9337900156826742, + "learning_rate": 3.2468122240362287e-06, + "loss": 0.3834, + "step": 2894 + }, + { + "epoch": 1.95872801082544, + "grad_norm": 0.9748744128707281, + "learning_rate": 3.2431258795932863e-06, + "loss": 0.3924, + "step": 2895 + }, + { + "epoch": 1.959404600811908, + "grad_norm": 0.9355614717781006, + "learning_rate": 3.2394406243176025e-06, + "loss": 0.3784, + "step": 2896 + }, + { + "epoch": 1.9600811907983762, + "grad_norm": 0.9587164673655073, + "learning_rate": 3.2357564604938363e-06, + "loss": 0.3854, + "step": 2897 + }, + { + "epoch": 1.9607577807848444, + "grad_norm": 0.8825622603252513, + "learning_rate": 3.232073390405969e-06, + "loss": 0.361, + "step": 2898 + }, + { + "epoch": 1.9614343707713124, + "grad_norm": 0.9285993721908563, + "learning_rate": 3.2283914163373064e-06, + "loss": 0.3789, + "step": 2899 + }, + { + "epoch": 1.962110960757781, + "grad_norm": 0.9564243877471528, + "learning_rate": 3.224710540570475e-06, + "loss": 0.3867, + "step": 2900 + }, + { + "epoch": 1.962787550744249, + "grad_norm": 1.019897269612813, + "learning_rate": 3.2210307653874175e-06, + "loss": 0.4011, + "step": 2901 + }, + { + "epoch": 1.9634641407307172, + "grad_norm": 0.9139846983316788, + "learning_rate": 3.2173520930693987e-06, + "loss": 0.3662, + "step": 2902 + }, + { + "epoch": 1.9641407307171854, + "grad_norm": 0.920949944630482, + "learning_rate": 3.2136745258969965e-06, + "loss": 0.3789, + "step": 2903 + }, + { + "epoch": 1.9648173207036534, + "grad_norm": 0.9650036235115824, + "learning_rate": 3.2099980661501016e-06, + "loss": 0.4025, + "step": 2904 + }, + { + "epoch": 1.965493910690122, + "grad_norm": 0.9547398265775282, + "learning_rate": 3.2063227161079234e-06, + "loss": 0.3871, + "step": 2905 + }, + { + "epoch": 1.96617050067659, + "grad_norm": 0.9612879639781636, + "learning_rate": 3.202648478048981e-06, + "loss": 0.3795, + "step": 2906 + }, + { + "epoch": 1.9668470906630582, + "grad_norm": 0.9477117701441659, + "learning_rate": 3.1989753542511016e-06, + "loss": 0.3732, + "step": 2907 + }, + { + "epoch": 1.9675236806495264, + "grad_norm": 0.9761982483445683, + "learning_rate": 3.1953033469914273e-06, + "loss": 0.3894, + "step": 2908 + }, + { + "epoch": 1.9682002706359945, + "grad_norm": 0.9215681550973517, + "learning_rate": 3.191632458546401e-06, + "loss": 0.3722, + "step": 2909 + }, + { + "epoch": 1.968876860622463, + "grad_norm": 0.9412949533312766, + "learning_rate": 3.1879626911917806e-06, + "loss": 0.3901, + "step": 2910 + }, + { + "epoch": 1.969553450608931, + "grad_norm": 0.9691348369623856, + "learning_rate": 3.1842940472026194e-06, + "loss": 0.392, + "step": 2911 + }, + { + "epoch": 1.9702300405953992, + "grad_norm": 0.9758249542217725, + "learning_rate": 3.18062652885328e-06, + "loss": 0.3757, + "step": 2912 + }, + { + "epoch": 1.9709066305818674, + "grad_norm": 0.9751745215266456, + "learning_rate": 3.1769601384174274e-06, + "loss": 0.3986, + "step": 2913 + }, + { + "epoch": 1.9715832205683355, + "grad_norm": 0.9087155948453963, + "learning_rate": 3.173294878168025e-06, + "loss": 0.3661, + "step": 2914 + }, + { + "epoch": 1.972259810554804, + "grad_norm": 0.9523954700522769, + "learning_rate": 3.169630750377337e-06, + "loss": 0.3713, + "step": 2915 + }, + { + "epoch": 1.972936400541272, + "grad_norm": 0.9720553313694448, + "learning_rate": 3.165967757316925e-06, + "loss": 0.388, + "step": 2916 + }, + { + "epoch": 1.9736129905277402, + "grad_norm": 0.9590252306028271, + "learning_rate": 3.16230590125765e-06, + "loss": 0.3943, + "step": 2917 + }, + { + "epoch": 1.9742895805142084, + "grad_norm": 0.9783369003095155, + "learning_rate": 3.1586451844696596e-06, + "loss": 0.3923, + "step": 2918 + }, + { + "epoch": 1.9749661705006765, + "grad_norm": 0.9707047467522676, + "learning_rate": 3.154985609222405e-06, + "loss": 0.3897, + "step": 2919 + }, + { + "epoch": 1.975642760487145, + "grad_norm": 0.9834293043937559, + "learning_rate": 3.1513271777846244e-06, + "loss": 0.3967, + "step": 2920 + }, + { + "epoch": 1.976319350473613, + "grad_norm": 0.9513142124211352, + "learning_rate": 3.1476698924243487e-06, + "loss": 0.3982, + "step": 2921 + }, + { + "epoch": 1.9769959404600812, + "grad_norm": 0.9412110601695037, + "learning_rate": 3.1440137554088957e-06, + "loss": 0.387, + "step": 2922 + }, + { + "epoch": 1.9776725304465494, + "grad_norm": 0.9159394025749934, + "learning_rate": 3.1403587690048775e-06, + "loss": 0.3786, + "step": 2923 + }, + { + "epoch": 1.9783491204330175, + "grad_norm": 0.9395174913262756, + "learning_rate": 3.1367049354781854e-06, + "loss": 0.3946, + "step": 2924 + }, + { + "epoch": 1.979025710419486, + "grad_norm": 0.9383214231914938, + "learning_rate": 3.1330522570939987e-06, + "loss": 0.3842, + "step": 2925 + }, + { + "epoch": 1.979702300405954, + "grad_norm": 0.9686656631817997, + "learning_rate": 3.129400736116783e-06, + "loss": 0.3975, + "step": 2926 + }, + { + "epoch": 1.9803788903924222, + "grad_norm": 0.9671352879487586, + "learning_rate": 3.125750374810283e-06, + "loss": 0.3767, + "step": 2927 + }, + { + "epoch": 1.9810554803788905, + "grad_norm": 0.9242821609719624, + "learning_rate": 3.1221011754375275e-06, + "loss": 0.383, + "step": 2928 + }, + { + "epoch": 1.9817320703653585, + "grad_norm": 0.959463781802831, + "learning_rate": 3.118453140260823e-06, + "loss": 0.3792, + "step": 2929 + }, + { + "epoch": 1.982408660351827, + "grad_norm": 0.9515821291385462, + "learning_rate": 3.1148062715417553e-06, + "loss": 0.3903, + "step": 2930 + }, + { + "epoch": 1.983085250338295, + "grad_norm": 0.9327741393852783, + "learning_rate": 3.111160571541183e-06, + "loss": 0.3777, + "step": 2931 + }, + { + "epoch": 1.9837618403247632, + "grad_norm": 0.8922039273045852, + "learning_rate": 3.107516042519248e-06, + "loss": 0.3535, + "step": 2932 + }, + { + "epoch": 1.9844384303112315, + "grad_norm": 0.966783229592581, + "learning_rate": 3.1038726867353587e-06, + "loss": 0.3761, + "step": 2933 + }, + { + "epoch": 1.9851150202976995, + "grad_norm": 0.9620160626649537, + "learning_rate": 3.1002305064482006e-06, + "loss": 0.3754, + "step": 2934 + }, + { + "epoch": 1.985791610284168, + "grad_norm": 0.9700042274732317, + "learning_rate": 3.096589503915729e-06, + "loss": 0.3779, + "step": 2935 + }, + { + "epoch": 1.986468200270636, + "grad_norm": 0.9842938956826476, + "learning_rate": 3.09294968139517e-06, + "loss": 0.3882, + "step": 2936 + }, + { + "epoch": 1.9871447902571042, + "grad_norm": 0.9773708083210262, + "learning_rate": 3.089311041143017e-06, + "loss": 0.3811, + "step": 2937 + }, + { + "epoch": 1.9878213802435725, + "grad_norm": 0.9484892252396426, + "learning_rate": 3.085673585415031e-06, + "loss": 0.3884, + "step": 2938 + }, + { + "epoch": 1.9884979702300405, + "grad_norm": 0.9783067505673053, + "learning_rate": 3.082037316466236e-06, + "loss": 0.3886, + "step": 2939 + }, + { + "epoch": 1.989174560216509, + "grad_norm": 0.9345842184278825, + "learning_rate": 3.078402236550926e-06, + "loss": 0.3762, + "step": 2940 + }, + { + "epoch": 1.989851150202977, + "grad_norm": 0.9368630543795368, + "learning_rate": 3.074768347922652e-06, + "loss": 0.3741, + "step": 2941 + }, + { + "epoch": 1.9905277401894452, + "grad_norm": 0.9335098843050984, + "learning_rate": 3.0711356528342316e-06, + "loss": 0.3712, + "step": 2942 + }, + { + "epoch": 1.9912043301759135, + "grad_norm": 0.9786673322188582, + "learning_rate": 3.06750415353774e-06, + "loss": 0.3958, + "step": 2943 + }, + { + "epoch": 1.9918809201623815, + "grad_norm": 0.9166916609198876, + "learning_rate": 3.063873852284508e-06, + "loss": 0.3788, + "step": 2944 + }, + { + "epoch": 1.9925575101488497, + "grad_norm": 0.9407536988683556, + "learning_rate": 3.0602447513251287e-06, + "loss": 0.389, + "step": 2945 + }, + { + "epoch": 1.993234100135318, + "grad_norm": 0.9169991498718381, + "learning_rate": 3.0566168529094485e-06, + "loss": 0.3808, + "step": 2946 + }, + { + "epoch": 1.9939106901217862, + "grad_norm": 0.925987917772624, + "learning_rate": 3.0529901592865705e-06, + "loss": 0.3706, + "step": 2947 + }, + { + "epoch": 1.9945872801082545, + "grad_norm": 0.9076401766987651, + "learning_rate": 3.0493646727048463e-06, + "loss": 0.3646, + "step": 2948 + }, + { + "epoch": 1.9952638700947225, + "grad_norm": 0.9495977118673681, + "learning_rate": 3.045740395411886e-06, + "loss": 0.3805, + "step": 2949 + }, + { + "epoch": 1.9959404600811907, + "grad_norm": 1.0197104166964828, + "learning_rate": 3.042117329654544e-06, + "loss": 0.3983, + "step": 2950 + }, + { + "epoch": 1.996617050067659, + "grad_norm": 0.9726730375147471, + "learning_rate": 3.0384954776789255e-06, + "loss": 0.3996, + "step": 2951 + }, + { + "epoch": 1.9972936400541272, + "grad_norm": 0.9158795363181512, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.3733, + "step": 2952 + }, + { + "epoch": 1.9979702300405955, + "grad_norm": 0.9505212869269329, + "learning_rate": 3.0312554240535166e-06, + "loss": 0.3865, + "step": 2953 + }, + { + "epoch": 1.9986468200270635, + "grad_norm": 0.9862754977243918, + "learning_rate": 3.0276372268921694e-06, + "loss": 0.3877, + "step": 2954 + }, + { + "epoch": 1.9993234100135318, + "grad_norm": 1.0263494141291425, + "learning_rate": 3.0240202524894304e-06, + "loss": 0.3886, + "step": 2955 + }, + { + "epoch": 2.0, + "grad_norm": 0.8873739972552815, + "learning_rate": 3.0204045030876267e-06, + "loss": 0.3638, + "step": 2956 + }, + { + "epoch": 2.0, + "eval_loss": 0.40424153208732605, + "eval_runtime": 431.6998, + "eval_samples_per_second": 23.06, + "eval_steps_per_second": 0.723, + "step": 2956 + }, + { + "epoch": 2.000676589986468, + "grad_norm": 0.9038130925946041, + "learning_rate": 3.016789980928331e-06, + "loss": 0.3329, + "step": 2957 + }, + { + "epoch": 2.0013531799729365, + "grad_norm": 0.8816872368815492, + "learning_rate": 3.013176688252349e-06, + "loss": 0.3185, + "step": 2958 + }, + { + "epoch": 2.0020297699594045, + "grad_norm": 0.9200538669123807, + "learning_rate": 3.009564627299728e-06, + "loss": 0.3411, + "step": 2959 + }, + { + "epoch": 2.002706359945873, + "grad_norm": 0.9047575100079928, + "learning_rate": 3.005953800309752e-06, + "loss": 0.3248, + "step": 2960 + }, + { + "epoch": 2.003382949932341, + "grad_norm": 0.9004158664370083, + "learning_rate": 3.0023442095209386e-06, + "loss": 0.3231, + "step": 2961 + }, + { + "epoch": 2.004059539918809, + "grad_norm": 0.9776540657457075, + "learning_rate": 2.9987358571710394e-06, + "loss": 0.3456, + "step": 2962 + }, + { + "epoch": 2.0047361299052775, + "grad_norm": 0.9376307261051864, + "learning_rate": 2.9951287454970405e-06, + "loss": 0.3278, + "step": 2963 + }, + { + "epoch": 2.0054127198917455, + "grad_norm": 1.0297889212940174, + "learning_rate": 2.991522876735154e-06, + "loss": 0.3374, + "step": 2964 + }, + { + "epoch": 2.006089309878214, + "grad_norm": 1.0007457037767937, + "learning_rate": 2.987918253120824e-06, + "loss": 0.3315, + "step": 2965 + }, + { + "epoch": 2.006765899864682, + "grad_norm": 0.9459797377430096, + "learning_rate": 2.984314876888725e-06, + "loss": 0.3109, + "step": 2966 + }, + { + "epoch": 2.00744248985115, + "grad_norm": 1.0540213518299297, + "learning_rate": 2.980712750272754e-06, + "loss": 0.3395, + "step": 2967 + }, + { + "epoch": 2.0081190798376185, + "grad_norm": 1.0139479525332433, + "learning_rate": 2.9771118755060368e-06, + "loss": 0.3369, + "step": 2968 + }, + { + "epoch": 2.0087956698240865, + "grad_norm": 0.9501841209981149, + "learning_rate": 2.9735122548209204e-06, + "loss": 0.3225, + "step": 2969 + }, + { + "epoch": 2.009472259810555, + "grad_norm": 0.9104085672574963, + "learning_rate": 2.96991389044898e-06, + "loss": 0.3173, + "step": 2970 + }, + { + "epoch": 2.010148849797023, + "grad_norm": 0.9795858637459921, + "learning_rate": 2.966316784621e-06, + "loss": 0.3297, + "step": 2971 + }, + { + "epoch": 2.010825439783491, + "grad_norm": 0.9498513959342357, + "learning_rate": 2.9627209395669978e-06, + "loss": 0.327, + "step": 2972 + }, + { + "epoch": 2.0115020297699595, + "grad_norm": 0.9486617766874432, + "learning_rate": 2.9591263575162e-06, + "loss": 0.3171, + "step": 2973 + }, + { + "epoch": 2.0121786197564275, + "grad_norm": 0.9425205170580246, + "learning_rate": 2.9555330406970568e-06, + "loss": 0.3262, + "step": 2974 + }, + { + "epoch": 2.012855209742896, + "grad_norm": 0.9553225339558391, + "learning_rate": 2.9519409913372286e-06, + "loss": 0.3257, + "step": 2975 + }, + { + "epoch": 2.013531799729364, + "grad_norm": 1.0181726236530662, + "learning_rate": 2.9483502116635943e-06, + "loss": 0.3312, + "step": 2976 + }, + { + "epoch": 2.014208389715832, + "grad_norm": 0.9212436349490788, + "learning_rate": 2.9447607039022443e-06, + "loss": 0.3088, + "step": 2977 + }, + { + "epoch": 2.0148849797023005, + "grad_norm": 0.9826074890506608, + "learning_rate": 2.9411724702784762e-06, + "loss": 0.3315, + "step": 2978 + }, + { + "epoch": 2.0155615696887685, + "grad_norm": 0.952169968543756, + "learning_rate": 2.9375855130168046e-06, + "loss": 0.3155, + "step": 2979 + }, + { + "epoch": 2.016238159675237, + "grad_norm": 0.986823165347708, + "learning_rate": 2.9339998343409484e-06, + "loss": 0.3278, + "step": 2980 + }, + { + "epoch": 2.016914749661705, + "grad_norm": 0.9663696510862468, + "learning_rate": 2.9304154364738358e-06, + "loss": 0.3151, + "step": 2981 + }, + { + "epoch": 2.017591339648173, + "grad_norm": 1.0038324147866482, + "learning_rate": 2.9268323216375997e-06, + "loss": 0.3411, + "step": 2982 + }, + { + "epoch": 2.0182679296346415, + "grad_norm": 0.9538096600942009, + "learning_rate": 2.92325049205358e-06, + "loss": 0.3286, + "step": 2983 + }, + { + "epoch": 2.0189445196211095, + "grad_norm": 1.0710026705897027, + "learning_rate": 2.9196699499423143e-06, + "loss": 0.3306, + "step": 2984 + }, + { + "epoch": 2.019621109607578, + "grad_norm": 0.9978318187088097, + "learning_rate": 2.9160906975235493e-06, + "loss": 0.3161, + "step": 2985 + }, + { + "epoch": 2.020297699594046, + "grad_norm": 1.0036118000027836, + "learning_rate": 2.9125127370162253e-06, + "loss": 0.3357, + "step": 2986 + }, + { + "epoch": 2.020974289580514, + "grad_norm": 0.9728050363976315, + "learning_rate": 2.908936070638487e-06, + "loss": 0.3199, + "step": 2987 + }, + { + "epoch": 2.0216508795669825, + "grad_norm": 0.9872888844627106, + "learning_rate": 2.9053607006076766e-06, + "loss": 0.3295, + "step": 2988 + }, + { + "epoch": 2.0223274695534506, + "grad_norm": 0.9950866097902137, + "learning_rate": 2.9017866291403275e-06, + "loss": 0.3154, + "step": 2989 + }, + { + "epoch": 2.023004059539919, + "grad_norm": 0.9994860688314967, + "learning_rate": 2.8982138584521734e-06, + "loss": 0.3144, + "step": 2990 + }, + { + "epoch": 2.023680649526387, + "grad_norm": 0.9921616855581498, + "learning_rate": 2.8946423907581377e-06, + "loss": 0.3295, + "step": 2991 + }, + { + "epoch": 2.024357239512855, + "grad_norm": 1.0299146759340774, + "learning_rate": 2.8910722282723404e-06, + "loss": 0.3275, + "step": 2992 + }, + { + "epoch": 2.0250338294993235, + "grad_norm": 1.029696683812635, + "learning_rate": 2.8875033732080865e-06, + "loss": 0.3433, + "step": 2993 + }, + { + "epoch": 2.0257104194857916, + "grad_norm": 1.0043041380089484, + "learning_rate": 2.8839358277778758e-06, + "loss": 0.3259, + "step": 2994 + }, + { + "epoch": 2.02638700947226, + "grad_norm": 1.0166006226414344, + "learning_rate": 2.8803695941933933e-06, + "loss": 0.3407, + "step": 2995 + }, + { + "epoch": 2.027063599458728, + "grad_norm": 1.0217692062128187, + "learning_rate": 2.876804674665515e-06, + "loss": 0.3376, + "step": 2996 + }, + { + "epoch": 2.027740189445196, + "grad_norm": 0.9666269732092508, + "learning_rate": 2.873241071404296e-06, + "loss": 0.3162, + "step": 2997 + }, + { + "epoch": 2.0284167794316645, + "grad_norm": 1.0139647308942759, + "learning_rate": 2.869678786618976e-06, + "loss": 0.3352, + "step": 2998 + }, + { + "epoch": 2.0290933694181326, + "grad_norm": 0.9972898048555547, + "learning_rate": 2.866117822517982e-06, + "loss": 0.3262, + "step": 2999 + }, + { + "epoch": 2.029769959404601, + "grad_norm": 0.9967207575777589, + "learning_rate": 2.86255818130892e-06, + "loss": 0.3294, + "step": 3000 + }, + { + "epoch": 2.030446549391069, + "grad_norm": 1.0790267719312128, + "learning_rate": 2.8589998651985775e-06, + "loss": 0.3399, + "step": 3001 + }, + { + "epoch": 2.031123139377537, + "grad_norm": 1.0059416399173047, + "learning_rate": 2.855442876392914e-06, + "loss": 0.3351, + "step": 3002 + }, + { + "epoch": 2.0317997293640055, + "grad_norm": 1.0123183878980389, + "learning_rate": 2.8518872170970758e-06, + "loss": 0.3146, + "step": 3003 + }, + { + "epoch": 2.0324763193504736, + "grad_norm": 1.0662257057660793, + "learning_rate": 2.848332889515375e-06, + "loss": 0.3343, + "step": 3004 + }, + { + "epoch": 2.033152909336942, + "grad_norm": 1.0578278376530537, + "learning_rate": 2.8447798958513082e-06, + "loss": 0.3338, + "step": 3005 + }, + { + "epoch": 2.03382949932341, + "grad_norm": 1.0526404579909534, + "learning_rate": 2.8412282383075362e-06, + "loss": 0.3319, + "step": 3006 + }, + { + "epoch": 2.034506089309878, + "grad_norm": 1.0372705102399975, + "learning_rate": 2.837677919085896e-06, + "loss": 0.3273, + "step": 3007 + }, + { + "epoch": 2.0351826792963466, + "grad_norm": 0.983954642138402, + "learning_rate": 2.8341289403873952e-06, + "loss": 0.3102, + "step": 3008 + }, + { + "epoch": 2.0358592692828146, + "grad_norm": 1.0214465256758594, + "learning_rate": 2.83058130441221e-06, + "loss": 0.3255, + "step": 3009 + }, + { + "epoch": 2.0365358592692826, + "grad_norm": 1.0569444044633176, + "learning_rate": 2.8270350133596824e-06, + "loss": 0.3398, + "step": 3010 + }, + { + "epoch": 2.037212449255751, + "grad_norm": 0.9854356222554426, + "learning_rate": 2.82349006942832e-06, + "loss": 0.3214, + "step": 3011 + }, + { + "epoch": 2.037889039242219, + "grad_norm": 1.0208905020757164, + "learning_rate": 2.8199464748157983e-06, + "loss": 0.328, + "step": 3012 + }, + { + "epoch": 2.0385656292286876, + "grad_norm": 0.9829155508559776, + "learning_rate": 2.816404231718958e-06, + "loss": 0.3116, + "step": 3013 + }, + { + "epoch": 2.0392422192151556, + "grad_norm": 1.0189846361053592, + "learning_rate": 2.8128633423337932e-06, + "loss": 0.3219, + "step": 3014 + }, + { + "epoch": 2.0399188092016236, + "grad_norm": 0.9756658654015941, + "learning_rate": 2.8093238088554676e-06, + "loss": 0.3164, + "step": 3015 + }, + { + "epoch": 2.040595399188092, + "grad_norm": 1.0397077430063677, + "learning_rate": 2.8057856334783006e-06, + "loss": 0.3316, + "step": 3016 + }, + { + "epoch": 2.04127198917456, + "grad_norm": 1.0399646537929785, + "learning_rate": 2.802248818395773e-06, + "loss": 0.3311, + "step": 3017 + }, + { + "epoch": 2.0419485791610286, + "grad_norm": 1.0206729775037218, + "learning_rate": 2.7987133658005174e-06, + "loss": 0.3213, + "step": 3018 + }, + { + "epoch": 2.0426251691474966, + "grad_norm": 1.063125117670267, + "learning_rate": 2.795179277884321e-06, + "loss": 0.3318, + "step": 3019 + }, + { + "epoch": 2.0433017591339646, + "grad_norm": 1.0226033067382583, + "learning_rate": 2.79164655683813e-06, + "loss": 0.32, + "step": 3020 + }, + { + "epoch": 2.043978349120433, + "grad_norm": 1.028570262460247, + "learning_rate": 2.788115204852042e-06, + "loss": 0.3211, + "step": 3021 + }, + { + "epoch": 2.044654939106901, + "grad_norm": 1.007828137941707, + "learning_rate": 2.7845852241153063e-06, + "loss": 0.3212, + "step": 3022 + }, + { + "epoch": 2.0453315290933696, + "grad_norm": 1.057705082850331, + "learning_rate": 2.781056616816319e-06, + "loss": 0.328, + "step": 3023 + }, + { + "epoch": 2.0460081190798376, + "grad_norm": 1.0025355670518719, + "learning_rate": 2.7775293851426233e-06, + "loss": 0.3166, + "step": 3024 + }, + { + "epoch": 2.0466847090663056, + "grad_norm": 1.0481239008378884, + "learning_rate": 2.7740035312809153e-06, + "loss": 0.3192, + "step": 3025 + }, + { + "epoch": 2.047361299052774, + "grad_norm": 1.0411246643951915, + "learning_rate": 2.7704790574170372e-06, + "loss": 0.3219, + "step": 3026 + }, + { + "epoch": 2.048037889039242, + "grad_norm": 1.02782978110486, + "learning_rate": 2.766955965735968e-06, + "loss": 0.3224, + "step": 3027 + }, + { + "epoch": 2.0487144790257106, + "grad_norm": 1.053754933522437, + "learning_rate": 2.7634342584218364e-06, + "loss": 0.3221, + "step": 3028 + }, + { + "epoch": 2.0493910690121786, + "grad_norm": 1.072464684504966, + "learning_rate": 2.759913937657912e-06, + "loss": 0.3417, + "step": 3029 + }, + { + "epoch": 2.0500676589986466, + "grad_norm": 1.0565733183676842, + "learning_rate": 2.7563950056266053e-06, + "loss": 0.3328, + "step": 3030 + }, + { + "epoch": 2.050744248985115, + "grad_norm": 1.0473899303853187, + "learning_rate": 2.752877464509463e-06, + "loss": 0.3203, + "step": 3031 + }, + { + "epoch": 2.051420838971583, + "grad_norm": 1.0455928615028476, + "learning_rate": 2.7493613164871678e-06, + "loss": 0.3341, + "step": 3032 + }, + { + "epoch": 2.0520974289580516, + "grad_norm": 1.0529692345026416, + "learning_rate": 2.745846563739546e-06, + "loss": 0.3256, + "step": 3033 + }, + { + "epoch": 2.0527740189445196, + "grad_norm": 0.991111229617875, + "learning_rate": 2.7423332084455543e-06, + "loss": 0.3151, + "step": 3034 + }, + { + "epoch": 2.0534506089309876, + "grad_norm": 1.0354432823377364, + "learning_rate": 2.7388212527832814e-06, + "loss": 0.3343, + "step": 3035 + }, + { + "epoch": 2.054127198917456, + "grad_norm": 1.0505842582985414, + "learning_rate": 2.7353106989299528e-06, + "loss": 0.3274, + "step": 3036 + }, + { + "epoch": 2.054803788903924, + "grad_norm": 1.0331790888807864, + "learning_rate": 2.731801549061923e-06, + "loss": 0.3185, + "step": 3037 + }, + { + "epoch": 2.0554803788903926, + "grad_norm": 1.0306134117878722, + "learning_rate": 2.7282938053546727e-06, + "loss": 0.3262, + "step": 3038 + }, + { + "epoch": 2.0561569688768606, + "grad_norm": 1.0289563017047423, + "learning_rate": 2.7247874699828186e-06, + "loss": 0.3202, + "step": 3039 + }, + { + "epoch": 2.0568335588633286, + "grad_norm": 1.0242389485495078, + "learning_rate": 2.7212825451200942e-06, + "loss": 0.3264, + "step": 3040 + }, + { + "epoch": 2.057510148849797, + "grad_norm": 0.9845036838281035, + "learning_rate": 2.7177790329393674e-06, + "loss": 0.3104, + "step": 3041 + }, + { + "epoch": 2.058186738836265, + "grad_norm": 1.0301246381202998, + "learning_rate": 2.7142769356126258e-06, + "loss": 0.325, + "step": 3042 + }, + { + "epoch": 2.0588633288227336, + "grad_norm": 1.030844098927739, + "learning_rate": 2.710776255310984e-06, + "loss": 0.3184, + "step": 3043 + }, + { + "epoch": 2.0595399188092016, + "grad_norm": 1.0694238947846768, + "learning_rate": 2.7072769942046716e-06, + "loss": 0.3288, + "step": 3044 + }, + { + "epoch": 2.0602165087956696, + "grad_norm": 1.0592291058504668, + "learning_rate": 2.7037791544630414e-06, + "loss": 0.3336, + "step": 3045 + }, + { + "epoch": 2.060893098782138, + "grad_norm": 1.0218749998590817, + "learning_rate": 2.700282738254567e-06, + "loss": 0.3309, + "step": 3046 + }, + { + "epoch": 2.061569688768606, + "grad_norm": 1.016657007346522, + "learning_rate": 2.6967877477468394e-06, + "loss": 0.3148, + "step": 3047 + }, + { + "epoch": 2.0622462787550746, + "grad_norm": 1.020059008465994, + "learning_rate": 2.693294185106562e-06, + "loss": 0.3068, + "step": 3048 + }, + { + "epoch": 2.0629228687415426, + "grad_norm": 1.055423523804485, + "learning_rate": 2.689802052499555e-06, + "loss": 0.3365, + "step": 3049 + }, + { + "epoch": 2.0635994587280106, + "grad_norm": 1.0444281385591616, + "learning_rate": 2.686311352090756e-06, + "loss": 0.3245, + "step": 3050 + }, + { + "epoch": 2.064276048714479, + "grad_norm": 1.0947021393057752, + "learning_rate": 2.682822086044206e-06, + "loss": 0.3406, + "step": 3051 + }, + { + "epoch": 2.064952638700947, + "grad_norm": 1.0466321232172768, + "learning_rate": 2.6793342565230675e-06, + "loss": 0.3209, + "step": 3052 + }, + { + "epoch": 2.0656292286874156, + "grad_norm": 1.0259706505060404, + "learning_rate": 2.6758478656896015e-06, + "loss": 0.31, + "step": 3053 + }, + { + "epoch": 2.0663058186738836, + "grad_norm": 1.0604318032232756, + "learning_rate": 2.6723629157051844e-06, + "loss": 0.3257, + "step": 3054 + }, + { + "epoch": 2.0669824086603517, + "grad_norm": 1.0293439262553075, + "learning_rate": 2.6688794087302993e-06, + "loss": 0.3142, + "step": 3055 + }, + { + "epoch": 2.06765899864682, + "grad_norm": 1.0161790362445395, + "learning_rate": 2.66539734692453e-06, + "loss": 0.314, + "step": 3056 + }, + { + "epoch": 2.068335588633288, + "grad_norm": 1.0612915633711357, + "learning_rate": 2.66191673244657e-06, + "loss": 0.32, + "step": 3057 + }, + { + "epoch": 2.0690121786197566, + "grad_norm": 1.048609064297326, + "learning_rate": 2.658437567454209e-06, + "loss": 0.3259, + "step": 3058 + }, + { + "epoch": 2.0696887686062246, + "grad_norm": 1.0385520165410107, + "learning_rate": 2.6549598541043433e-06, + "loss": 0.3186, + "step": 3059 + }, + { + "epoch": 2.0703653585926927, + "grad_norm": 0.9993832964664853, + "learning_rate": 2.6514835945529706e-06, + "loss": 0.3191, + "step": 3060 + }, + { + "epoch": 2.071041948579161, + "grad_norm": 1.0049445279801426, + "learning_rate": 2.64800879095518e-06, + "loss": 0.307, + "step": 3061 + }, + { + "epoch": 2.071718538565629, + "grad_norm": 1.0329942836922585, + "learning_rate": 2.644535445465164e-06, + "loss": 0.311, + "step": 3062 + }, + { + "epoch": 2.0723951285520976, + "grad_norm": 1.0773788016239711, + "learning_rate": 2.641063560236212e-06, + "loss": 0.3234, + "step": 3063 + }, + { + "epoch": 2.0730717185385656, + "grad_norm": 1.0514388681976856, + "learning_rate": 2.637593137420702e-06, + "loss": 0.3132, + "step": 3064 + }, + { + "epoch": 2.0737483085250337, + "grad_norm": 1.060199126971947, + "learning_rate": 2.6341241791701126e-06, + "loss": 0.3237, + "step": 3065 + }, + { + "epoch": 2.074424898511502, + "grad_norm": 1.0328324282426826, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.3056, + "step": 3066 + }, + { + "epoch": 2.07510148849797, + "grad_norm": 1.0398636123520217, + "learning_rate": 2.627190664965046e-06, + "loss": 0.3251, + "step": 3067 + }, + { + "epoch": 2.0757780784844386, + "grad_norm": 1.0276240513510593, + "learning_rate": 2.623726113308977e-06, + "loss": 0.3218, + "step": 3068 + }, + { + "epoch": 2.0764546684709067, + "grad_norm": 1.0142210211703218, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.3082, + "step": 3069 + }, + { + "epoch": 2.0771312584573747, + "grad_norm": 1.028746549621249, + "learning_rate": 2.616801431628938e-06, + "loss": 0.3153, + "step": 3070 + }, + { + "epoch": 2.077807848443843, + "grad_norm": 1.0925764452314193, + "learning_rate": 2.613341305897898e-06, + "loss": 0.3198, + "step": 3071 + }, + { + "epoch": 2.078484438430311, + "grad_norm": 1.0249065357540452, + "learning_rate": 2.609882659766605e-06, + "loss": 0.3219, + "step": 3072 + }, + { + "epoch": 2.0791610284167796, + "grad_norm": 1.0530121262543295, + "learning_rate": 2.6064254953792344e-06, + "loss": 0.3128, + "step": 3073 + }, + { + "epoch": 2.0798376184032477, + "grad_norm": 1.0569473958278888, + "learning_rate": 2.6029698148790392e-06, + "loss": 0.3343, + "step": 3074 + }, + { + "epoch": 2.0805142083897157, + "grad_norm": 1.0286464058137517, + "learning_rate": 2.5995156204083573e-06, + "loss": 0.3124, + "step": 3075 + }, + { + "epoch": 2.081190798376184, + "grad_norm": 1.0300719279849928, + "learning_rate": 2.5960629141086014e-06, + "loss": 0.3094, + "step": 3076 + }, + { + "epoch": 2.081867388362652, + "grad_norm": 1.0508843236502377, + "learning_rate": 2.5926116981202688e-06, + "loss": 0.3163, + "step": 3077 + }, + { + "epoch": 2.0825439783491206, + "grad_norm": 1.0466602530117037, + "learning_rate": 2.5891619745829184e-06, + "loss": 0.3213, + "step": 3078 + }, + { + "epoch": 2.0832205683355887, + "grad_norm": 1.0550055994518281, + "learning_rate": 2.585713745635197e-06, + "loss": 0.3182, + "step": 3079 + }, + { + "epoch": 2.0838971583220567, + "grad_norm": 1.0494429080200303, + "learning_rate": 2.5822670134148216e-06, + "loss": 0.3207, + "step": 3080 + }, + { + "epoch": 2.084573748308525, + "grad_norm": 1.07663821750747, + "learning_rate": 2.5788217800585812e-06, + "loss": 0.3307, + "step": 3081 + }, + { + "epoch": 2.085250338294993, + "grad_norm": 1.0633110377892574, + "learning_rate": 2.5753780477023314e-06, + "loss": 0.3269, + "step": 3082 + }, + { + "epoch": 2.0859269282814616, + "grad_norm": 1.0242153916380263, + "learning_rate": 2.571935818481005e-06, + "loss": 0.3129, + "step": 3083 + }, + { + "epoch": 2.0866035182679297, + "grad_norm": 1.0754341891045889, + "learning_rate": 2.5684950945285937e-06, + "loss": 0.3314, + "step": 3084 + }, + { + "epoch": 2.0872801082543977, + "grad_norm": 1.0115318342883304, + "learning_rate": 2.5650558779781635e-06, + "loss": 0.3218, + "step": 3085 + }, + { + "epoch": 2.087956698240866, + "grad_norm": 1.0557808811469453, + "learning_rate": 2.5616181709618447e-06, + "loss": 0.3083, + "step": 3086 + }, + { + "epoch": 2.088633288227334, + "grad_norm": 1.1107005877830372, + "learning_rate": 2.558181975610827e-06, + "loss": 0.3409, + "step": 3087 + }, + { + "epoch": 2.089309878213802, + "grad_norm": 1.0489520532540877, + "learning_rate": 2.5547472940553685e-06, + "loss": 0.3235, + "step": 3088 + }, + { + "epoch": 2.0899864682002707, + "grad_norm": 1.0736035785016134, + "learning_rate": 2.551314128424788e-06, + "loss": 0.3362, + "step": 3089 + }, + { + "epoch": 2.0906630581867387, + "grad_norm": 1.0545894547228865, + "learning_rate": 2.5478824808474613e-06, + "loss": 0.3155, + "step": 3090 + }, + { + "epoch": 2.091339648173207, + "grad_norm": 1.0130201067249343, + "learning_rate": 2.5444523534508225e-06, + "loss": 0.3, + "step": 3091 + }, + { + "epoch": 2.092016238159675, + "grad_norm": 1.0883982795950715, + "learning_rate": 2.5410237483613685e-06, + "loss": 0.3324, + "step": 3092 + }, + { + "epoch": 2.092692828146143, + "grad_norm": 1.035595955465196, + "learning_rate": 2.53759666770465e-06, + "loss": 0.3082, + "step": 3093 + }, + { + "epoch": 2.0933694181326117, + "grad_norm": 1.089704057262123, + "learning_rate": 2.5341711136052728e-06, + "loss": 0.3301, + "step": 3094 + }, + { + "epoch": 2.0940460081190797, + "grad_norm": 1.0588476791242767, + "learning_rate": 2.530747088186893e-06, + "loss": 0.3172, + "step": 3095 + }, + { + "epoch": 2.094722598105548, + "grad_norm": 1.0835982309517147, + "learning_rate": 2.527324593572223e-06, + "loss": 0.3385, + "step": 3096 + }, + { + "epoch": 2.095399188092016, + "grad_norm": 1.0416893482873841, + "learning_rate": 2.523903631883028e-06, + "loss": 0.3247, + "step": 3097 + }, + { + "epoch": 2.096075778078484, + "grad_norm": 1.043808058353486, + "learning_rate": 2.520484205240116e-06, + "loss": 0.3177, + "step": 3098 + }, + { + "epoch": 2.0967523680649527, + "grad_norm": 1.0492203272402414, + "learning_rate": 2.517066315763348e-06, + "loss": 0.3256, + "step": 3099 + }, + { + "epoch": 2.0974289580514207, + "grad_norm": 1.0702990144584885, + "learning_rate": 2.5136499655716306e-06, + "loss": 0.3219, + "step": 3100 + }, + { + "epoch": 2.098105548037889, + "grad_norm": 1.0308577581436713, + "learning_rate": 2.5102351567829187e-06, + "loss": 0.3161, + "step": 3101 + }, + { + "epoch": 2.098782138024357, + "grad_norm": 1.054329882960072, + "learning_rate": 2.5068218915142093e-06, + "loss": 0.3235, + "step": 3102 + }, + { + "epoch": 2.0994587280108252, + "grad_norm": 1.1157230257088258, + "learning_rate": 2.503410171881544e-06, + "loss": 0.3348, + "step": 3103 + }, + { + "epoch": 2.1001353179972937, + "grad_norm": 1.067196763066268, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.3196, + "step": 3104 + }, + { + "epoch": 2.1008119079837617, + "grad_norm": 1.0844946213056303, + "learning_rate": 2.496591377983706e-06, + "loss": 0.3288, + "step": 3105 + }, + { + "epoch": 2.10148849797023, + "grad_norm": 1.1023322609921864, + "learning_rate": 2.49318430794582e-06, + "loss": 0.3162, + "step": 3106 + }, + { + "epoch": 2.102165087956698, + "grad_norm": 1.1388866487761733, + "learning_rate": 2.4897787919985457e-06, + "loss": 0.3422, + "step": 3107 + }, + { + "epoch": 2.1028416779431662, + "grad_norm": 1.1183577118989485, + "learning_rate": 2.4863748322531144e-06, + "loss": 0.3356, + "step": 3108 + }, + { + "epoch": 2.1035182679296347, + "grad_norm": 1.0407826855860005, + "learning_rate": 2.4829724308198003e-06, + "loss": 0.3128, + "step": 3109 + }, + { + "epoch": 2.1041948579161027, + "grad_norm": 1.0743978433089592, + "learning_rate": 2.4795715898079116e-06, + "loss": 0.3233, + "step": 3110 + }, + { + "epoch": 2.104871447902571, + "grad_norm": 1.0903473041474492, + "learning_rate": 2.476172311325783e-06, + "loss": 0.3319, + "step": 3111 + }, + { + "epoch": 2.105548037889039, + "grad_norm": 1.024409836189495, + "learning_rate": 2.472774597480783e-06, + "loss": 0.3026, + "step": 3112 + }, + { + "epoch": 2.1062246278755072, + "grad_norm": 1.064844122177811, + "learning_rate": 2.4693784503793128e-06, + "loss": 0.3237, + "step": 3113 + }, + { + "epoch": 2.1069012178619757, + "grad_norm": 1.083568204903442, + "learning_rate": 2.4659838721268005e-06, + "loss": 0.3303, + "step": 3114 + }, + { + "epoch": 2.1075778078484437, + "grad_norm": 1.0366527980044367, + "learning_rate": 2.462590864827703e-06, + "loss": 0.3202, + "step": 3115 + }, + { + "epoch": 2.108254397834912, + "grad_norm": 1.062597365245354, + "learning_rate": 2.4591994305854988e-06, + "loss": 0.321, + "step": 3116 + }, + { + "epoch": 2.10893098782138, + "grad_norm": 1.0989405982152978, + "learning_rate": 2.4558095715026975e-06, + "loss": 0.3188, + "step": 3117 + }, + { + "epoch": 2.1096075778078482, + "grad_norm": 1.0543491430785725, + "learning_rate": 2.4524212896808265e-06, + "loss": 0.3195, + "step": 3118 + }, + { + "epoch": 2.1102841677943167, + "grad_norm": 1.0698713036562926, + "learning_rate": 2.4490345872204403e-06, + "loss": 0.3211, + "step": 3119 + }, + { + "epoch": 2.1109607577807847, + "grad_norm": 1.0829067748911252, + "learning_rate": 2.4456494662211082e-06, + "loss": 0.3219, + "step": 3120 + }, + { + "epoch": 2.111637347767253, + "grad_norm": 1.0624932575492023, + "learning_rate": 2.442265928781426e-06, + "loss": 0.3208, + "step": 3121 + }, + { + "epoch": 2.1123139377537212, + "grad_norm": 1.0750870009935214, + "learning_rate": 2.438883976999003e-06, + "loss": 0.333, + "step": 3122 + }, + { + "epoch": 2.1129905277401893, + "grad_norm": 1.082437516812866, + "learning_rate": 2.43550361297047e-06, + "loss": 0.325, + "step": 3123 + }, + { + "epoch": 2.1136671177266577, + "grad_norm": 1.0730007247696887, + "learning_rate": 2.4321248387914677e-06, + "loss": 0.3374, + "step": 3124 + }, + { + "epoch": 2.1143437077131257, + "grad_norm": 1.0971426609597585, + "learning_rate": 2.4287476565566525e-06, + "loss": 0.319, + "step": 3125 + }, + { + "epoch": 2.115020297699594, + "grad_norm": 1.0933027030657891, + "learning_rate": 2.4253720683596976e-06, + "loss": 0.3204, + "step": 3126 + }, + { + "epoch": 2.1156968876860622, + "grad_norm": 1.092757872470542, + "learning_rate": 2.421998076293285e-06, + "loss": 0.3206, + "step": 3127 + }, + { + "epoch": 2.1163734776725303, + "grad_norm": 1.027898455561416, + "learning_rate": 2.4186256824491106e-06, + "loss": 0.3283, + "step": 3128 + }, + { + "epoch": 2.1170500676589987, + "grad_norm": 1.0309936626773026, + "learning_rate": 2.4152548889178722e-06, + "loss": 0.3196, + "step": 3129 + }, + { + "epoch": 2.1177266576454667, + "grad_norm": 1.035713973438815, + "learning_rate": 2.4118856977892846e-06, + "loss": 0.308, + "step": 3130 + }, + { + "epoch": 2.118403247631935, + "grad_norm": 1.0300287259100047, + "learning_rate": 2.4085181111520607e-06, + "loss": 0.3093, + "step": 3131 + }, + { + "epoch": 2.1190798376184032, + "grad_norm": 1.0390574413392113, + "learning_rate": 2.4051521310939258e-06, + "loss": 0.307, + "step": 3132 + }, + { + "epoch": 2.1197564276048713, + "grad_norm": 1.1116496454790783, + "learning_rate": 2.401787759701603e-06, + "loss": 0.3269, + "step": 3133 + }, + { + "epoch": 2.1204330175913397, + "grad_norm": 1.0872385577378534, + "learning_rate": 2.3984249990608237e-06, + "loss": 0.3262, + "step": 3134 + }, + { + "epoch": 2.1211096075778078, + "grad_norm": 1.0969175153615704, + "learning_rate": 2.3950638512563173e-06, + "loss": 0.3279, + "step": 3135 + }, + { + "epoch": 2.121786197564276, + "grad_norm": 1.0932050847858432, + "learning_rate": 2.3917043183718162e-06, + "loss": 0.3154, + "step": 3136 + }, + { + "epoch": 2.1224627875507442, + "grad_norm": 1.103043564866908, + "learning_rate": 2.3883464024900484e-06, + "loss": 0.3292, + "step": 3137 + }, + { + "epoch": 2.1231393775372123, + "grad_norm": 1.1119240065901193, + "learning_rate": 2.3849901056927383e-06, + "loss": 0.3213, + "step": 3138 + }, + { + "epoch": 2.1238159675236807, + "grad_norm": 1.0420421027785127, + "learning_rate": 2.381635430060611e-06, + "loss": 0.3075, + "step": 3139 + }, + { + "epoch": 2.1244925575101488, + "grad_norm": 1.0646788200773543, + "learning_rate": 2.3782823776733866e-06, + "loss": 0.3227, + "step": 3140 + }, + { + "epoch": 2.1251691474966172, + "grad_norm": 1.056405175632645, + "learning_rate": 2.374930950609773e-06, + "loss": 0.3161, + "step": 3141 + }, + { + "epoch": 2.1258457374830853, + "grad_norm": 1.0414292704194887, + "learning_rate": 2.371581150947476e-06, + "loss": 0.3174, + "step": 3142 + }, + { + "epoch": 2.1265223274695533, + "grad_norm": 1.1142086471166972, + "learning_rate": 2.368232980763194e-06, + "loss": 0.3307, + "step": 3143 + }, + { + "epoch": 2.1271989174560217, + "grad_norm": 1.0212596172221553, + "learning_rate": 2.364886442132606e-06, + "loss": 0.3211, + "step": 3144 + }, + { + "epoch": 2.1278755074424898, + "grad_norm": 1.1024813366872834, + "learning_rate": 2.361541537130392e-06, + "loss": 0.3304, + "step": 3145 + }, + { + "epoch": 2.1285520974289582, + "grad_norm": 1.0885345365994723, + "learning_rate": 2.358198267830206e-06, + "loss": 0.3317, + "step": 3146 + }, + { + "epoch": 2.1292286874154263, + "grad_norm": 1.0637804849114936, + "learning_rate": 2.3548566363046993e-06, + "loss": 0.3187, + "step": 3147 + }, + { + "epoch": 2.1299052774018943, + "grad_norm": 1.056700279491071, + "learning_rate": 2.351516644625502e-06, + "loss": 0.3236, + "step": 3148 + }, + { + "epoch": 2.1305818673883627, + "grad_norm": 1.0990143212098609, + "learning_rate": 2.3481782948632317e-06, + "loss": 0.3177, + "step": 3149 + }, + { + "epoch": 2.1312584573748308, + "grad_norm": 1.046723744246787, + "learning_rate": 2.344841589087482e-06, + "loss": 0.3091, + "step": 3150 + }, + { + "epoch": 2.1319350473612992, + "grad_norm": 1.0384970244552318, + "learning_rate": 2.34150652936683e-06, + "loss": 0.3044, + "step": 3151 + }, + { + "epoch": 2.1326116373477673, + "grad_norm": 1.110070757716026, + "learning_rate": 2.3381731177688346e-06, + "loss": 0.3253, + "step": 3152 + }, + { + "epoch": 2.1332882273342353, + "grad_norm": 1.0876101006040824, + "learning_rate": 2.3348413563600324e-06, + "loss": 0.3246, + "step": 3153 + }, + { + "epoch": 2.1339648173207038, + "grad_norm": 1.118999567282542, + "learning_rate": 2.331511247205933e-06, + "loss": 0.3267, + "step": 3154 + }, + { + "epoch": 2.134641407307172, + "grad_norm": 1.0663102742420723, + "learning_rate": 2.3281827923710265e-06, + "loss": 0.3203, + "step": 3155 + }, + { + "epoch": 2.1353179972936402, + "grad_norm": 1.065598797598251, + "learning_rate": 2.324855993918775e-06, + "loss": 0.327, + "step": 3156 + }, + { + "epoch": 2.1359945872801083, + "grad_norm": 1.0782751262598218, + "learning_rate": 2.321530853911616e-06, + "loss": 0.3328, + "step": 3157 + }, + { + "epoch": 2.1366711772665763, + "grad_norm": 1.071017983244043, + "learning_rate": 2.318207374410956e-06, + "loss": 0.3194, + "step": 3158 + }, + { + "epoch": 2.1373477672530448, + "grad_norm": 1.084835448741031, + "learning_rate": 2.3148855574771706e-06, + "loss": 0.3166, + "step": 3159 + }, + { + "epoch": 2.138024357239513, + "grad_norm": 1.0504154783898043, + "learning_rate": 2.3115654051696097e-06, + "loss": 0.3186, + "step": 3160 + }, + { + "epoch": 2.1387009472259813, + "grad_norm": 1.0803525128154143, + "learning_rate": 2.3082469195465893e-06, + "loss": 0.3214, + "step": 3161 + }, + { + "epoch": 2.1393775372124493, + "grad_norm": 1.1031238529251692, + "learning_rate": 2.304930102665389e-06, + "loss": 0.316, + "step": 3162 + }, + { + "epoch": 2.1400541271989173, + "grad_norm": 1.0611509470802538, + "learning_rate": 2.3016149565822608e-06, + "loss": 0.3191, + "step": 3163 + }, + { + "epoch": 2.1407307171853858, + "grad_norm": 1.1840898020737045, + "learning_rate": 2.2983014833524115e-06, + "loss": 0.3436, + "step": 3164 + }, + { + "epoch": 2.141407307171854, + "grad_norm": 1.100638275146891, + "learning_rate": 2.2949896850300186e-06, + "loss": 0.3245, + "step": 3165 + }, + { + "epoch": 2.1420838971583223, + "grad_norm": 1.0668940636710917, + "learning_rate": 2.2916795636682197e-06, + "loss": 0.3131, + "step": 3166 + }, + { + "epoch": 2.1427604871447903, + "grad_norm": 1.0747765392926185, + "learning_rate": 2.288371121319109e-06, + "loss": 0.3116, + "step": 3167 + }, + { + "epoch": 2.1434370771312583, + "grad_norm": 1.063660703391759, + "learning_rate": 2.2850643600337435e-06, + "loss": 0.3123, + "step": 3168 + }, + { + "epoch": 2.1441136671177268, + "grad_norm": 1.1520263523555183, + "learning_rate": 2.281759281862137e-06, + "loss": 0.3456, + "step": 3169 + }, + { + "epoch": 2.144790257104195, + "grad_norm": 1.0397793529719004, + "learning_rate": 2.278455888853262e-06, + "loss": 0.3139, + "step": 3170 + }, + { + "epoch": 2.1454668470906633, + "grad_norm": 1.1764636653039535, + "learning_rate": 2.2751541830550417e-06, + "loss": 0.3477, + "step": 3171 + }, + { + "epoch": 2.1461434370771313, + "grad_norm": 1.096044087493195, + "learning_rate": 2.2718541665143546e-06, + "loss": 0.3176, + "step": 3172 + }, + { + "epoch": 2.1468200270635993, + "grad_norm": 1.0971650520919818, + "learning_rate": 2.2685558412770344e-06, + "loss": 0.3204, + "step": 3173 + }, + { + "epoch": 2.147496617050068, + "grad_norm": 1.038875194934764, + "learning_rate": 2.265259209387867e-06, + "loss": 0.3108, + "step": 3174 + }, + { + "epoch": 2.148173207036536, + "grad_norm": 1.0673465086445297, + "learning_rate": 2.261964272890582e-06, + "loss": 0.3119, + "step": 3175 + }, + { + "epoch": 2.1488497970230043, + "grad_norm": 1.084616288360263, + "learning_rate": 2.258671033827866e-06, + "loss": 0.3166, + "step": 3176 + }, + { + "epoch": 2.1495263870094723, + "grad_norm": 1.061414923306606, + "learning_rate": 2.2553794942413506e-06, + "loss": 0.3116, + "step": 3177 + }, + { + "epoch": 2.1502029769959403, + "grad_norm": 1.105324830905539, + "learning_rate": 2.2520896561716086e-06, + "loss": 0.3187, + "step": 3178 + }, + { + "epoch": 2.150879566982409, + "grad_norm": 1.0254265156658997, + "learning_rate": 2.248801521658167e-06, + "loss": 0.303, + "step": 3179 + }, + { + "epoch": 2.151556156968877, + "grad_norm": 1.1040647514313622, + "learning_rate": 2.245515092739488e-06, + "loss": 0.3246, + "step": 3180 + }, + { + "epoch": 2.1522327469553453, + "grad_norm": 1.1083311557638789, + "learning_rate": 2.242230371452982e-06, + "loss": 0.3169, + "step": 3181 + }, + { + "epoch": 2.1529093369418133, + "grad_norm": 1.076610648441405, + "learning_rate": 2.2389473598349994e-06, + "loss": 0.3179, + "step": 3182 + }, + { + "epoch": 2.1535859269282813, + "grad_norm": 1.048408774628365, + "learning_rate": 2.2356660599208335e-06, + "loss": 0.3018, + "step": 3183 + }, + { + "epoch": 2.15426251691475, + "grad_norm": 1.1348917295010865, + "learning_rate": 2.2323864737447067e-06, + "loss": 0.3282, + "step": 3184 + }, + { + "epoch": 2.154939106901218, + "grad_norm": 1.1325651005433037, + "learning_rate": 2.229108603339789e-06, + "loss": 0.3279, + "step": 3185 + }, + { + "epoch": 2.1556156968876863, + "grad_norm": 1.0881271183176577, + "learning_rate": 2.2258324507381834e-06, + "loss": 0.3198, + "step": 3186 + }, + { + "epoch": 2.1562922868741543, + "grad_norm": 1.09201320779718, + "learning_rate": 2.2225580179709303e-06, + "loss": 0.3227, + "step": 3187 + }, + { + "epoch": 2.1569688768606223, + "grad_norm": 1.0944716419384257, + "learning_rate": 2.219285307067997e-06, + "loss": 0.3277, + "step": 3188 + }, + { + "epoch": 2.157645466847091, + "grad_norm": 1.0874606281070929, + "learning_rate": 2.2160143200582906e-06, + "loss": 0.3159, + "step": 3189 + }, + { + "epoch": 2.158322056833559, + "grad_norm": 1.1209681196560373, + "learning_rate": 2.2127450589696475e-06, + "loss": 0.3168, + "step": 3190 + }, + { + "epoch": 2.1589986468200273, + "grad_norm": 1.098721684394217, + "learning_rate": 2.209477525828831e-06, + "loss": 0.3255, + "step": 3191 + }, + { + "epoch": 2.1596752368064953, + "grad_norm": 1.1409105232572194, + "learning_rate": 2.2062117226615375e-06, + "loss": 0.3271, + "step": 3192 + }, + { + "epoch": 2.1603518267929633, + "grad_norm": 1.0909547430292323, + "learning_rate": 2.202947651492387e-06, + "loss": 0.3185, + "step": 3193 + }, + { + "epoch": 2.161028416779432, + "grad_norm": 1.026728556296274, + "learning_rate": 2.1996853143449285e-06, + "loss": 0.3061, + "step": 3194 + }, + { + "epoch": 2.1617050067659, + "grad_norm": 1.0788020345952956, + "learning_rate": 2.1964247132416373e-06, + "loss": 0.3146, + "step": 3195 + }, + { + "epoch": 2.1623815967523683, + "grad_norm": 1.0693341232447195, + "learning_rate": 2.1931658502039067e-06, + "loss": 0.3279, + "step": 3196 + }, + { + "epoch": 2.1630581867388363, + "grad_norm": 1.1088375426769643, + "learning_rate": 2.1899087272520596e-06, + "loss": 0.3271, + "step": 3197 + }, + { + "epoch": 2.1637347767253043, + "grad_norm": 1.102380907668985, + "learning_rate": 2.186653346405333e-06, + "loss": 0.3209, + "step": 3198 + }, + { + "epoch": 2.164411366711773, + "grad_norm": 1.0593601678630866, + "learning_rate": 2.1833997096818897e-06, + "loss": 0.316, + "step": 3199 + }, + { + "epoch": 2.165087956698241, + "grad_norm": 1.1126938288072634, + "learning_rate": 2.1801478190988107e-06, + "loss": 0.3289, + "step": 3200 + }, + { + "epoch": 2.1657645466847093, + "grad_norm": 1.0980632775517076, + "learning_rate": 2.1768976766720896e-06, + "loss": 0.3212, + "step": 3201 + }, + { + "epoch": 2.1664411366711773, + "grad_norm": 1.0952565927822706, + "learning_rate": 2.1736492844166406e-06, + "loss": 0.3109, + "step": 3202 + }, + { + "epoch": 2.1671177266576453, + "grad_norm": 1.0878279545464584, + "learning_rate": 2.170402644346294e-06, + "loss": 0.3221, + "step": 3203 + }, + { + "epoch": 2.167794316644114, + "grad_norm": 1.173408318144088, + "learning_rate": 2.16715775847379e-06, + "loss": 0.3411, + "step": 3204 + }, + { + "epoch": 2.168470906630582, + "grad_norm": 1.068753888253633, + "learning_rate": 2.163914628810781e-06, + "loss": 0.3158, + "step": 3205 + }, + { + "epoch": 2.16914749661705, + "grad_norm": 1.1375964881134428, + "learning_rate": 2.1606732573678344e-06, + "loss": 0.3256, + "step": 3206 + }, + { + "epoch": 2.1698240866035183, + "grad_norm": 1.054216379354541, + "learning_rate": 2.157433646154426e-06, + "loss": 0.3102, + "step": 3207 + }, + { + "epoch": 2.1705006765899864, + "grad_norm": 1.0995668409591008, + "learning_rate": 2.154195797178941e-06, + "loss": 0.318, + "step": 3208 + }, + { + "epoch": 2.171177266576455, + "grad_norm": 1.0364530645555354, + "learning_rate": 2.1509597124486693e-06, + "loss": 0.308, + "step": 3209 + }, + { + "epoch": 2.171853856562923, + "grad_norm": 1.0969483375908629, + "learning_rate": 2.147725393969811e-06, + "loss": 0.3231, + "step": 3210 + }, + { + "epoch": 2.172530446549391, + "grad_norm": 1.0786800183544625, + "learning_rate": 2.1444928437474667e-06, + "loss": 0.3167, + "step": 3211 + }, + { + "epoch": 2.1732070365358593, + "grad_norm": 1.0898201172124493, + "learning_rate": 2.1412620637856445e-06, + "loss": 0.3105, + "step": 3212 + }, + { + "epoch": 2.1738836265223274, + "grad_norm": 1.0684289989990967, + "learning_rate": 2.138033056087256e-06, + "loss": 0.319, + "step": 3213 + }, + { + "epoch": 2.174560216508796, + "grad_norm": 1.0753064396583398, + "learning_rate": 2.1348058226541072e-06, + "loss": 0.3133, + "step": 3214 + }, + { + "epoch": 2.175236806495264, + "grad_norm": 1.1097920340549594, + "learning_rate": 2.1315803654869125e-06, + "loss": 0.3231, + "step": 3215 + }, + { + "epoch": 2.175913396481732, + "grad_norm": 1.1131696393830561, + "learning_rate": 2.1283566865852824e-06, + "loss": 0.329, + "step": 3216 + }, + { + "epoch": 2.1765899864682003, + "grad_norm": 1.0978277518042483, + "learning_rate": 2.1251347879477217e-06, + "loss": 0.329, + "step": 3217 + }, + { + "epoch": 2.1772665764546684, + "grad_norm": 1.08525798764089, + "learning_rate": 2.1219146715716332e-06, + "loss": 0.3227, + "step": 3218 + }, + { + "epoch": 2.177943166441137, + "grad_norm": 1.0584478988081187, + "learning_rate": 2.1186963394533165e-06, + "loss": 0.3088, + "step": 3219 + }, + { + "epoch": 2.178619756427605, + "grad_norm": 1.1183388150044842, + "learning_rate": 2.1154797935879647e-06, + "loss": 0.3311, + "step": 3220 + }, + { + "epoch": 2.179296346414073, + "grad_norm": 1.0851888500390343, + "learning_rate": 2.112265035969664e-06, + "loss": 0.3211, + "step": 3221 + }, + { + "epoch": 2.1799729364005414, + "grad_norm": 1.1071261649066217, + "learning_rate": 2.1090520685913874e-06, + "loss": 0.3198, + "step": 3222 + }, + { + "epoch": 2.1806495263870094, + "grad_norm": 1.0996624108543835, + "learning_rate": 2.1058408934450055e-06, + "loss": 0.324, + "step": 3223 + }, + { + "epoch": 2.181326116373478, + "grad_norm": 1.1979147047573786, + "learning_rate": 2.102631512521269e-06, + "loss": 0.3285, + "step": 3224 + }, + { + "epoch": 2.182002706359946, + "grad_norm": 1.0688814010406351, + "learning_rate": 2.099423927809826e-06, + "loss": 0.3093, + "step": 3225 + }, + { + "epoch": 2.182679296346414, + "grad_norm": 1.1007338667357167, + "learning_rate": 2.096218141299203e-06, + "loss": 0.3132, + "step": 3226 + }, + { + "epoch": 2.1833558863328824, + "grad_norm": 1.03037091500764, + "learning_rate": 2.0930141549768145e-06, + "loss": 0.3025, + "step": 3227 + }, + { + "epoch": 2.1840324763193504, + "grad_norm": 1.0833469799514677, + "learning_rate": 2.089811970828961e-06, + "loss": 0.3168, + "step": 3228 + }, + { + "epoch": 2.184709066305819, + "grad_norm": 1.1265826881938268, + "learning_rate": 2.086611590840826e-06, + "loss": 0.3189, + "step": 3229 + }, + { + "epoch": 2.185385656292287, + "grad_norm": 1.0948813622373013, + "learning_rate": 2.0834130169964695e-06, + "loss": 0.3181, + "step": 3230 + }, + { + "epoch": 2.186062246278755, + "grad_norm": 1.0918852153574339, + "learning_rate": 2.0802162512788337e-06, + "loss": 0.3211, + "step": 3231 + }, + { + "epoch": 2.1867388362652234, + "grad_norm": 1.09555208352549, + "learning_rate": 2.0770212956697435e-06, + "loss": 0.3291, + "step": 3232 + }, + { + "epoch": 2.1874154262516914, + "grad_norm": 1.0875727207058634, + "learning_rate": 2.073828152149898e-06, + "loss": 0.3328, + "step": 3233 + }, + { + "epoch": 2.18809201623816, + "grad_norm": 1.1287418139427734, + "learning_rate": 2.0706368226988772e-06, + "loss": 0.327, + "step": 3234 + }, + { + "epoch": 2.188768606224628, + "grad_norm": 1.1205981844748083, + "learning_rate": 2.0674473092951286e-06, + "loss": 0.3139, + "step": 3235 + }, + { + "epoch": 2.189445196211096, + "grad_norm": 1.1117176246248053, + "learning_rate": 2.064259613915981e-06, + "loss": 0.3301, + "step": 3236 + }, + { + "epoch": 2.1901217861975644, + "grad_norm": 1.0804406611551591, + "learning_rate": 2.061073738537635e-06, + "loss": 0.3145, + "step": 3237 + }, + { + "epoch": 2.1907983761840324, + "grad_norm": 1.1115435682461963, + "learning_rate": 2.0578896851351606e-06, + "loss": 0.3223, + "step": 3238 + }, + { + "epoch": 2.191474966170501, + "grad_norm": 1.152146545504736, + "learning_rate": 2.0547074556824964e-06, + "loss": 0.3362, + "step": 3239 + }, + { + "epoch": 2.192151556156969, + "grad_norm": 1.1197461664326775, + "learning_rate": 2.0515270521524562e-06, + "loss": 0.317, + "step": 3240 + }, + { + "epoch": 2.192828146143437, + "grad_norm": 1.1004599108012882, + "learning_rate": 2.0483484765167172e-06, + "loss": 0.3128, + "step": 3241 + }, + { + "epoch": 2.1935047361299054, + "grad_norm": 1.0878516434129133, + "learning_rate": 2.0451717307458287e-06, + "loss": 0.3088, + "step": 3242 + }, + { + "epoch": 2.1941813261163734, + "grad_norm": 1.0415435208065689, + "learning_rate": 2.041996816809197e-06, + "loss": 0.2941, + "step": 3243 + }, + { + "epoch": 2.194857916102842, + "grad_norm": 1.142413372852838, + "learning_rate": 2.0388237366751005e-06, + "loss": 0.3242, + "step": 3244 + }, + { + "epoch": 2.19553450608931, + "grad_norm": 1.1143746222524338, + "learning_rate": 2.0356524923106763e-06, + "loss": 0.3205, + "step": 3245 + }, + { + "epoch": 2.196211096075778, + "grad_norm": 1.1433323908983573, + "learning_rate": 2.032483085681927e-06, + "loss": 0.3283, + "step": 3246 + }, + { + "epoch": 2.1968876860622464, + "grad_norm": 1.1046803882827345, + "learning_rate": 2.029315518753711e-06, + "loss": 0.3182, + "step": 3247 + }, + { + "epoch": 2.1975642760487144, + "grad_norm": 1.0799027923360895, + "learning_rate": 2.0261497934897507e-06, + "loss": 0.3165, + "step": 3248 + }, + { + "epoch": 2.198240866035183, + "grad_norm": 1.0719060596376357, + "learning_rate": 2.0229859118526244e-06, + "loss": 0.3104, + "step": 3249 + }, + { + "epoch": 2.198917456021651, + "grad_norm": 1.0967964847012261, + "learning_rate": 2.019823875803771e-06, + "loss": 0.3195, + "step": 3250 + }, + { + "epoch": 2.199594046008119, + "grad_norm": 1.0935366419862107, + "learning_rate": 2.0166636873034807e-06, + "loss": 0.3127, + "step": 3251 + }, + { + "epoch": 2.2002706359945874, + "grad_norm": 1.0607589844761274, + "learning_rate": 2.0135053483108973e-06, + "loss": 0.3138, + "step": 3252 + }, + { + "epoch": 2.2009472259810554, + "grad_norm": 1.0647223788232456, + "learning_rate": 2.0103488607840233e-06, + "loss": 0.3202, + "step": 3253 + }, + { + "epoch": 2.201623815967524, + "grad_norm": 1.089194003129358, + "learning_rate": 2.00719422667971e-06, + "loss": 0.3161, + "step": 3254 + }, + { + "epoch": 2.202300405953992, + "grad_norm": 1.1041258251832875, + "learning_rate": 2.004041447953663e-06, + "loss": 0.3181, + "step": 3255 + }, + { + "epoch": 2.20297699594046, + "grad_norm": 1.0599321669780764, + "learning_rate": 2.0008905265604316e-06, + "loss": 0.3136, + "step": 3256 + }, + { + "epoch": 2.2036535859269284, + "grad_norm": 1.0851323571905946, + "learning_rate": 1.9977414644534206e-06, + "loss": 0.3267, + "step": 3257 + }, + { + "epoch": 2.2043301759133964, + "grad_norm": 1.0985336027662924, + "learning_rate": 1.9945942635848745e-06, + "loss": 0.314, + "step": 3258 + }, + { + "epoch": 2.205006765899865, + "grad_norm": 1.085049523892315, + "learning_rate": 1.9914489259058933e-06, + "loss": 0.3175, + "step": 3259 + }, + { + "epoch": 2.205683355886333, + "grad_norm": 1.0936457064241567, + "learning_rate": 1.9883054533664128e-06, + "loss": 0.3112, + "step": 3260 + }, + { + "epoch": 2.206359945872801, + "grad_norm": 1.137587536853472, + "learning_rate": 1.985163847915217e-06, + "loss": 0.3261, + "step": 3261 + }, + { + "epoch": 2.2070365358592694, + "grad_norm": 1.1502869126601807, + "learning_rate": 1.9820241114999334e-06, + "loss": 0.3335, + "step": 3262 + }, + { + "epoch": 2.2077131258457374, + "grad_norm": 1.084424883963079, + "learning_rate": 1.9788862460670305e-06, + "loss": 0.3133, + "step": 3263 + }, + { + "epoch": 2.208389715832206, + "grad_norm": 1.0941965681535513, + "learning_rate": 1.9757502535618137e-06, + "loss": 0.322, + "step": 3264 + }, + { + "epoch": 2.209066305818674, + "grad_norm": 1.1036070624995395, + "learning_rate": 1.9726161359284283e-06, + "loss": 0.3169, + "step": 3265 + }, + { + "epoch": 2.209742895805142, + "grad_norm": 1.1342514431296868, + "learning_rate": 1.96948389510986e-06, + "loss": 0.3324, + "step": 3266 + }, + { + "epoch": 2.2104194857916104, + "grad_norm": 1.1379243987804681, + "learning_rate": 1.9663535330479305e-06, + "loss": 0.3246, + "step": 3267 + }, + { + "epoch": 2.2110960757780784, + "grad_norm": 1.08204091272448, + "learning_rate": 1.963225051683292e-06, + "loss": 0.3155, + "step": 3268 + }, + { + "epoch": 2.2117726657645465, + "grad_norm": 1.0590795572638303, + "learning_rate": 1.9600984529554366e-06, + "loss": 0.3157, + "step": 3269 + }, + { + "epoch": 2.212449255751015, + "grad_norm": 1.1260939943685944, + "learning_rate": 1.956973738802689e-06, + "loss": 0.3262, + "step": 3270 + }, + { + "epoch": 2.213125845737483, + "grad_norm": 1.0964961612693531, + "learning_rate": 1.953850911162199e-06, + "loss": 0.3111, + "step": 3271 + }, + { + "epoch": 2.2138024357239514, + "grad_norm": 1.084500344709526, + "learning_rate": 1.950729971969955e-06, + "loss": 0.3114, + "step": 3272 + }, + { + "epoch": 2.2144790257104194, + "grad_norm": 1.0741713297188433, + "learning_rate": 1.9476109231607687e-06, + "loss": 0.3211, + "step": 3273 + }, + { + "epoch": 2.2151556156968875, + "grad_norm": 1.1128583013601934, + "learning_rate": 1.9444937666682834e-06, + "loss": 0.3282, + "step": 3274 + }, + { + "epoch": 2.215832205683356, + "grad_norm": 1.0783496600813218, + "learning_rate": 1.941378504424968e-06, + "loss": 0.3188, + "step": 3275 + }, + { + "epoch": 2.216508795669824, + "grad_norm": 1.142615372525608, + "learning_rate": 1.938265138362118e-06, + "loss": 0.3415, + "step": 3276 + }, + { + "epoch": 2.2171853856562924, + "grad_norm": 1.1176536155289774, + "learning_rate": 1.935153670409853e-06, + "loss": 0.3242, + "step": 3277 + }, + { + "epoch": 2.2178619756427604, + "grad_norm": 1.1162769030229303, + "learning_rate": 1.9320441024971113e-06, + "loss": 0.3249, + "step": 3278 + }, + { + "epoch": 2.2185385656292285, + "grad_norm": 1.1016011369705836, + "learning_rate": 1.928936436551661e-06, + "loss": 0.3244, + "step": 3279 + }, + { + "epoch": 2.219215155615697, + "grad_norm": 1.12092491179393, + "learning_rate": 1.925830674500088e-06, + "loss": 0.3326, + "step": 3280 + }, + { + "epoch": 2.219891745602165, + "grad_norm": 1.0612791964845465, + "learning_rate": 1.922726818267795e-06, + "loss": 0.3052, + "step": 3281 + }, + { + "epoch": 2.2205683355886334, + "grad_norm": 1.1058906838071672, + "learning_rate": 1.9196248697790066e-06, + "loss": 0.3153, + "step": 3282 + }, + { + "epoch": 2.2212449255751014, + "grad_norm": 1.0743562816872771, + "learning_rate": 1.916524830956763e-06, + "loss": 0.3144, + "step": 3283 + }, + { + "epoch": 2.2219215155615695, + "grad_norm": 1.0751463117493012, + "learning_rate": 1.913426703722924e-06, + "loss": 0.3158, + "step": 3284 + }, + { + "epoch": 2.222598105548038, + "grad_norm": 1.1144507027554167, + "learning_rate": 1.9103304899981603e-06, + "loss": 0.3091, + "step": 3285 + }, + { + "epoch": 2.223274695534506, + "grad_norm": 1.0893710367294782, + "learning_rate": 1.9072361917019538e-06, + "loss": 0.3055, + "step": 3286 + }, + { + "epoch": 2.2239512855209744, + "grad_norm": 1.077157428644335, + "learning_rate": 1.9041438107526055e-06, + "loss": 0.3162, + "step": 3287 + }, + { + "epoch": 2.2246278755074425, + "grad_norm": 1.0969452093928271, + "learning_rate": 1.901053349067225e-06, + "loss": 0.3106, + "step": 3288 + }, + { + "epoch": 2.2253044654939105, + "grad_norm": 1.0875299012338826, + "learning_rate": 1.8979648085617342e-06, + "loss": 0.3163, + "step": 3289 + }, + { + "epoch": 2.225981055480379, + "grad_norm": 1.0762528308437525, + "learning_rate": 1.894878191150859e-06, + "loss": 0.3148, + "step": 3290 + }, + { + "epoch": 2.226657645466847, + "grad_norm": 1.1518906662810515, + "learning_rate": 1.891793498748134e-06, + "loss": 0.3331, + "step": 3291 + }, + { + "epoch": 2.2273342354533154, + "grad_norm": 1.1317721373872631, + "learning_rate": 1.888710733265905e-06, + "loss": 0.3285, + "step": 3292 + }, + { + "epoch": 2.2280108254397835, + "grad_norm": 1.0903729072189, + "learning_rate": 1.8856298966153214e-06, + "loss": 0.3116, + "step": 3293 + }, + { + "epoch": 2.2286874154262515, + "grad_norm": 1.0947288957335477, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.3233, + "step": 3294 + }, + { + "epoch": 2.22936400541272, + "grad_norm": 1.0993037404763306, + "learning_rate": 1.8794740174476966e-06, + "loss": 0.316, + "step": 3295 + }, + { + "epoch": 2.230040595399188, + "grad_norm": 1.108437016598285, + "learning_rate": 1.87639897874697e-06, + "loss": 0.3184, + "step": 3296 + }, + { + "epoch": 2.2307171853856564, + "grad_norm": 1.114164944749697, + "learning_rate": 1.8733258765105129e-06, + "loss": 0.3203, + "step": 3297 + }, + { + "epoch": 2.2313937753721245, + "grad_norm": 1.1083368432880243, + "learning_rate": 1.8702547126434818e-06, + "loss": 0.3183, + "step": 3298 + }, + { + "epoch": 2.2320703653585925, + "grad_norm": 1.1175177439313144, + "learning_rate": 1.8671854890498308e-06, + "loss": 0.3245, + "step": 3299 + }, + { + "epoch": 2.232746955345061, + "grad_norm": 1.1254683235367513, + "learning_rate": 1.864118207632315e-06, + "loss": 0.32, + "step": 3300 + }, + { + "epoch": 2.233423545331529, + "grad_norm": 1.0877107601148008, + "learning_rate": 1.8610528702924851e-06, + "loss": 0.3154, + "step": 3301 + }, + { + "epoch": 2.2341001353179974, + "grad_norm": 1.1089743598037287, + "learning_rate": 1.8579894789306813e-06, + "loss": 0.3241, + "step": 3302 + }, + { + "epoch": 2.2347767253044655, + "grad_norm": 1.0782980078206152, + "learning_rate": 1.8549280354460437e-06, + "loss": 0.3098, + "step": 3303 + }, + { + "epoch": 2.2354533152909335, + "grad_norm": 1.0864053175720063, + "learning_rate": 1.851868541736503e-06, + "loss": 0.31, + "step": 3304 + }, + { + "epoch": 2.236129905277402, + "grad_norm": 1.1387094568267146, + "learning_rate": 1.8488109996987774e-06, + "loss": 0.3072, + "step": 3305 + }, + { + "epoch": 2.23680649526387, + "grad_norm": 1.0956085211301947, + "learning_rate": 1.845755411228382e-06, + "loss": 0.3101, + "step": 3306 + }, + { + "epoch": 2.2374830852503385, + "grad_norm": 1.1485300394760511, + "learning_rate": 1.8427017782196126e-06, + "loss": 0.3279, + "step": 3307 + }, + { + "epoch": 2.2381596752368065, + "grad_norm": 1.1474997281514587, + "learning_rate": 1.8396501025655594e-06, + "loss": 0.3234, + "step": 3308 + }, + { + "epoch": 2.2388362652232745, + "grad_norm": 1.0913245328103054, + "learning_rate": 1.8366003861580966e-06, + "loss": 0.3169, + "step": 3309 + }, + { + "epoch": 2.239512855209743, + "grad_norm": 1.123623576016734, + "learning_rate": 1.8335526308878877e-06, + "loss": 0.3242, + "step": 3310 + }, + { + "epoch": 2.240189445196211, + "grad_norm": 1.116077851758487, + "learning_rate": 1.8305068386443696e-06, + "loss": 0.3152, + "step": 3311 + }, + { + "epoch": 2.2408660351826795, + "grad_norm": 1.060396643319164, + "learning_rate": 1.8274630113157727e-06, + "loss": 0.3067, + "step": 3312 + }, + { + "epoch": 2.2415426251691475, + "grad_norm": 1.127108287277122, + "learning_rate": 1.8244211507891064e-06, + "loss": 0.328, + "step": 3313 + }, + { + "epoch": 2.2422192151556155, + "grad_norm": 1.0690531696713788, + "learning_rate": 1.8213812589501611e-06, + "loss": 0.307, + "step": 3314 + }, + { + "epoch": 2.242895805142084, + "grad_norm": 1.122336479062686, + "learning_rate": 1.818343337683503e-06, + "loss": 0.3227, + "step": 3315 + }, + { + "epoch": 2.243572395128552, + "grad_norm": 1.1359795459881408, + "learning_rate": 1.815307388872481e-06, + "loss": 0.3262, + "step": 3316 + }, + { + "epoch": 2.2442489851150205, + "grad_norm": 1.0622564695149823, + "learning_rate": 1.8122734143992216e-06, + "loss": 0.3005, + "step": 3317 + }, + { + "epoch": 2.2449255751014885, + "grad_norm": 1.0814269861374335, + "learning_rate": 1.8092414161446225e-06, + "loss": 0.3107, + "step": 3318 + }, + { + "epoch": 2.2456021650879565, + "grad_norm": 1.1459700024946626, + "learning_rate": 1.8062113959883616e-06, + "loss": 0.3297, + "step": 3319 + }, + { + "epoch": 2.246278755074425, + "grad_norm": 1.1131065589915317, + "learning_rate": 1.8031833558088858e-06, + "loss": 0.3139, + "step": 3320 + }, + { + "epoch": 2.246955345060893, + "grad_norm": 1.1666161981810397, + "learning_rate": 1.8001572974834169e-06, + "loss": 0.3375, + "step": 3321 + }, + { + "epoch": 2.2476319350473615, + "grad_norm": 1.113297941437187, + "learning_rate": 1.7971332228879518e-06, + "loss": 0.3205, + "step": 3322 + }, + { + "epoch": 2.2483085250338295, + "grad_norm": 1.102423882613334, + "learning_rate": 1.7941111338972484e-06, + "loss": 0.3114, + "step": 3323 + }, + { + "epoch": 2.2489851150202975, + "grad_norm": 1.0811960853990636, + "learning_rate": 1.7910910323848435e-06, + "loss": 0.3027, + "step": 3324 + }, + { + "epoch": 2.249661705006766, + "grad_norm": 1.0805295002823965, + "learning_rate": 1.7880729202230334e-06, + "loss": 0.3221, + "step": 3325 + }, + { + "epoch": 2.250338294993234, + "grad_norm": 1.092699611162999, + "learning_rate": 1.7850567992828865e-06, + "loss": 0.3118, + "step": 3326 + }, + { + "epoch": 2.2510148849797025, + "grad_norm": 1.12779435947042, + "learning_rate": 1.7820426714342375e-06, + "loss": 0.3082, + "step": 3327 + }, + { + "epoch": 2.2516914749661705, + "grad_norm": 1.1207219680619251, + "learning_rate": 1.7790305385456797e-06, + "loss": 0.3339, + "step": 3328 + }, + { + "epoch": 2.2523680649526385, + "grad_norm": 1.1140847565481993, + "learning_rate": 1.7760204024845745e-06, + "loss": 0.3261, + "step": 3329 + }, + { + "epoch": 2.253044654939107, + "grad_norm": 1.116370655879667, + "learning_rate": 1.7730122651170457e-06, + "loss": 0.3204, + "step": 3330 + }, + { + "epoch": 2.253721244925575, + "grad_norm": 1.1105220215816718, + "learning_rate": 1.7700061283079744e-06, + "loss": 0.3191, + "step": 3331 + }, + { + "epoch": 2.2543978349120435, + "grad_norm": 1.1039621507071047, + "learning_rate": 1.7670019939210025e-06, + "loss": 0.3206, + "step": 3332 + }, + { + "epoch": 2.2550744248985115, + "grad_norm": 1.1903598350264117, + "learning_rate": 1.763999863818533e-06, + "loss": 0.3247, + "step": 3333 + }, + { + "epoch": 2.2557510148849795, + "grad_norm": 1.108812204563875, + "learning_rate": 1.760999739861724e-06, + "loss": 0.3186, + "step": 3334 + }, + { + "epoch": 2.256427604871448, + "grad_norm": 1.1401390865079983, + "learning_rate": 1.7580016239104924e-06, + "loss": 0.3288, + "step": 3335 + }, + { + "epoch": 2.257104194857916, + "grad_norm": 1.1144430665962406, + "learning_rate": 1.755005517823506e-06, + "loss": 0.3144, + "step": 3336 + }, + { + "epoch": 2.2577807848443845, + "grad_norm": 1.0842963872846536, + "learning_rate": 1.7520114234581914e-06, + "loss": 0.3057, + "step": 3337 + }, + { + "epoch": 2.2584573748308525, + "grad_norm": 1.1424795584299616, + "learning_rate": 1.7490193426707236e-06, + "loss": 0.3242, + "step": 3338 + }, + { + "epoch": 2.2591339648173205, + "grad_norm": 1.1355605059209823, + "learning_rate": 1.7460292773160315e-06, + "loss": 0.326, + "step": 3339 + }, + { + "epoch": 2.259810554803789, + "grad_norm": 1.1011906814131458, + "learning_rate": 1.7430412292477978e-06, + "loss": 0.312, + "step": 3340 + }, + { + "epoch": 2.260487144790257, + "grad_norm": 1.138540676717295, + "learning_rate": 1.7400552003184463e-06, + "loss": 0.3127, + "step": 3341 + }, + { + "epoch": 2.2611637347767255, + "grad_norm": 1.1270333385021083, + "learning_rate": 1.7370711923791567e-06, + "loss": 0.3122, + "step": 3342 + }, + { + "epoch": 2.2618403247631935, + "grad_norm": 1.1015075611748257, + "learning_rate": 1.7340892072798544e-06, + "loss": 0.3164, + "step": 3343 + }, + { + "epoch": 2.2625169147496615, + "grad_norm": 1.140531977166998, + "learning_rate": 1.7311092468692082e-06, + "loss": 0.3227, + "step": 3344 + }, + { + "epoch": 2.26319350473613, + "grad_norm": 1.0957238030600263, + "learning_rate": 1.7281313129946302e-06, + "loss": 0.3045, + "step": 3345 + }, + { + "epoch": 2.263870094722598, + "grad_norm": 1.104971454612028, + "learning_rate": 1.725155407502282e-06, + "loss": 0.3232, + "step": 3346 + }, + { + "epoch": 2.2645466847090665, + "grad_norm": 1.1322417425026432, + "learning_rate": 1.7221815322370633e-06, + "loss": 0.3251, + "step": 3347 + }, + { + "epoch": 2.2652232746955345, + "grad_norm": 1.1350080826911368, + "learning_rate": 1.7192096890426192e-06, + "loss": 0.3247, + "step": 3348 + }, + { + "epoch": 2.2658998646820026, + "grad_norm": 1.0942392860611383, + "learning_rate": 1.7162398797613284e-06, + "loss": 0.3075, + "step": 3349 + }, + { + "epoch": 2.266576454668471, + "grad_norm": 1.1402895028747502, + "learning_rate": 1.7132721062343156e-06, + "loss": 0.3386, + "step": 3350 + }, + { + "epoch": 2.267253044654939, + "grad_norm": 1.1146478475025061, + "learning_rate": 1.7103063703014372e-06, + "loss": 0.3134, + "step": 3351 + }, + { + "epoch": 2.2679296346414075, + "grad_norm": 1.0618294213987924, + "learning_rate": 1.7073426738012939e-06, + "loss": 0.2987, + "step": 3352 + }, + { + "epoch": 2.2686062246278755, + "grad_norm": 1.118063298485389, + "learning_rate": 1.7043810185712135e-06, + "loss": 0.3193, + "step": 3353 + }, + { + "epoch": 2.2692828146143436, + "grad_norm": 1.0807042302554075, + "learning_rate": 1.7014214064472646e-06, + "loss": 0.3087, + "step": 3354 + }, + { + "epoch": 2.269959404600812, + "grad_norm": 1.141316907501067, + "learning_rate": 1.6984638392642467e-06, + "loss": 0.3253, + "step": 3355 + }, + { + "epoch": 2.27063599458728, + "grad_norm": 1.1220599605407322, + "learning_rate": 1.6955083188556947e-06, + "loss": 0.3262, + "step": 3356 + }, + { + "epoch": 2.2713125845737485, + "grad_norm": 1.0923070298453927, + "learning_rate": 1.6925548470538695e-06, + "loss": 0.3087, + "step": 3357 + }, + { + "epoch": 2.2719891745602165, + "grad_norm": 1.1350411400521894, + "learning_rate": 1.6896034256897626e-06, + "loss": 0.3199, + "step": 3358 + }, + { + "epoch": 2.2726657645466846, + "grad_norm": 1.0965410078432, + "learning_rate": 1.686654056593099e-06, + "loss": 0.3166, + "step": 3359 + }, + { + "epoch": 2.273342354533153, + "grad_norm": 1.101257652156372, + "learning_rate": 1.683706741592327e-06, + "loss": 0.3201, + "step": 3360 + }, + { + "epoch": 2.274018944519621, + "grad_norm": 1.1066956228417268, + "learning_rate": 1.6807614825146258e-06, + "loss": 0.3144, + "step": 3361 + }, + { + "epoch": 2.2746955345060895, + "grad_norm": 1.1346421941125058, + "learning_rate": 1.6778182811858934e-06, + "loss": 0.3215, + "step": 3362 + }, + { + "epoch": 2.2753721244925575, + "grad_norm": 1.0664768712631463, + "learning_rate": 1.6748771394307584e-06, + "loss": 0.3003, + "step": 3363 + }, + { + "epoch": 2.2760487144790256, + "grad_norm": 1.058773094429667, + "learning_rate": 1.671938059072571e-06, + "loss": 0.2911, + "step": 3364 + }, + { + "epoch": 2.276725304465494, + "grad_norm": 1.1096086763501996, + "learning_rate": 1.6690010419334008e-06, + "loss": 0.3125, + "step": 3365 + }, + { + "epoch": 2.277401894451962, + "grad_norm": 1.0957051269097933, + "learning_rate": 1.6660660898340392e-06, + "loss": 0.3105, + "step": 3366 + }, + { + "epoch": 2.2780784844384305, + "grad_norm": 1.1054903398293534, + "learning_rate": 1.6631332045939996e-06, + "loss": 0.3222, + "step": 3367 + }, + { + "epoch": 2.2787550744248986, + "grad_norm": 1.13905856570075, + "learning_rate": 1.6602023880315126e-06, + "loss": 0.3189, + "step": 3368 + }, + { + "epoch": 2.2794316644113666, + "grad_norm": 1.0891361743846752, + "learning_rate": 1.6572736419635288e-06, + "loss": 0.3123, + "step": 3369 + }, + { + "epoch": 2.280108254397835, + "grad_norm": 1.1544619993742387, + "learning_rate": 1.6543469682057105e-06, + "loss": 0.3324, + "step": 3370 + }, + { + "epoch": 2.280784844384303, + "grad_norm": 1.121760153736156, + "learning_rate": 1.651422368572436e-06, + "loss": 0.3034, + "step": 3371 + }, + { + "epoch": 2.2814614343707715, + "grad_norm": 1.0785468954532595, + "learning_rate": 1.648499844876802e-06, + "loss": 0.305, + "step": 3372 + }, + { + "epoch": 2.2821380243572396, + "grad_norm": 1.1182286766661833, + "learning_rate": 1.6455793989306169e-06, + "loss": 0.3097, + "step": 3373 + }, + { + "epoch": 2.2828146143437076, + "grad_norm": 1.0734889722576963, + "learning_rate": 1.642661032544396e-06, + "loss": 0.3085, + "step": 3374 + }, + { + "epoch": 2.283491204330176, + "grad_norm": 1.108891989678797, + "learning_rate": 1.639744747527371e-06, + "loss": 0.318, + "step": 3375 + }, + { + "epoch": 2.284167794316644, + "grad_norm": 1.1205993282113098, + "learning_rate": 1.636830545687481e-06, + "loss": 0.3318, + "step": 3376 + }, + { + "epoch": 2.2848443843031125, + "grad_norm": 1.0510503977154908, + "learning_rate": 1.6339184288313769e-06, + "loss": 0.3025, + "step": 3377 + }, + { + "epoch": 2.2855209742895806, + "grad_norm": 1.1120851647899683, + "learning_rate": 1.631008398764412e-06, + "loss": 0.3144, + "step": 3378 + }, + { + "epoch": 2.2861975642760486, + "grad_norm": 1.149039813226276, + "learning_rate": 1.6281004572906462e-06, + "loss": 0.3222, + "step": 3379 + }, + { + "epoch": 2.286874154262517, + "grad_norm": 1.0715588590696454, + "learning_rate": 1.6251946062128482e-06, + "loss": 0.2923, + "step": 3380 + }, + { + "epoch": 2.287550744248985, + "grad_norm": 1.1148798759956529, + "learning_rate": 1.6222908473324889e-06, + "loss": 0.3116, + "step": 3381 + }, + { + "epoch": 2.2882273342354535, + "grad_norm": 1.1030688885896307, + "learning_rate": 1.6193891824497438e-06, + "loss": 0.3274, + "step": 3382 + }, + { + "epoch": 2.2889039242219216, + "grad_norm": 1.1351302355210737, + "learning_rate": 1.616489613363486e-06, + "loss": 0.3161, + "step": 3383 + }, + { + "epoch": 2.2895805142083896, + "grad_norm": 1.0873188603335708, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.3095, + "step": 3384 + }, + { + "epoch": 2.290257104194858, + "grad_norm": 1.099302104968843, + "learning_rate": 1.6106967697694442e-06, + "loss": 0.3178, + "step": 3385 + }, + { + "epoch": 2.290933694181326, + "grad_norm": 1.0932057979976018, + "learning_rate": 1.6078034988529112e-06, + "loss": 0.3082, + "step": 3386 + }, + { + "epoch": 2.2916102841677946, + "grad_norm": 1.1199446924872456, + "learning_rate": 1.604912330915364e-06, + "loss": 0.3204, + "step": 3387 + }, + { + "epoch": 2.2922868741542626, + "grad_norm": 1.1172734074807096, + "learning_rate": 1.6020232677491732e-06, + "loss": 0.3165, + "step": 3388 + }, + { + "epoch": 2.2929634641407306, + "grad_norm": 1.1258570181484318, + "learning_rate": 1.5991363111454023e-06, + "loss": 0.3185, + "step": 3389 + }, + { + "epoch": 2.293640054127199, + "grad_norm": 1.1459021280959485, + "learning_rate": 1.5962514628938103e-06, + "loss": 0.3202, + "step": 3390 + }, + { + "epoch": 2.294316644113667, + "grad_norm": 1.0794144926972977, + "learning_rate": 1.5933687247828462e-06, + "loss": 0.3115, + "step": 3391 + }, + { + "epoch": 2.2949932341001356, + "grad_norm": 1.1517526760721986, + "learning_rate": 1.59048809859965e-06, + "loss": 0.3306, + "step": 3392 + }, + { + "epoch": 2.2956698240866036, + "grad_norm": 1.1561322037872452, + "learning_rate": 1.5876095861300567e-06, + "loss": 0.3189, + "step": 3393 + }, + { + "epoch": 2.2963464140730716, + "grad_norm": 1.1427339573349151, + "learning_rate": 1.5847331891585888e-06, + "loss": 0.3204, + "step": 3394 + }, + { + "epoch": 2.29702300405954, + "grad_norm": 1.1220120342409396, + "learning_rate": 1.5818589094684594e-06, + "loss": 0.3173, + "step": 3395 + }, + { + "epoch": 2.297699594046008, + "grad_norm": 1.1166275446306264, + "learning_rate": 1.5789867488415633e-06, + "loss": 0.3195, + "step": 3396 + }, + { + "epoch": 2.2983761840324766, + "grad_norm": 1.127764635480915, + "learning_rate": 1.5761167090584885e-06, + "loss": 0.3104, + "step": 3397 + }, + { + "epoch": 2.2990527740189446, + "grad_norm": 1.10975894657015, + "learning_rate": 1.5732487918985017e-06, + "loss": 0.3178, + "step": 3398 + }, + { + "epoch": 2.2997293640054126, + "grad_norm": 1.0738324905777639, + "learning_rate": 1.5703829991395602e-06, + "loss": 0.3055, + "step": 3399 + }, + { + "epoch": 2.300405953991881, + "grad_norm": 1.137934750987506, + "learning_rate": 1.5675193325582983e-06, + "loss": 0.3141, + "step": 3400 + }, + { + "epoch": 2.301082543978349, + "grad_norm": 1.1329215898229208, + "learning_rate": 1.5646577939300362e-06, + "loss": 0.3121, + "step": 3401 + }, + { + "epoch": 2.301759133964817, + "grad_norm": 1.102345858270791, + "learning_rate": 1.5617983850287737e-06, + "loss": 0.3084, + "step": 3402 + }, + { + "epoch": 2.3024357239512856, + "grad_norm": 1.1021439141083313, + "learning_rate": 1.5589411076271916e-06, + "loss": 0.3081, + "step": 3403 + }, + { + "epoch": 2.3031123139377536, + "grad_norm": 1.1394970532425004, + "learning_rate": 1.5560859634966457e-06, + "loss": 0.3211, + "step": 3404 + }, + { + "epoch": 2.303788903924222, + "grad_norm": 1.129693204042089, + "learning_rate": 1.5532329544071712e-06, + "loss": 0.3194, + "step": 3405 + }, + { + "epoch": 2.30446549391069, + "grad_norm": 1.107859806206584, + "learning_rate": 1.5503820821274812e-06, + "loss": 0.3205, + "step": 3406 + }, + { + "epoch": 2.305142083897158, + "grad_norm": 1.114577905263349, + "learning_rate": 1.5475333484249633e-06, + "loss": 0.3164, + "step": 3407 + }, + { + "epoch": 2.3058186738836266, + "grad_norm": 1.169566832221299, + "learning_rate": 1.544686755065677e-06, + "loss": 0.3304, + "step": 3408 + }, + { + "epoch": 2.3064952638700946, + "grad_norm": 1.1347548120296829, + "learning_rate": 1.5418423038143576e-06, + "loss": 0.3097, + "step": 3409 + }, + { + "epoch": 2.307171853856563, + "grad_norm": 1.1445595073032628, + "learning_rate": 1.5389999964344138e-06, + "loss": 0.325, + "step": 3410 + }, + { + "epoch": 2.307848443843031, + "grad_norm": 1.1650905686187547, + "learning_rate": 1.5361598346879193e-06, + "loss": 0.3262, + "step": 3411 + }, + { + "epoch": 2.308525033829499, + "grad_norm": 1.0740847149600803, + "learning_rate": 1.5333218203356243e-06, + "loss": 0.2951, + "step": 3412 + }, + { + "epoch": 2.3092016238159676, + "grad_norm": 1.1487668739745571, + "learning_rate": 1.5304859551369417e-06, + "loss": 0.333, + "step": 3413 + }, + { + "epoch": 2.3098782138024356, + "grad_norm": 1.0967164133714282, + "learning_rate": 1.5276522408499567e-06, + "loss": 0.3073, + "step": 3414 + }, + { + "epoch": 2.310554803788904, + "grad_norm": 1.1216155389870373, + "learning_rate": 1.5248206792314197e-06, + "loss": 0.332, + "step": 3415 + }, + { + "epoch": 2.311231393775372, + "grad_norm": 1.1108707580735167, + "learning_rate": 1.5219912720367474e-06, + "loss": 0.3186, + "step": 3416 + }, + { + "epoch": 2.31190798376184, + "grad_norm": 1.128033667017073, + "learning_rate": 1.5191640210200186e-06, + "loss": 0.318, + "step": 3417 + }, + { + "epoch": 2.3125845737483086, + "grad_norm": 1.1257536737162772, + "learning_rate": 1.5163389279339746e-06, + "loss": 0.3179, + "step": 3418 + }, + { + "epoch": 2.3132611637347766, + "grad_norm": 1.1114635673002509, + "learning_rate": 1.5135159945300232e-06, + "loss": 0.3135, + "step": 3419 + }, + { + "epoch": 2.313937753721245, + "grad_norm": 1.113924567701934, + "learning_rate": 1.5106952225582312e-06, + "loss": 0.3161, + "step": 3420 + }, + { + "epoch": 2.314614343707713, + "grad_norm": 1.1127315917413891, + "learning_rate": 1.5078766137673229e-06, + "loss": 0.3275, + "step": 3421 + }, + { + "epoch": 2.315290933694181, + "grad_norm": 1.1070729147789264, + "learning_rate": 1.5050601699046852e-06, + "loss": 0.3138, + "step": 3422 + }, + { + "epoch": 2.3159675236806496, + "grad_norm": 1.1263929228177223, + "learning_rate": 1.5022458927163618e-06, + "loss": 0.3198, + "step": 3423 + }, + { + "epoch": 2.3166441136671176, + "grad_norm": 1.086673640477385, + "learning_rate": 1.499433783947054e-06, + "loss": 0.3238, + "step": 3424 + }, + { + "epoch": 2.317320703653586, + "grad_norm": 1.133152214240818, + "learning_rate": 1.4966238453401161e-06, + "loss": 0.327, + "step": 3425 + }, + { + "epoch": 2.317997293640054, + "grad_norm": 1.1056603902423796, + "learning_rate": 1.4938160786375571e-06, + "loss": 0.3035, + "step": 3426 + }, + { + "epoch": 2.318673883626522, + "grad_norm": 1.1143015228059376, + "learning_rate": 1.4910104855800429e-06, + "loss": 0.3076, + "step": 3427 + }, + { + "epoch": 2.3193504736129906, + "grad_norm": 1.1329223685252325, + "learning_rate": 1.488207067906891e-06, + "loss": 0.3231, + "step": 3428 + }, + { + "epoch": 2.3200270635994586, + "grad_norm": 1.081248642990499, + "learning_rate": 1.4854058273560667e-06, + "loss": 0.3142, + "step": 3429 + }, + { + "epoch": 2.320703653585927, + "grad_norm": 1.1368462902893035, + "learning_rate": 1.4826067656641912e-06, + "loss": 0.3229, + "step": 3430 + }, + { + "epoch": 2.321380243572395, + "grad_norm": 1.1569343820591698, + "learning_rate": 1.479809884566528e-06, + "loss": 0.3266, + "step": 3431 + }, + { + "epoch": 2.322056833558863, + "grad_norm": 1.0941622455360218, + "learning_rate": 1.477015185796995e-06, + "loss": 0.3027, + "step": 3432 + }, + { + "epoch": 2.3227334235453316, + "grad_norm": 1.077477715606638, + "learning_rate": 1.4742226710881558e-06, + "loss": 0.3058, + "step": 3433 + }, + { + "epoch": 2.3234100135317997, + "grad_norm": 1.1232028136964634, + "learning_rate": 1.4714323421712163e-06, + "loss": 0.3106, + "step": 3434 + }, + { + "epoch": 2.324086603518268, + "grad_norm": 1.1789796526479444, + "learning_rate": 1.4686442007760315e-06, + "loss": 0.3343, + "step": 3435 + }, + { + "epoch": 2.324763193504736, + "grad_norm": 1.10183200367498, + "learning_rate": 1.465858248631099e-06, + "loss": 0.3107, + "step": 3436 + }, + { + "epoch": 2.325439783491204, + "grad_norm": 1.1268963270372936, + "learning_rate": 1.4630744874635611e-06, + "loss": 0.3143, + "step": 3437 + }, + { + "epoch": 2.3261163734776726, + "grad_norm": 1.1019247955445997, + "learning_rate": 1.460292918999195e-06, + "loss": 0.3029, + "step": 3438 + }, + { + "epoch": 2.3267929634641407, + "grad_norm": 1.1103547542753434, + "learning_rate": 1.4575135449624251e-06, + "loss": 0.3126, + "step": 3439 + }, + { + "epoch": 2.3274695534506087, + "grad_norm": 1.1700727872289949, + "learning_rate": 1.4547363670763138e-06, + "loss": 0.3133, + "step": 3440 + }, + { + "epoch": 2.328146143437077, + "grad_norm": 1.102597220280328, + "learning_rate": 1.4519613870625632e-06, + "loss": 0.303, + "step": 3441 + }, + { + "epoch": 2.328822733423545, + "grad_norm": 1.0846839260550367, + "learning_rate": 1.4491886066415084e-06, + "loss": 0.3003, + "step": 3442 + }, + { + "epoch": 2.3294993234100136, + "grad_norm": 1.158268108344539, + "learning_rate": 1.4464180275321255e-06, + "loss": 0.3206, + "step": 3443 + }, + { + "epoch": 2.3301759133964817, + "grad_norm": 1.1399961003317345, + "learning_rate": 1.4436496514520253e-06, + "loss": 0.3177, + "step": 3444 + }, + { + "epoch": 2.3308525033829497, + "grad_norm": 1.164233915510866, + "learning_rate": 1.4408834801174492e-06, + "loss": 0.3269, + "step": 3445 + }, + { + "epoch": 2.331529093369418, + "grad_norm": 1.1350971318386764, + "learning_rate": 1.438119515243277e-06, + "loss": 0.3118, + "step": 3446 + }, + { + "epoch": 2.332205683355886, + "grad_norm": 1.1451605130743008, + "learning_rate": 1.4353577585430152e-06, + "loss": 0.3265, + "step": 3447 + }, + { + "epoch": 2.3328822733423547, + "grad_norm": 1.1346672692917785, + "learning_rate": 1.4325982117288052e-06, + "loss": 0.3218, + "step": 3448 + }, + { + "epoch": 2.3335588633288227, + "grad_norm": 1.0893708151229857, + "learning_rate": 1.4298408765114191e-06, + "loss": 0.3106, + "step": 3449 + }, + { + "epoch": 2.3342354533152907, + "grad_norm": 1.1403266646337968, + "learning_rate": 1.4270857546002548e-06, + "loss": 0.3138, + "step": 3450 + }, + { + "epoch": 2.334912043301759, + "grad_norm": 1.0884277973793046, + "learning_rate": 1.4243328477033369e-06, + "loss": 0.3169, + "step": 3451 + }, + { + "epoch": 2.335588633288227, + "grad_norm": 1.117831052522907, + "learning_rate": 1.4215821575273219e-06, + "loss": 0.297, + "step": 3452 + }, + { + "epoch": 2.3362652232746957, + "grad_norm": 1.1159003918941182, + "learning_rate": 1.4188336857774892e-06, + "loss": 0.3152, + "step": 3453 + }, + { + "epoch": 2.3369418132611637, + "grad_norm": 1.1312782791706404, + "learning_rate": 1.4160874341577447e-06, + "loss": 0.3234, + "step": 3454 + }, + { + "epoch": 2.3376184032476317, + "grad_norm": 1.111147460466852, + "learning_rate": 1.413343404370613e-06, + "loss": 0.3147, + "step": 3455 + }, + { + "epoch": 2.3382949932341, + "grad_norm": 1.0705415087278132, + "learning_rate": 1.410601598117246e-06, + "loss": 0.3079, + "step": 3456 + }, + { + "epoch": 2.338971583220568, + "grad_norm": 1.1260681822779628, + "learning_rate": 1.4078620170974178e-06, + "loss": 0.3162, + "step": 3457 + }, + { + "epoch": 2.3396481732070367, + "grad_norm": 1.061992178188652, + "learning_rate": 1.4051246630095195e-06, + "loss": 0.3001, + "step": 3458 + }, + { + "epoch": 2.3403247631935047, + "grad_norm": 1.146386396406404, + "learning_rate": 1.4023895375505608e-06, + "loss": 0.3188, + "step": 3459 + }, + { + "epoch": 2.3410013531799727, + "grad_norm": 1.1256372502555996, + "learning_rate": 1.3996566424161746e-06, + "loss": 0.3267, + "step": 3460 + }, + { + "epoch": 2.341677943166441, + "grad_norm": 1.1170601039612857, + "learning_rate": 1.396925979300608e-06, + "loss": 0.313, + "step": 3461 + }, + { + "epoch": 2.342354533152909, + "grad_norm": 1.068555772983513, + "learning_rate": 1.3941975498967265e-06, + "loss": 0.2924, + "step": 3462 + }, + { + "epoch": 2.3430311231393777, + "grad_norm": 1.0854556175917942, + "learning_rate": 1.3914713558960064e-06, + "loss": 0.3109, + "step": 3463 + }, + { + "epoch": 2.3437077131258457, + "grad_norm": 1.140675727614274, + "learning_rate": 1.3887473989885441e-06, + "loss": 0.3198, + "step": 3464 + }, + { + "epoch": 2.3443843031123137, + "grad_norm": 1.1438387264589154, + "learning_rate": 1.3860256808630429e-06, + "loss": 0.3188, + "step": 3465 + }, + { + "epoch": 2.345060893098782, + "grad_norm": 1.0739040090609289, + "learning_rate": 1.383306203206823e-06, + "loss": 0.3096, + "step": 3466 + }, + { + "epoch": 2.34573748308525, + "grad_norm": 1.1026162523417182, + "learning_rate": 1.3805889677058148e-06, + "loss": 0.3151, + "step": 3467 + }, + { + "epoch": 2.3464140730717187, + "grad_norm": 1.143705999203667, + "learning_rate": 1.3778739760445552e-06, + "loss": 0.3306, + "step": 3468 + }, + { + "epoch": 2.3470906630581867, + "grad_norm": 1.0775382767982196, + "learning_rate": 1.375161229906195e-06, + "loss": 0.3095, + "step": 3469 + }, + { + "epoch": 2.3477672530446547, + "grad_norm": 1.171476023297586, + "learning_rate": 1.372450730972491e-06, + "loss": 0.3255, + "step": 3470 + }, + { + "epoch": 2.348443843031123, + "grad_norm": 1.121195342490227, + "learning_rate": 1.3697424809238058e-06, + "loss": 0.3165, + "step": 3471 + }, + { + "epoch": 2.349120433017591, + "grad_norm": 1.1290495914505805, + "learning_rate": 1.3670364814391062e-06, + "loss": 0.3286, + "step": 3472 + }, + { + "epoch": 2.3497970230040597, + "grad_norm": 1.159182727891767, + "learning_rate": 1.3643327341959684e-06, + "loss": 0.3287, + "step": 3473 + }, + { + "epoch": 2.3504736129905277, + "grad_norm": 1.1507321436774485, + "learning_rate": 1.361631240870569e-06, + "loss": 0.3233, + "step": 3474 + }, + { + "epoch": 2.3511502029769957, + "grad_norm": 1.112282443636384, + "learning_rate": 1.35893200313769e-06, + "loss": 0.3213, + "step": 3475 + }, + { + "epoch": 2.351826792963464, + "grad_norm": 1.140080162349322, + "learning_rate": 1.3562350226707106e-06, + "loss": 0.3233, + "step": 3476 + }, + { + "epoch": 2.352503382949932, + "grad_norm": 1.1200162002913803, + "learning_rate": 1.3535403011416159e-06, + "loss": 0.3194, + "step": 3477 + }, + { + "epoch": 2.3531799729364007, + "grad_norm": 1.122094756099197, + "learning_rate": 1.3508478402209858e-06, + "loss": 0.3222, + "step": 3478 + }, + { + "epoch": 2.3538565629228687, + "grad_norm": 1.1236134275566492, + "learning_rate": 1.3481576415780035e-06, + "loss": 0.3211, + "step": 3479 + }, + { + "epoch": 2.3545331529093367, + "grad_norm": 1.0899993630047502, + "learning_rate": 1.3454697068804434e-06, + "loss": 0.3041, + "step": 3480 + }, + { + "epoch": 2.355209742895805, + "grad_norm": 1.1664970485021213, + "learning_rate": 1.3427840377946826e-06, + "loss": 0.3195, + "step": 3481 + }, + { + "epoch": 2.3558863328822732, + "grad_norm": 1.1441128613778109, + "learning_rate": 1.3401006359856916e-06, + "loss": 0.3263, + "step": 3482 + }, + { + "epoch": 2.3565629228687417, + "grad_norm": 1.1061354898928037, + "learning_rate": 1.337419503117035e-06, + "loss": 0.3041, + "step": 3483 + }, + { + "epoch": 2.3572395128552097, + "grad_norm": 1.1073889865134197, + "learning_rate": 1.3347406408508695e-06, + "loss": 0.3039, + "step": 3484 + }, + { + "epoch": 2.3579161028416777, + "grad_norm": 1.128298768612049, + "learning_rate": 1.332064050847945e-06, + "loss": 0.3204, + "step": 3485 + }, + { + "epoch": 2.358592692828146, + "grad_norm": 1.143187580828165, + "learning_rate": 1.3293897347676032e-06, + "loss": 0.3195, + "step": 3486 + }, + { + "epoch": 2.3592692828146142, + "grad_norm": 1.1433164543433265, + "learning_rate": 1.3267176942677763e-06, + "loss": 0.3168, + "step": 3487 + }, + { + "epoch": 2.3599458728010827, + "grad_norm": 1.1095837297397795, + "learning_rate": 1.324047931004987e-06, + "loss": 0.3213, + "step": 3488 + }, + { + "epoch": 2.3606224627875507, + "grad_norm": 1.181348578466833, + "learning_rate": 1.321380446634342e-06, + "loss": 0.3142, + "step": 3489 + }, + { + "epoch": 2.3612990527740187, + "grad_norm": 1.1002417074246151, + "learning_rate": 1.31871524280954e-06, + "loss": 0.3046, + "step": 3490 + }, + { + "epoch": 2.361975642760487, + "grad_norm": 1.1461889181285505, + "learning_rate": 1.3160523211828612e-06, + "loss": 0.3255, + "step": 3491 + }, + { + "epoch": 2.3626522327469552, + "grad_norm": 1.1683634006387886, + "learning_rate": 1.313391683405177e-06, + "loss": 0.3304, + "step": 3492 + }, + { + "epoch": 2.3633288227334237, + "grad_norm": 1.1264272724900595, + "learning_rate": 1.310733331125935e-06, + "loss": 0.2987, + "step": 3493 + }, + { + "epoch": 2.3640054127198917, + "grad_norm": 1.1110478101900172, + "learning_rate": 1.3080772659931728e-06, + "loss": 0.3115, + "step": 3494 + }, + { + "epoch": 2.3646820027063598, + "grad_norm": 1.1120632764431906, + "learning_rate": 1.305423489653508e-06, + "loss": 0.3215, + "step": 3495 + }, + { + "epoch": 2.365358592692828, + "grad_norm": 1.1074600373325922, + "learning_rate": 1.3027720037521397e-06, + "loss": 0.3067, + "step": 3496 + }, + { + "epoch": 2.3660351826792962, + "grad_norm": 1.3468500227329978, + "learning_rate": 1.3001228099328445e-06, + "loss": 0.3019, + "step": 3497 + }, + { + "epoch": 2.3667117726657647, + "grad_norm": 1.1118779652272481, + "learning_rate": 1.297475909837979e-06, + "loss": 0.3175, + "step": 3498 + }, + { + "epoch": 2.3673883626522327, + "grad_norm": 1.1453009657972386, + "learning_rate": 1.29483130510848e-06, + "loss": 0.3391, + "step": 3499 + }, + { + "epoch": 2.3680649526387008, + "grad_norm": 1.1385287275376463, + "learning_rate": 1.2921889973838591e-06, + "loss": 0.3217, + "step": 3500 + }, + { + "epoch": 2.3687415426251692, + "grad_norm": 1.113156750144898, + "learning_rate": 1.289548988302207e-06, + "loss": 0.3083, + "step": 3501 + }, + { + "epoch": 2.3694181326116373, + "grad_norm": 1.1411851415754353, + "learning_rate": 1.2869112795001836e-06, + "loss": 0.323, + "step": 3502 + }, + { + "epoch": 2.3700947225981057, + "grad_norm": 1.1124203263727617, + "learning_rate": 1.2842758726130283e-06, + "loss": 0.3232, + "step": 3503 + }, + { + "epoch": 2.3707713125845737, + "grad_norm": 1.0845956699853068, + "learning_rate": 1.281642769274552e-06, + "loss": 0.3101, + "step": 3504 + }, + { + "epoch": 2.3714479025710418, + "grad_norm": 1.1343768372333758, + "learning_rate": 1.2790119711171356e-06, + "loss": 0.3124, + "step": 3505 + }, + { + "epoch": 2.3721244925575102, + "grad_norm": 1.1211238121509965, + "learning_rate": 1.2763834797717312e-06, + "loss": 0.311, + "step": 3506 + }, + { + "epoch": 2.3728010825439783, + "grad_norm": 1.1325954165849101, + "learning_rate": 1.2737572968678624e-06, + "loss": 0.3122, + "step": 3507 + }, + { + "epoch": 2.3734776725304467, + "grad_norm": 1.1217703491721847, + "learning_rate": 1.2711334240336216e-06, + "loss": 0.3152, + "step": 3508 + }, + { + "epoch": 2.3741542625169147, + "grad_norm": 1.163818266836355, + "learning_rate": 1.26851186289567e-06, + "loss": 0.325, + "step": 3509 + }, + { + "epoch": 2.3748308525033828, + "grad_norm": 1.1343757479060412, + "learning_rate": 1.2658926150792321e-06, + "loss": 0.3318, + "step": 3510 + }, + { + "epoch": 2.3755074424898512, + "grad_norm": 1.0987108606316085, + "learning_rate": 1.2632756822081e-06, + "loss": 0.3112, + "step": 3511 + }, + { + "epoch": 2.3761840324763193, + "grad_norm": 1.1484261921654009, + "learning_rate": 1.2606610659046314e-06, + "loss": 0.3193, + "step": 3512 + }, + { + "epoch": 2.3768606224627877, + "grad_norm": 1.1631813605994743, + "learning_rate": 1.2580487677897496e-06, + "loss": 0.3256, + "step": 3513 + }, + { + "epoch": 2.3775372124492558, + "grad_norm": 1.1514927477839263, + "learning_rate": 1.255438789482935e-06, + "loss": 0.323, + "step": 3514 + }, + { + "epoch": 2.378213802435724, + "grad_norm": 1.1259927778312828, + "learning_rate": 1.2528311326022364e-06, + "loss": 0.3093, + "step": 3515 + }, + { + "epoch": 2.3788903924221922, + "grad_norm": 1.1632793982771636, + "learning_rate": 1.250225798764259e-06, + "loss": 0.3299, + "step": 3516 + }, + { + "epoch": 2.3795669824086603, + "grad_norm": 1.1730467726536382, + "learning_rate": 1.2476227895841714e-06, + "loss": 0.3149, + "step": 3517 + }, + { + "epoch": 2.3802435723951287, + "grad_norm": 1.10142032031152, + "learning_rate": 1.2450221066756973e-06, + "loss": 0.3234, + "step": 3518 + }, + { + "epoch": 2.3809201623815968, + "grad_norm": 1.1095396209156525, + "learning_rate": 1.242423751651119e-06, + "loss": 0.3169, + "step": 3519 + }, + { + "epoch": 2.381596752368065, + "grad_norm": 1.1286624368515654, + "learning_rate": 1.2398277261212777e-06, + "loss": 0.3287, + "step": 3520 + }, + { + "epoch": 2.3822733423545333, + "grad_norm": 1.1408801632686176, + "learning_rate": 1.2372340316955694e-06, + "loss": 0.3255, + "step": 3521 + }, + { + "epoch": 2.3829499323410013, + "grad_norm": 1.1051274743799062, + "learning_rate": 1.234642669981946e-06, + "loss": 0.3131, + "step": 3522 + }, + { + "epoch": 2.3836265223274697, + "grad_norm": 1.124440199110235, + "learning_rate": 1.232053642586909e-06, + "loss": 0.3162, + "step": 3523 + }, + { + "epoch": 2.3843031123139378, + "grad_norm": 1.1538161779736553, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.324, + "step": 3524 + }, + { + "epoch": 2.384979702300406, + "grad_norm": 1.085256280637149, + "learning_rate": 1.2268825971713833e-06, + "loss": 0.3147, + "step": 3525 + }, + { + "epoch": 2.3856562922868743, + "grad_norm": 1.141637889491272, + "learning_rate": 1.2243005823566638e-06, + "loss": 0.3165, + "step": 3526 + }, + { + "epoch": 2.3863328822733423, + "grad_norm": 1.1273807671838103, + "learning_rate": 1.2217209082720677e-06, + "loss": 0.321, + "step": 3527 + }, + { + "epoch": 2.3870094722598107, + "grad_norm": 1.1446425478149072, + "learning_rate": 1.2191435765168557e-06, + "loss": 0.3085, + "step": 3528 + }, + { + "epoch": 2.3876860622462788, + "grad_norm": 1.138276430666935, + "learning_rate": 1.2165685886888346e-06, + "loss": 0.3232, + "step": 3529 + }, + { + "epoch": 2.388362652232747, + "grad_norm": 1.1092810231609838, + "learning_rate": 1.2139959463843593e-06, + "loss": 0.3014, + "step": 3530 + }, + { + "epoch": 2.3890392422192153, + "grad_norm": 1.1060100759802696, + "learning_rate": 1.2114256511983274e-06, + "loss": 0.3088, + "step": 3531 + }, + { + "epoch": 2.3897158322056833, + "grad_norm": 1.1611344190662929, + "learning_rate": 1.2088577047241834e-06, + "loss": 0.3336, + "step": 3532 + }, + { + "epoch": 2.3903924221921518, + "grad_norm": 1.1127045231776416, + "learning_rate": 1.2062921085539152e-06, + "loss": 0.3165, + "step": 3533 + }, + { + "epoch": 2.39106901217862, + "grad_norm": 1.1749429541623873, + "learning_rate": 1.2037288642780575e-06, + "loss": 0.3349, + "step": 3534 + }, + { + "epoch": 2.391745602165088, + "grad_norm": 1.1400185826065576, + "learning_rate": 1.2011679734856796e-06, + "loss": 0.3231, + "step": 3535 + }, + { + "epoch": 2.3924221921515563, + "grad_norm": 1.1196237995790301, + "learning_rate": 1.1986094377643976e-06, + "loss": 0.3134, + "step": 3536 + }, + { + "epoch": 2.3930987821380243, + "grad_norm": 1.1317444371874341, + "learning_rate": 1.1960532587003666e-06, + "loss": 0.304, + "step": 3537 + }, + { + "epoch": 2.3937753721244928, + "grad_norm": 1.200027462821917, + "learning_rate": 1.193499437878277e-06, + "loss": 0.3248, + "step": 3538 + }, + { + "epoch": 2.394451962110961, + "grad_norm": 1.1030282179931525, + "learning_rate": 1.1909479768813641e-06, + "loss": 0.3128, + "step": 3539 + }, + { + "epoch": 2.395128552097429, + "grad_norm": 1.1240595568357, + "learning_rate": 1.1883988772913924e-06, + "loss": 0.3139, + "step": 3540 + }, + { + "epoch": 2.3958051420838973, + "grad_norm": 1.1469755819623062, + "learning_rate": 1.1858521406886674e-06, + "loss": 0.3224, + "step": 3541 + }, + { + "epoch": 2.3964817320703653, + "grad_norm": 1.1327222168099689, + "learning_rate": 1.183307768652029e-06, + "loss": 0.3146, + "step": 3542 + }, + { + "epoch": 2.3971583220568338, + "grad_norm": 1.112350043947951, + "learning_rate": 1.180765762758852e-06, + "loss": 0.3168, + "step": 3543 + }, + { + "epoch": 2.397834912043302, + "grad_norm": 1.1050977029991658, + "learning_rate": 1.1782261245850417e-06, + "loss": 0.3214, + "step": 3544 + }, + { + "epoch": 2.39851150202977, + "grad_norm": 1.120220381893809, + "learning_rate": 1.1756888557050356e-06, + "loss": 0.3258, + "step": 3545 + }, + { + "epoch": 2.3991880920162383, + "grad_norm": 1.0982917353481483, + "learning_rate": 1.173153957691805e-06, + "loss": 0.3028, + "step": 3546 + }, + { + "epoch": 2.3998646820027063, + "grad_norm": 1.1290017099554381, + "learning_rate": 1.1706214321168513e-06, + "loss": 0.316, + "step": 3547 + }, + { + "epoch": 2.4005412719891748, + "grad_norm": 1.1122629928280499, + "learning_rate": 1.1680912805502008e-06, + "loss": 0.3099, + "step": 3548 + }, + { + "epoch": 2.401217861975643, + "grad_norm": 1.1318301757696607, + "learning_rate": 1.165563504560413e-06, + "loss": 0.3254, + "step": 3549 + }, + { + "epoch": 2.401894451962111, + "grad_norm": 1.1000619570162524, + "learning_rate": 1.1630381057145735e-06, + "loss": 0.2996, + "step": 3550 + }, + { + "epoch": 2.4025710419485793, + "grad_norm": 1.1444264595967888, + "learning_rate": 1.1605150855782916e-06, + "loss": 0.3231, + "step": 3551 + }, + { + "epoch": 2.4032476319350473, + "grad_norm": 1.110204315005672, + "learning_rate": 1.157994445715706e-06, + "loss": 0.3154, + "step": 3552 + }, + { + "epoch": 2.403924221921516, + "grad_norm": 1.1053213243824813, + "learning_rate": 1.155476187689475e-06, + "loss": 0.3133, + "step": 3553 + }, + { + "epoch": 2.404600811907984, + "grad_norm": 1.15353184046916, + "learning_rate": 1.1529603130607837e-06, + "loss": 0.3167, + "step": 3554 + }, + { + "epoch": 2.405277401894452, + "grad_norm": 1.1362190763890363, + "learning_rate": 1.1504468233893408e-06, + "loss": 0.3198, + "step": 3555 + }, + { + "epoch": 2.4059539918809203, + "grad_norm": 1.179608172565361, + "learning_rate": 1.1479357202333707e-06, + "loss": 0.323, + "step": 3556 + }, + { + "epoch": 2.4066305818673883, + "grad_norm": 1.087135666523029, + "learning_rate": 1.1454270051496264e-06, + "loss": 0.3042, + "step": 3557 + }, + { + "epoch": 2.407307171853857, + "grad_norm": 1.181982189311583, + "learning_rate": 1.1429206796933717e-06, + "loss": 0.3278, + "step": 3558 + }, + { + "epoch": 2.407983761840325, + "grad_norm": 1.1681944729415519, + "learning_rate": 1.1404167454183957e-06, + "loss": 0.3277, + "step": 3559 + }, + { + "epoch": 2.408660351826793, + "grad_norm": 1.1340177948443406, + "learning_rate": 1.137915203877003e-06, + "loss": 0.3221, + "step": 3560 + }, + { + "epoch": 2.4093369418132613, + "grad_norm": 1.1176795014616392, + "learning_rate": 1.1354160566200128e-06, + "loss": 0.3026, + "step": 3561 + }, + { + "epoch": 2.4100135317997293, + "grad_norm": 1.144195183861141, + "learning_rate": 1.132919305196763e-06, + "loss": 0.3098, + "step": 3562 + }, + { + "epoch": 2.410690121786198, + "grad_norm": 1.1347214615831074, + "learning_rate": 1.130424951155104e-06, + "loss": 0.3133, + "step": 3563 + }, + { + "epoch": 2.411366711772666, + "grad_norm": 1.126091562477791, + "learning_rate": 1.1279329960414047e-06, + "loss": 0.3092, + "step": 3564 + }, + { + "epoch": 2.412043301759134, + "grad_norm": 1.1729908809034137, + "learning_rate": 1.1254434414005367e-06, + "loss": 0.3202, + "step": 3565 + }, + { + "epoch": 2.4127198917456023, + "grad_norm": 1.1438036146000217, + "learning_rate": 1.1229562887758927e-06, + "loss": 0.3068, + "step": 3566 + }, + { + "epoch": 2.4133964817320703, + "grad_norm": 1.161876628504045, + "learning_rate": 1.1204715397093735e-06, + "loss": 0.3167, + "step": 3567 + }, + { + "epoch": 2.414073071718539, + "grad_norm": 1.1352831581419485, + "learning_rate": 1.1179891957413908e-06, + "loss": 0.3206, + "step": 3568 + }, + { + "epoch": 2.414749661705007, + "grad_norm": 1.1379129348287849, + "learning_rate": 1.1155092584108606e-06, + "loss": 0.329, + "step": 3569 + }, + { + "epoch": 2.415426251691475, + "grad_norm": 1.1582938649505607, + "learning_rate": 1.113031729255214e-06, + "loss": 0.3138, + "step": 3570 + }, + { + "epoch": 2.4161028416779433, + "grad_norm": 1.110109841769803, + "learning_rate": 1.1105566098103825e-06, + "loss": 0.3059, + "step": 3571 + }, + { + "epoch": 2.4167794316644113, + "grad_norm": 1.0712589097819936, + "learning_rate": 1.1080839016108086e-06, + "loss": 0.2975, + "step": 3572 + }, + { + "epoch": 2.41745602165088, + "grad_norm": 1.087618111361069, + "learning_rate": 1.1056136061894386e-06, + "loss": 0.3076, + "step": 3573 + }, + { + "epoch": 2.418132611637348, + "grad_norm": 1.095089762234527, + "learning_rate": 1.1031457250777206e-06, + "loss": 0.3125, + "step": 3574 + }, + { + "epoch": 2.418809201623816, + "grad_norm": 1.1571240889674608, + "learning_rate": 1.1006802598056081e-06, + "loss": 0.316, + "step": 3575 + }, + { + "epoch": 2.4194857916102843, + "grad_norm": 1.1342395599392372, + "learning_rate": 1.0982172119015594e-06, + "loss": 0.3187, + "step": 3576 + }, + { + "epoch": 2.4201623815967523, + "grad_norm": 1.1303956719686752, + "learning_rate": 1.0957565828925292e-06, + "loss": 0.3159, + "step": 3577 + }, + { + "epoch": 2.420838971583221, + "grad_norm": 1.1460891396861255, + "learning_rate": 1.0932983743039739e-06, + "loss": 0.3195, + "step": 3578 + }, + { + "epoch": 2.421515561569689, + "grad_norm": 1.150753764974417, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.3206, + "step": 3579 + }, + { + "epoch": 2.422192151556157, + "grad_norm": 1.1620928025993469, + "learning_rate": 1.0883892244826173e-06, + "loss": 0.3271, + "step": 3580 + }, + { + "epoch": 2.4228687415426253, + "grad_norm": 1.1908733356947119, + "learning_rate": 1.0859382862932255e-06, + "loss": 0.3278, + "step": 3581 + }, + { + "epoch": 2.4235453315290933, + "grad_norm": 1.1332923988073482, + "learning_rate": 1.0834897746111233e-06, + "loss": 0.3172, + "step": 3582 + }, + { + "epoch": 2.424221921515562, + "grad_norm": 1.1109533239182614, + "learning_rate": 1.0810436909542571e-06, + "loss": 0.3126, + "step": 3583 + }, + { + "epoch": 2.42489851150203, + "grad_norm": 1.1261381580784227, + "learning_rate": 1.0786000368390686e-06, + "loss": 0.3127, + "step": 3584 + }, + { + "epoch": 2.425575101488498, + "grad_norm": 1.1294567080481843, + "learning_rate": 1.0761588137804896e-06, + "loss": 0.324, + "step": 3585 + }, + { + "epoch": 2.4262516914749663, + "grad_norm": 1.1844730566208894, + "learning_rate": 1.0737200232919465e-06, + "loss": 0.3321, + "step": 3586 + }, + { + "epoch": 2.4269282814614344, + "grad_norm": 1.14565636803996, + "learning_rate": 1.0712836668853583e-06, + "loss": 0.3156, + "step": 3587 + }, + { + "epoch": 2.4276048714479024, + "grad_norm": 1.1524306442941032, + "learning_rate": 1.0688497460711345e-06, + "loss": 0.333, + "step": 3588 + }, + { + "epoch": 2.428281461434371, + "grad_norm": 1.121942649676295, + "learning_rate": 1.0664182623581777e-06, + "loss": 0.3201, + "step": 3589 + }, + { + "epoch": 2.428958051420839, + "grad_norm": 1.1529333785239038, + "learning_rate": 1.0639892172538734e-06, + "loss": 0.3103, + "step": 3590 + }, + { + "epoch": 2.4296346414073073, + "grad_norm": 1.1193930381983288, + "learning_rate": 1.0615626122640988e-06, + "loss": 0.3104, + "step": 3591 + }, + { + "epoch": 2.4303112313937754, + "grad_norm": 1.1230642424158743, + "learning_rate": 1.0591384488932188e-06, + "loss": 0.3109, + "step": 3592 + }, + { + "epoch": 2.4309878213802434, + "grad_norm": 1.1530014973225775, + "learning_rate": 1.0567167286440844e-06, + "loss": 0.3221, + "step": 3593 + }, + { + "epoch": 2.431664411366712, + "grad_norm": 1.0807062861300396, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.2891, + "step": 3594 + }, + { + "epoch": 2.43234100135318, + "grad_norm": 1.1589482864780765, + "learning_rate": 1.0518806235148814e-06, + "loss": 0.311, + "step": 3595 + }, + { + "epoch": 2.4330175913396483, + "grad_norm": 1.1586065424296987, + "learning_rate": 1.0494662416329366e-06, + "loss": 0.3228, + "step": 3596 + }, + { + "epoch": 2.4336941813261164, + "grad_norm": 1.1502240836292057, + "learning_rate": 1.0470543088689855e-06, + "loss": 0.3094, + "step": 3597 + }, + { + "epoch": 2.4343707713125844, + "grad_norm": 1.112309337032747, + "learning_rate": 1.044644826718295e-06, + "loss": 0.3166, + "step": 3598 + }, + { + "epoch": 2.435047361299053, + "grad_norm": 1.086114625093356, + "learning_rate": 1.0422377966746133e-06, + "loss": 0.3067, + "step": 3599 + }, + { + "epoch": 2.435723951285521, + "grad_norm": 1.1052570795412353, + "learning_rate": 1.0398332202301708e-06, + "loss": 0.3015, + "step": 3600 + }, + { + "epoch": 2.4364005412719894, + "grad_norm": 1.1488816748291193, + "learning_rate": 1.0374310988756747e-06, + "loss": 0.3226, + "step": 3601 + }, + { + "epoch": 2.4370771312584574, + "grad_norm": 1.1266345056168994, + "learning_rate": 1.0350314341003121e-06, + "loss": 0.3096, + "step": 3602 + }, + { + "epoch": 2.4377537212449254, + "grad_norm": 1.1517626607813003, + "learning_rate": 1.0326342273917432e-06, + "loss": 0.3253, + "step": 3603 + }, + { + "epoch": 2.438430311231394, + "grad_norm": 1.145166831273789, + "learning_rate": 1.0302394802361104e-06, + "loss": 0.3154, + "step": 3604 + }, + { + "epoch": 2.439106901217862, + "grad_norm": 1.1386424656308904, + "learning_rate": 1.0278471941180245e-06, + "loss": 0.3291, + "step": 3605 + }, + { + "epoch": 2.4397834912043304, + "grad_norm": 1.1491871424478022, + "learning_rate": 1.0254573705205751e-06, + "loss": 0.3204, + "step": 3606 + }, + { + "epoch": 2.4404600811907984, + "grad_norm": 1.1608825064820834, + "learning_rate": 1.0230700109253255e-06, + "loss": 0.3171, + "step": 3607 + }, + { + "epoch": 2.4411366711772664, + "grad_norm": 1.123465447443952, + "learning_rate": 1.0206851168123078e-06, + "loss": 0.3153, + "step": 3608 + }, + { + "epoch": 2.441813261163735, + "grad_norm": 1.135981167360808, + "learning_rate": 1.0183026896600284e-06, + "loss": 0.3109, + "step": 3609 + }, + { + "epoch": 2.442489851150203, + "grad_norm": 1.193026211766432, + "learning_rate": 1.0159227309454662e-06, + "loss": 0.3332, + "step": 3610 + }, + { + "epoch": 2.4431664411366714, + "grad_norm": 1.1194518586529043, + "learning_rate": 1.0135452421440645e-06, + "loss": 0.3243, + "step": 3611 + }, + { + "epoch": 2.4438430311231394, + "grad_norm": 1.1622110630884213, + "learning_rate": 1.0111702247297372e-06, + "loss": 0.3215, + "step": 3612 + }, + { + "epoch": 2.4445196211096074, + "grad_norm": 1.112150959767557, + "learning_rate": 1.0087976801748694e-06, + "loss": 0.3081, + "step": 3613 + }, + { + "epoch": 2.445196211096076, + "grad_norm": 1.1441128793589497, + "learning_rate": 1.00642760995031e-06, + "loss": 0.3186, + "step": 3614 + }, + { + "epoch": 2.445872801082544, + "grad_norm": 1.1545232417516533, + "learning_rate": 1.0040600155253766e-06, + "loss": 0.306, + "step": 3615 + }, + { + "epoch": 2.4465493910690124, + "grad_norm": 1.119520686574386, + "learning_rate": 1.0016948983678471e-06, + "loss": 0.3152, + "step": 3616 + }, + { + "epoch": 2.4472259810554804, + "grad_norm": 1.1020040761457315, + "learning_rate": 9.993322599439692e-07, + "loss": 0.3116, + "step": 3617 + }, + { + "epoch": 2.4479025710419484, + "grad_norm": 1.1309211532895094, + "learning_rate": 9.969721017184492e-07, + "loss": 0.3124, + "step": 3618 + }, + { + "epoch": 2.448579161028417, + "grad_norm": 1.1247046357851842, + "learning_rate": 9.946144251544604e-07, + "loss": 0.3038, + "step": 3619 + }, + { + "epoch": 2.449255751014885, + "grad_norm": 1.1304169341813408, + "learning_rate": 9.92259231713632e-07, + "loss": 0.3112, + "step": 3620 + }, + { + "epoch": 2.449932341001353, + "grad_norm": 1.1656071517608024, + "learning_rate": 9.899065228560596e-07, + "loss": 0.3155, + "step": 3621 + }, + { + "epoch": 2.4506089309878214, + "grad_norm": 1.1295431320008458, + "learning_rate": 9.87556300040295e-07, + "loss": 0.3236, + "step": 3622 + }, + { + "epoch": 2.4512855209742894, + "grad_norm": 1.0970987982295464, + "learning_rate": 9.852085647233505e-07, + "loss": 0.3032, + "step": 3623 + }, + { + "epoch": 2.451962110960758, + "grad_norm": 1.1159102589736656, + "learning_rate": 9.82863318360695e-07, + "loss": 0.3136, + "step": 3624 + }, + { + "epoch": 2.452638700947226, + "grad_norm": 1.1545687563771772, + "learning_rate": 9.805205624062535e-07, + "loss": 0.3103, + "step": 3625 + }, + { + "epoch": 2.453315290933694, + "grad_norm": 1.1772105291245962, + "learning_rate": 9.781802983124094e-07, + "loss": 0.325, + "step": 3626 + }, + { + "epoch": 2.4539918809201624, + "grad_norm": 1.149176716633529, + "learning_rate": 9.758425275299998e-07, + "loss": 0.3211, + "step": 3627 + }, + { + "epoch": 2.4546684709066304, + "grad_norm": 1.2223225308978591, + "learning_rate": 9.735072515083193e-07, + "loss": 0.3395, + "step": 3628 + }, + { + "epoch": 2.455345060893099, + "grad_norm": 1.109822575849989, + "learning_rate": 9.711744716951093e-07, + "loss": 0.3083, + "step": 3629 + }, + { + "epoch": 2.456021650879567, + "grad_norm": 1.1270779951147665, + "learning_rate": 9.688441895365708e-07, + "loss": 0.3197, + "step": 3630 + }, + { + "epoch": 2.456698240866035, + "grad_norm": 1.1497565738756217, + "learning_rate": 9.665164064773496e-07, + "loss": 0.3256, + "step": 3631 + }, + { + "epoch": 2.4573748308525034, + "grad_norm": 1.0809595186273584, + "learning_rate": 9.641911239605494e-07, + "loss": 0.2968, + "step": 3632 + }, + { + "epoch": 2.4580514208389714, + "grad_norm": 1.1326728787965972, + "learning_rate": 9.618683434277176e-07, + "loss": 0.32, + "step": 3633 + }, + { + "epoch": 2.45872801082544, + "grad_norm": 1.11117448199408, + "learning_rate": 9.595480663188528e-07, + "loss": 0.3083, + "step": 3634 + }, + { + "epoch": 2.459404600811908, + "grad_norm": 1.0851801438798105, + "learning_rate": 9.572302940724032e-07, + "loss": 0.2976, + "step": 3635 + }, + { + "epoch": 2.460081190798376, + "grad_norm": 1.1355557016308093, + "learning_rate": 9.549150281252633e-07, + "loss": 0.3247, + "step": 3636 + }, + { + "epoch": 2.4607577807848444, + "grad_norm": 1.1499066283060941, + "learning_rate": 9.526022699127718e-07, + "loss": 0.3201, + "step": 3637 + }, + { + "epoch": 2.4614343707713124, + "grad_norm": 1.1387358327900405, + "learning_rate": 9.502920208687133e-07, + "loss": 0.3123, + "step": 3638 + }, + { + "epoch": 2.462110960757781, + "grad_norm": 1.075294062624437, + "learning_rate": 9.479842824253182e-07, + "loss": 0.2992, + "step": 3639 + }, + { + "epoch": 2.462787550744249, + "grad_norm": 1.1094037335250049, + "learning_rate": 9.456790560132617e-07, + "loss": 0.3005, + "step": 3640 + }, + { + "epoch": 2.463464140730717, + "grad_norm": 1.123900159236025, + "learning_rate": 9.433763430616577e-07, + "loss": 0.304, + "step": 3641 + }, + { + "epoch": 2.4641407307171854, + "grad_norm": 1.189332240219114, + "learning_rate": 9.410761449980654e-07, + "loss": 0.3239, + "step": 3642 + }, + { + "epoch": 2.4648173207036534, + "grad_norm": 1.1245416558411154, + "learning_rate": 9.387784632484825e-07, + "loss": 0.3155, + "step": 3643 + }, + { + "epoch": 2.465493910690122, + "grad_norm": 1.1019308765739761, + "learning_rate": 9.364832992373501e-07, + "loss": 0.2953, + "step": 3644 + }, + { + "epoch": 2.46617050067659, + "grad_norm": 1.1288751678368858, + "learning_rate": 9.341906543875451e-07, + "loss": 0.3165, + "step": 3645 + }, + { + "epoch": 2.466847090663058, + "grad_norm": 1.160564559428215, + "learning_rate": 9.319005301203821e-07, + "loss": 0.322, + "step": 3646 + }, + { + "epoch": 2.4675236806495264, + "grad_norm": 1.1399809893424149, + "learning_rate": 9.296129278556155e-07, + "loss": 0.3139, + "step": 3647 + }, + { + "epoch": 2.4682002706359945, + "grad_norm": 1.15137782801232, + "learning_rate": 9.273278490114357e-07, + "loss": 0.3216, + "step": 3648 + }, + { + "epoch": 2.468876860622463, + "grad_norm": 1.1225234397219517, + "learning_rate": 9.250452950044702e-07, + "loss": 0.3059, + "step": 3649 + }, + { + "epoch": 2.469553450608931, + "grad_norm": 1.115070313746826, + "learning_rate": 9.227652672497761e-07, + "loss": 0.3012, + "step": 3650 + }, + { + "epoch": 2.470230040595399, + "grad_norm": 1.1311278690032023, + "learning_rate": 9.204877671608515e-07, + "loss": 0.3099, + "step": 3651 + }, + { + "epoch": 2.4709066305818674, + "grad_norm": 1.121211882530858, + "learning_rate": 9.182127961496196e-07, + "loss": 0.3152, + "step": 3652 + }, + { + "epoch": 2.4715832205683355, + "grad_norm": 1.2016290053178065, + "learning_rate": 9.159403556264435e-07, + "loss": 0.324, + "step": 3653 + }, + { + "epoch": 2.472259810554804, + "grad_norm": 1.1272716084337469, + "learning_rate": 9.136704470001101e-07, + "loss": 0.3166, + "step": 3654 + }, + { + "epoch": 2.472936400541272, + "grad_norm": 1.0773651646535676, + "learning_rate": 9.114030716778433e-07, + "loss": 0.2912, + "step": 3655 + }, + { + "epoch": 2.47361299052774, + "grad_norm": 1.112831014496653, + "learning_rate": 9.091382310652925e-07, + "loss": 0.3121, + "step": 3656 + }, + { + "epoch": 2.4742895805142084, + "grad_norm": 1.1228219864597098, + "learning_rate": 9.068759265665384e-07, + "loss": 0.3115, + "step": 3657 + }, + { + "epoch": 2.4749661705006765, + "grad_norm": 1.1035873209666962, + "learning_rate": 9.046161595840858e-07, + "loss": 0.2965, + "step": 3658 + }, + { + "epoch": 2.475642760487145, + "grad_norm": 1.1179364137717536, + "learning_rate": 9.023589315188686e-07, + "loss": 0.3162, + "step": 3659 + }, + { + "epoch": 2.476319350473613, + "grad_norm": 1.1285034156113722, + "learning_rate": 9.001042437702468e-07, + "loss": 0.3164, + "step": 3660 + }, + { + "epoch": 2.476995940460081, + "grad_norm": 1.1429412179668048, + "learning_rate": 8.978520977360067e-07, + "loss": 0.3171, + "step": 3661 + }, + { + "epoch": 2.4776725304465494, + "grad_norm": 1.1121955666599372, + "learning_rate": 8.956024948123549e-07, + "loss": 0.3098, + "step": 3662 + }, + { + "epoch": 2.4783491204330175, + "grad_norm": 1.1697239453051984, + "learning_rate": 8.933554363939256e-07, + "loss": 0.3181, + "step": 3663 + }, + { + "epoch": 2.479025710419486, + "grad_norm": 1.1780879883606745, + "learning_rate": 8.911109238737748e-07, + "loss": 0.321, + "step": 3664 + }, + { + "epoch": 2.479702300405954, + "grad_norm": 1.1493066822023357, + "learning_rate": 8.888689586433768e-07, + "loss": 0.3086, + "step": 3665 + }, + { + "epoch": 2.480378890392422, + "grad_norm": 1.1306115430782535, + "learning_rate": 8.866295420926319e-07, + "loss": 0.3134, + "step": 3666 + }, + { + "epoch": 2.4810554803788905, + "grad_norm": 1.0908449161442337, + "learning_rate": 8.843926756098548e-07, + "loss": 0.3121, + "step": 3667 + }, + { + "epoch": 2.4817320703653585, + "grad_norm": 1.123226274940738, + "learning_rate": 8.821583605817835e-07, + "loss": 0.3117, + "step": 3668 + }, + { + "epoch": 2.482408660351827, + "grad_norm": 1.090837760871994, + "learning_rate": 8.799265983935734e-07, + "loss": 0.2957, + "step": 3669 + }, + { + "epoch": 2.483085250338295, + "grad_norm": 1.1591596583877377, + "learning_rate": 8.776973904287972e-07, + "loss": 0.3272, + "step": 3670 + }, + { + "epoch": 2.483761840324763, + "grad_norm": 1.1741658351855955, + "learning_rate": 8.754707380694427e-07, + "loss": 0.3157, + "step": 3671 + }, + { + "epoch": 2.4844384303112315, + "grad_norm": 1.123503135233597, + "learning_rate": 8.732466426959135e-07, + "loss": 0.3207, + "step": 3672 + }, + { + "epoch": 2.4851150202976995, + "grad_norm": 1.1461377510712878, + "learning_rate": 8.7102510568703e-07, + "loss": 0.3206, + "step": 3673 + }, + { + "epoch": 2.485791610284168, + "grad_norm": 1.1353827801624172, + "learning_rate": 8.688061284200266e-07, + "loss": 0.3147, + "step": 3674 + }, + { + "epoch": 2.486468200270636, + "grad_norm": 1.1246975065115774, + "learning_rate": 8.665897122705463e-07, + "loss": 0.3121, + "step": 3675 + }, + { + "epoch": 2.487144790257104, + "grad_norm": 1.1021770453979274, + "learning_rate": 8.6437585861265e-07, + "loss": 0.3085, + "step": 3676 + }, + { + "epoch": 2.4878213802435725, + "grad_norm": 1.1133565559785967, + "learning_rate": 8.621645688188085e-07, + "loss": 0.3137, + "step": 3677 + }, + { + "epoch": 2.4884979702300405, + "grad_norm": 1.1513927941995301, + "learning_rate": 8.599558442598998e-07, + "loss": 0.3162, + "step": 3678 + }, + { + "epoch": 2.489174560216509, + "grad_norm": 1.130175559035253, + "learning_rate": 8.577496863052165e-07, + "loss": 0.3101, + "step": 3679 + }, + { + "epoch": 2.489851150202977, + "grad_norm": 1.1309432653170697, + "learning_rate": 8.555460963224549e-07, + "loss": 0.3114, + "step": 3680 + }, + { + "epoch": 2.490527740189445, + "grad_norm": 1.1297916506726353, + "learning_rate": 8.53345075677724e-07, + "loss": 0.3112, + "step": 3681 + }, + { + "epoch": 2.4912043301759135, + "grad_norm": 1.1036341209194485, + "learning_rate": 8.511466257355384e-07, + "loss": 0.3048, + "step": 3682 + }, + { + "epoch": 2.4918809201623815, + "grad_norm": 1.1431141396707682, + "learning_rate": 8.48950747858816e-07, + "loss": 0.3288, + "step": 3683 + }, + { + "epoch": 2.49255751014885, + "grad_norm": 1.1329866537932307, + "learning_rate": 8.46757443408886e-07, + "loss": 0.31, + "step": 3684 + }, + { + "epoch": 2.493234100135318, + "grad_norm": 1.1080750786849676, + "learning_rate": 8.44566713745476e-07, + "loss": 0.3125, + "step": 3685 + }, + { + "epoch": 2.493910690121786, + "grad_norm": 1.1099112329245437, + "learning_rate": 8.42378560226722e-07, + "loss": 0.3004, + "step": 3686 + }, + { + "epoch": 2.4945872801082545, + "grad_norm": 1.1417708599820553, + "learning_rate": 8.401929842091616e-07, + "loss": 0.3103, + "step": 3687 + }, + { + "epoch": 2.4952638700947225, + "grad_norm": 1.1374713397857121, + "learning_rate": 8.380099870477321e-07, + "loss": 0.3107, + "step": 3688 + }, + { + "epoch": 2.495940460081191, + "grad_norm": 1.1467448136950438, + "learning_rate": 8.358295700957753e-07, + "loss": 0.3186, + "step": 3689 + }, + { + "epoch": 2.496617050067659, + "grad_norm": 1.1531318849857632, + "learning_rate": 8.336517347050327e-07, + "loss": 0.3219, + "step": 3690 + }, + { + "epoch": 2.497293640054127, + "grad_norm": 1.19361679628736, + "learning_rate": 8.314764822256465e-07, + "loss": 0.3366, + "step": 3691 + }, + { + "epoch": 2.4979702300405955, + "grad_norm": 1.1974777341904552, + "learning_rate": 8.293038140061516e-07, + "loss": 0.332, + "step": 3692 + }, + { + "epoch": 2.4986468200270635, + "grad_norm": 1.1244192531633166, + "learning_rate": 8.271337313934869e-07, + "loss": 0.3193, + "step": 3693 + }, + { + "epoch": 2.499323410013532, + "grad_norm": 1.131779643859016, + "learning_rate": 8.24966235732988e-07, + "loss": 0.3077, + "step": 3694 + }, + { + "epoch": 2.5, + "grad_norm": 1.1221170366058266, + "learning_rate": 8.22801328368385e-07, + "loss": 0.3043, + "step": 3695 + }, + { + "epoch": 2.500676589986468, + "grad_norm": 1.1434580711747846, + "learning_rate": 8.206390106418028e-07, + "loss": 0.3093, + "step": 3696 + }, + { + "epoch": 2.5013531799729365, + "grad_norm": 1.1769857800034191, + "learning_rate": 8.184792838937633e-07, + "loss": 0.3234, + "step": 3697 + }, + { + "epoch": 2.5020297699594045, + "grad_norm": 1.1173881343935332, + "learning_rate": 8.163221494631785e-07, + "loss": 0.2997, + "step": 3698 + }, + { + "epoch": 2.502706359945873, + "grad_norm": 1.1271209057768203, + "learning_rate": 8.141676086873574e-07, + "loss": 0.3104, + "step": 3699 + }, + { + "epoch": 2.503382949932341, + "grad_norm": 1.08754106551674, + "learning_rate": 8.120156629019987e-07, + "loss": 0.3039, + "step": 3700 + }, + { + "epoch": 2.504059539918809, + "grad_norm": 1.1513866419083132, + "learning_rate": 8.098663134411922e-07, + "loss": 0.3116, + "step": 3701 + }, + { + "epoch": 2.5047361299052775, + "grad_norm": 1.13204408908435, + "learning_rate": 8.077195616374184e-07, + "loss": 0.3169, + "step": 3702 + }, + { + "epoch": 2.5054127198917455, + "grad_norm": 1.096123392467275, + "learning_rate": 8.055754088215501e-07, + "loss": 0.2988, + "step": 3703 + }, + { + "epoch": 2.506089309878214, + "grad_norm": 1.1118220265427365, + "learning_rate": 8.03433856322845e-07, + "loss": 0.2933, + "step": 3704 + }, + { + "epoch": 2.506765899864682, + "grad_norm": 1.174466968642932, + "learning_rate": 8.012949054689484e-07, + "loss": 0.3217, + "step": 3705 + }, + { + "epoch": 2.50744248985115, + "grad_norm": 1.1553983762354532, + "learning_rate": 7.991585575858962e-07, + "loss": 0.3216, + "step": 3706 + }, + { + "epoch": 2.5081190798376185, + "grad_norm": 1.1660414694117864, + "learning_rate": 7.970248139981091e-07, + "loss": 0.3265, + "step": 3707 + }, + { + "epoch": 2.5087956698240865, + "grad_norm": 1.1654863540447142, + "learning_rate": 7.948936760283937e-07, + "loss": 0.3184, + "step": 3708 + }, + { + "epoch": 2.509472259810555, + "grad_norm": 1.1519665292909043, + "learning_rate": 7.92765144997939e-07, + "loss": 0.3214, + "step": 3709 + }, + { + "epoch": 2.510148849797023, + "grad_norm": 1.163850272236243, + "learning_rate": 7.906392222263199e-07, + "loss": 0.3162, + "step": 3710 + }, + { + "epoch": 2.510825439783491, + "grad_norm": 1.111935055834014, + "learning_rate": 7.885159090314959e-07, + "loss": 0.3056, + "step": 3711 + }, + { + "epoch": 2.5115020297699595, + "grad_norm": 1.1567415229092517, + "learning_rate": 7.863952067298042e-07, + "loss": 0.324, + "step": 3712 + }, + { + "epoch": 2.5121786197564275, + "grad_norm": 1.1350957183916657, + "learning_rate": 7.842771166359681e-07, + "loss": 0.3152, + "step": 3713 + }, + { + "epoch": 2.512855209742896, + "grad_norm": 1.1029667819067668, + "learning_rate": 7.821616400630866e-07, + "loss": 0.3026, + "step": 3714 + }, + { + "epoch": 2.513531799729364, + "grad_norm": 1.120691799087682, + "learning_rate": 7.80048778322643e-07, + "loss": 0.3087, + "step": 3715 + }, + { + "epoch": 2.514208389715832, + "grad_norm": 1.1487156573969688, + "learning_rate": 7.779385327244987e-07, + "loss": 0.3145, + "step": 3716 + }, + { + "epoch": 2.5148849797023005, + "grad_norm": 1.1070717734849849, + "learning_rate": 7.758309045768908e-07, + "loss": 0.3038, + "step": 3717 + }, + { + "epoch": 2.5155615696887685, + "grad_norm": 1.1813271601173971, + "learning_rate": 7.737258951864341e-07, + "loss": 0.3165, + "step": 3718 + }, + { + "epoch": 2.516238159675237, + "grad_norm": 1.12974817586669, + "learning_rate": 7.716235058581218e-07, + "loss": 0.3149, + "step": 3719 + }, + { + "epoch": 2.516914749661705, + "grad_norm": 1.1863843608386213, + "learning_rate": 7.695237378953224e-07, + "loss": 0.3143, + "step": 3720 + }, + { + "epoch": 2.517591339648173, + "grad_norm": 1.1392727807156302, + "learning_rate": 7.674265925997804e-07, + "loss": 0.315, + "step": 3721 + }, + { + "epoch": 2.5182679296346415, + "grad_norm": 1.1324406042303652, + "learning_rate": 7.653320712716095e-07, + "loss": 0.3157, + "step": 3722 + }, + { + "epoch": 2.5189445196211095, + "grad_norm": 1.1233974281574273, + "learning_rate": 7.632401752093016e-07, + "loss": 0.3143, + "step": 3723 + }, + { + "epoch": 2.519621109607578, + "grad_norm": 1.144356827851246, + "learning_rate": 7.611509057097211e-07, + "loss": 0.3247, + "step": 3724 + }, + { + "epoch": 2.520297699594046, + "grad_norm": 1.151259634891893, + "learning_rate": 7.590642640681012e-07, + "loss": 0.3249, + "step": 3725 + }, + { + "epoch": 2.520974289580514, + "grad_norm": 1.127245847804875, + "learning_rate": 7.569802515780455e-07, + "loss": 0.3027, + "step": 3726 + }, + { + "epoch": 2.5216508795669825, + "grad_norm": 1.1119815832495008, + "learning_rate": 7.548988695315313e-07, + "loss": 0.311, + "step": 3727 + }, + { + "epoch": 2.5223274695534506, + "grad_norm": 1.2063885674474828, + "learning_rate": 7.528201192189028e-07, + "loss": 0.3194, + "step": 3728 + }, + { + "epoch": 2.523004059539919, + "grad_norm": 1.132910892314815, + "learning_rate": 7.507440019288742e-07, + "loss": 0.3125, + "step": 3729 + }, + { + "epoch": 2.523680649526387, + "grad_norm": 1.1723996182680465, + "learning_rate": 7.486705189485243e-07, + "loss": 0.3037, + "step": 3730 + }, + { + "epoch": 2.524357239512855, + "grad_norm": 1.0692107181283899, + "learning_rate": 7.465996715633028e-07, + "loss": 0.2866, + "step": 3731 + }, + { + "epoch": 2.5250338294993235, + "grad_norm": 1.1460005722338504, + "learning_rate": 7.44531461057022e-07, + "loss": 0.3004, + "step": 3732 + }, + { + "epoch": 2.5257104194857916, + "grad_norm": 1.1763829195906508, + "learning_rate": 7.424658887118613e-07, + "loss": 0.3217, + "step": 3733 + }, + { + "epoch": 2.52638700947226, + "grad_norm": 1.1178643648656326, + "learning_rate": 7.404029558083653e-07, + "loss": 0.2952, + "step": 3734 + }, + { + "epoch": 2.527063599458728, + "grad_norm": 1.1434491053925546, + "learning_rate": 7.383426636254392e-07, + "loss": 0.3143, + "step": 3735 + }, + { + "epoch": 2.527740189445196, + "grad_norm": 1.159002784993734, + "learning_rate": 7.362850134403543e-07, + "loss": 0.317, + "step": 3736 + }, + { + "epoch": 2.5284167794316645, + "grad_norm": 1.084385535923329, + "learning_rate": 7.342300065287439e-07, + "loss": 0.2959, + "step": 3737 + }, + { + "epoch": 2.5290933694181326, + "grad_norm": 1.1871165584955947, + "learning_rate": 7.321776441646001e-07, + "loss": 0.3242, + "step": 3738 + }, + { + "epoch": 2.529769959404601, + "grad_norm": 1.1493828961775037, + "learning_rate": 7.301279276202761e-07, + "loss": 0.3118, + "step": 3739 + }, + { + "epoch": 2.530446549391069, + "grad_norm": 1.1895350574165584, + "learning_rate": 7.280808581664866e-07, + "loss": 0.3244, + "step": 3740 + }, + { + "epoch": 2.531123139377537, + "grad_norm": 1.2015739901056524, + "learning_rate": 7.260364370723044e-07, + "loss": 0.3339, + "step": 3741 + }, + { + "epoch": 2.5317997293640055, + "grad_norm": 1.1760931721153907, + "learning_rate": 7.239946656051622e-07, + "loss": 0.3198, + "step": 3742 + }, + { + "epoch": 2.5324763193504736, + "grad_norm": 1.1699124745410474, + "learning_rate": 7.219555450308446e-07, + "loss": 0.3088, + "step": 3743 + }, + { + "epoch": 2.533152909336942, + "grad_norm": 1.2099178006365958, + "learning_rate": 7.199190766135001e-07, + "loss": 0.3436, + "step": 3744 + }, + { + "epoch": 2.53382949932341, + "grad_norm": 1.1029140570740334, + "learning_rate": 7.178852616156262e-07, + "loss": 0.2956, + "step": 3745 + }, + { + "epoch": 2.534506089309878, + "grad_norm": 1.143069202420577, + "learning_rate": 7.158541012980813e-07, + "loss": 0.3029, + "step": 3746 + }, + { + "epoch": 2.5351826792963466, + "grad_norm": 1.1596156867229297, + "learning_rate": 7.138255969200724e-07, + "loss": 0.3184, + "step": 3747 + }, + { + "epoch": 2.5358592692828146, + "grad_norm": 1.1502706362373234, + "learning_rate": 7.117997497391648e-07, + "loss": 0.3114, + "step": 3748 + }, + { + "epoch": 2.536535859269283, + "grad_norm": 1.1384305842959306, + "learning_rate": 7.097765610112745e-07, + "loss": 0.3166, + "step": 3749 + }, + { + "epoch": 2.537212449255751, + "grad_norm": 1.114087712937595, + "learning_rate": 7.077560319906696e-07, + "loss": 0.3088, + "step": 3750 + }, + { + "epoch": 2.537889039242219, + "grad_norm": 1.1327181809454117, + "learning_rate": 7.057381639299693e-07, + "loss": 0.3117, + "step": 3751 + }, + { + "epoch": 2.5385656292286876, + "grad_norm": 1.1488625539922845, + "learning_rate": 7.037229580801414e-07, + "loss": 0.3217, + "step": 3752 + }, + { + "epoch": 2.5392422192151556, + "grad_norm": 1.1475327189341715, + "learning_rate": 7.017104156905058e-07, + "loss": 0.3056, + "step": 3753 + }, + { + "epoch": 2.539918809201624, + "grad_norm": 1.0878976960789086, + "learning_rate": 6.997005380087301e-07, + "loss": 0.2933, + "step": 3754 + }, + { + "epoch": 2.540595399188092, + "grad_norm": 1.108388259536206, + "learning_rate": 6.976933262808322e-07, + "loss": 0.2988, + "step": 3755 + }, + { + "epoch": 2.54127198917456, + "grad_norm": 1.141026512880203, + "learning_rate": 6.95688781751172e-07, + "loss": 0.3227, + "step": 3756 + }, + { + "epoch": 2.5419485791610286, + "grad_norm": 1.1479001023892021, + "learning_rate": 6.936869056624623e-07, + "loss": 0.3138, + "step": 3757 + }, + { + "epoch": 2.5426251691474966, + "grad_norm": 1.104268764099118, + "learning_rate": 6.916876992557553e-07, + "loss": 0.3033, + "step": 3758 + }, + { + "epoch": 2.543301759133965, + "grad_norm": 1.164407872390624, + "learning_rate": 6.896911637704534e-07, + "loss": 0.3319, + "step": 3759 + }, + { + "epoch": 2.543978349120433, + "grad_norm": 1.1571743573343427, + "learning_rate": 6.876973004442988e-07, + "loss": 0.3165, + "step": 3760 + }, + { + "epoch": 2.544654939106901, + "grad_norm": 1.150170656754291, + "learning_rate": 6.85706110513381e-07, + "loss": 0.3249, + "step": 3761 + }, + { + "epoch": 2.5453315290933696, + "grad_norm": 1.1314854694700565, + "learning_rate": 6.837175952121305e-07, + "loss": 0.3062, + "step": 3762 + }, + { + "epoch": 2.5460081190798376, + "grad_norm": 1.0945707231098523, + "learning_rate": 6.8173175577332e-07, + "loss": 0.3095, + "step": 3763 + }, + { + "epoch": 2.546684709066306, + "grad_norm": 1.1289025306727016, + "learning_rate": 6.797485934280618e-07, + "loss": 0.3173, + "step": 3764 + }, + { + "epoch": 2.547361299052774, + "grad_norm": 1.1344543682752084, + "learning_rate": 6.777681094058087e-07, + "loss": 0.3152, + "step": 3765 + }, + { + "epoch": 2.548037889039242, + "grad_norm": 1.138694327919493, + "learning_rate": 6.757903049343556e-07, + "loss": 0.3144, + "step": 3766 + }, + { + "epoch": 2.5487144790257106, + "grad_norm": 1.1474526070049569, + "learning_rate": 6.738151812398353e-07, + "loss": 0.3206, + "step": 3767 + }, + { + "epoch": 2.5493910690121786, + "grad_norm": 1.1194031349989313, + "learning_rate": 6.718427395467165e-07, + "loss": 0.305, + "step": 3768 + }, + { + "epoch": 2.550067658998647, + "grad_norm": 1.1629177925372662, + "learning_rate": 6.698729810778065e-07, + "loss": 0.327, + "step": 3769 + }, + { + "epoch": 2.550744248985115, + "grad_norm": 1.1246626093657746, + "learning_rate": 6.67905907054251e-07, + "loss": 0.3126, + "step": 3770 + }, + { + "epoch": 2.551420838971583, + "grad_norm": 1.137151973605821, + "learning_rate": 6.659415186955298e-07, + "loss": 0.3084, + "step": 3771 + }, + { + "epoch": 2.5520974289580516, + "grad_norm": 1.1324360590500044, + "learning_rate": 6.639798172194567e-07, + "loss": 0.3076, + "step": 3772 + }, + { + "epoch": 2.5527740189445196, + "grad_norm": 1.1106642609606951, + "learning_rate": 6.620208038421805e-07, + "loss": 0.3174, + "step": 3773 + }, + { + "epoch": 2.553450608930988, + "grad_norm": 1.1533882904443011, + "learning_rate": 6.600644797781847e-07, + "loss": 0.3199, + "step": 3774 + }, + { + "epoch": 2.554127198917456, + "grad_norm": 1.1191526362128408, + "learning_rate": 6.581108462402847e-07, + "loss": 0.3048, + "step": 3775 + }, + { + "epoch": 2.554803788903924, + "grad_norm": 1.1285933784815176, + "learning_rate": 6.561599044396288e-07, + "loss": 0.325, + "step": 3776 + }, + { + "epoch": 2.555480378890392, + "grad_norm": 1.1398680877615874, + "learning_rate": 6.542116555856953e-07, + "loss": 0.3194, + "step": 3777 + }, + { + "epoch": 2.5561569688768606, + "grad_norm": 1.167366445221115, + "learning_rate": 6.522661008862918e-07, + "loss": 0.3151, + "step": 3778 + }, + { + "epoch": 2.556833558863329, + "grad_norm": 1.1125732379208815, + "learning_rate": 6.503232415475591e-07, + "loss": 0.2988, + "step": 3779 + }, + { + "epoch": 2.557510148849797, + "grad_norm": 1.1365819307875291, + "learning_rate": 6.483830787739659e-07, + "loss": 0.3076, + "step": 3780 + }, + { + "epoch": 2.558186738836265, + "grad_norm": 1.1350557624411308, + "learning_rate": 6.464456137683061e-07, + "loss": 0.3134, + "step": 3781 + }, + { + "epoch": 2.558863328822733, + "grad_norm": 1.1886131788738359, + "learning_rate": 6.445108477317046e-07, + "loss": 0.3217, + "step": 3782 + }, + { + "epoch": 2.5595399188092016, + "grad_norm": 1.1070398661169027, + "learning_rate": 6.425787818636131e-07, + "loss": 0.3029, + "step": 3783 + }, + { + "epoch": 2.56021650879567, + "grad_norm": 1.1265990379205022, + "learning_rate": 6.406494173618083e-07, + "loss": 0.3008, + "step": 3784 + }, + { + "epoch": 2.560893098782138, + "grad_norm": 1.1167239802165645, + "learning_rate": 6.387227554223918e-07, + "loss": 0.3034, + "step": 3785 + }, + { + "epoch": 2.561569688768606, + "grad_norm": 1.1593679924147031, + "learning_rate": 6.367987972397887e-07, + "loss": 0.3212, + "step": 3786 + }, + { + "epoch": 2.562246278755074, + "grad_norm": 1.146965141827802, + "learning_rate": 6.348775440067507e-07, + "loss": 0.3069, + "step": 3787 + }, + { + "epoch": 2.5629228687415426, + "grad_norm": 1.1656148548941394, + "learning_rate": 6.329589969143518e-07, + "loss": 0.3139, + "step": 3788 + }, + { + "epoch": 2.563599458728011, + "grad_norm": 1.1930517509647356, + "learning_rate": 6.310431571519865e-07, + "loss": 0.3369, + "step": 3789 + }, + { + "epoch": 2.564276048714479, + "grad_norm": 1.14838658629636, + "learning_rate": 6.291300259073724e-07, + "loss": 0.317, + "step": 3790 + }, + { + "epoch": 2.564952638700947, + "grad_norm": 1.1043942451679183, + "learning_rate": 6.27219604366549e-07, + "loss": 0.2992, + "step": 3791 + }, + { + "epoch": 2.565629228687415, + "grad_norm": 1.1846120369078166, + "learning_rate": 6.25311893713873e-07, + "loss": 0.3252, + "step": 3792 + }, + { + "epoch": 2.5663058186738836, + "grad_norm": 1.1718124573906015, + "learning_rate": 6.234068951320243e-07, + "loss": 0.3246, + "step": 3793 + }, + { + "epoch": 2.566982408660352, + "grad_norm": 1.1901228285723628, + "learning_rate": 6.215046098019967e-07, + "loss": 0.3274, + "step": 3794 + }, + { + "epoch": 2.56765899864682, + "grad_norm": 1.1265266987656684, + "learning_rate": 6.196050389031061e-07, + "loss": 0.3106, + "step": 3795 + }, + { + "epoch": 2.568335588633288, + "grad_norm": 1.0981062349044424, + "learning_rate": 6.177081836129833e-07, + "loss": 0.2974, + "step": 3796 + }, + { + "epoch": 2.569012178619756, + "grad_norm": 1.183624626867721, + "learning_rate": 6.158140451075794e-07, + "loss": 0.3178, + "step": 3797 + }, + { + "epoch": 2.5696887686062246, + "grad_norm": 1.1475745017616223, + "learning_rate": 6.139226245611535e-07, + "loss": 0.3075, + "step": 3798 + }, + { + "epoch": 2.5703653585926927, + "grad_norm": 1.130162558637046, + "learning_rate": 6.120339231462862e-07, + "loss": 0.3091, + "step": 3799 + }, + { + "epoch": 2.571041948579161, + "grad_norm": 1.1482852572092048, + "learning_rate": 6.101479420338713e-07, + "loss": 0.3118, + "step": 3800 + }, + { + "epoch": 2.571718538565629, + "grad_norm": 1.1741857009951864, + "learning_rate": 6.082646823931165e-07, + "loss": 0.3227, + "step": 3801 + }, + { + "epoch": 2.572395128552097, + "grad_norm": 1.16590564942958, + "learning_rate": 6.063841453915381e-07, + "loss": 0.3129, + "step": 3802 + }, + { + "epoch": 2.5730717185385656, + "grad_norm": 1.1183433012969533, + "learning_rate": 6.045063321949696e-07, + "loss": 0.3038, + "step": 3803 + }, + { + "epoch": 2.5737483085250337, + "grad_norm": 1.1945213716863263, + "learning_rate": 6.026312439675553e-07, + "loss": 0.3264, + "step": 3804 + }, + { + "epoch": 2.574424898511502, + "grad_norm": 1.1543056646963448, + "learning_rate": 6.007588818717458e-07, + "loss": 0.3112, + "step": 3805 + }, + { + "epoch": 2.57510148849797, + "grad_norm": 1.1974830719655707, + "learning_rate": 5.988892470683072e-07, + "loss": 0.3165, + "step": 3806 + }, + { + "epoch": 2.575778078484438, + "grad_norm": 1.1449929264797616, + "learning_rate": 5.9702234071631e-07, + "loss": 0.3054, + "step": 3807 + }, + { + "epoch": 2.5764546684709067, + "grad_norm": 1.133367760593302, + "learning_rate": 5.951581639731374e-07, + "loss": 0.3147, + "step": 3808 + }, + { + "epoch": 2.5771312584573747, + "grad_norm": 1.2114262871091466, + "learning_rate": 5.932967179944788e-07, + "loss": 0.3255, + "step": 3809 + }, + { + "epoch": 2.577807848443843, + "grad_norm": 1.1473435560399297, + "learning_rate": 5.914380039343281e-07, + "loss": 0.3079, + "step": 3810 + }, + { + "epoch": 2.578484438430311, + "grad_norm": 1.1333252360860042, + "learning_rate": 5.895820229449906e-07, + "loss": 0.3134, + "step": 3811 + }, + { + "epoch": 2.579161028416779, + "grad_norm": 1.230923209536186, + "learning_rate": 5.877287761770717e-07, + "loss": 0.3205, + "step": 3812 + }, + { + "epoch": 2.5798376184032477, + "grad_norm": 1.1113958757274316, + "learning_rate": 5.858782647794864e-07, + "loss": 0.3012, + "step": 3813 + }, + { + "epoch": 2.5805142083897157, + "grad_norm": 1.1042971238464216, + "learning_rate": 5.84030489899452e-07, + "loss": 0.299, + "step": 3814 + }, + { + "epoch": 2.581190798376184, + "grad_norm": 1.1580458278579813, + "learning_rate": 5.821854526824883e-07, + "loss": 0.3193, + "step": 3815 + }, + { + "epoch": 2.581867388362652, + "grad_norm": 1.134781982194373, + "learning_rate": 5.803431542724192e-07, + "loss": 0.3042, + "step": 3816 + }, + { + "epoch": 2.58254397834912, + "grad_norm": 1.1392046120583328, + "learning_rate": 5.785035958113717e-07, + "loss": 0.3144, + "step": 3817 + }, + { + "epoch": 2.5832205683355887, + "grad_norm": 1.1381504881565074, + "learning_rate": 5.766667784397706e-07, + "loss": 0.3183, + "step": 3818 + }, + { + "epoch": 2.5838971583220567, + "grad_norm": 1.1643788710651002, + "learning_rate": 5.748327032963464e-07, + "loss": 0.3222, + "step": 3819 + }, + { + "epoch": 2.584573748308525, + "grad_norm": 1.1390705272497892, + "learning_rate": 5.730013715181238e-07, + "loss": 0.317, + "step": 3820 + }, + { + "epoch": 2.585250338294993, + "grad_norm": 1.1334983243193666, + "learning_rate": 5.711727842404319e-07, + "loss": 0.3083, + "step": 3821 + }, + { + "epoch": 2.585926928281461, + "grad_norm": 1.1663671443675592, + "learning_rate": 5.693469425968962e-07, + "loss": 0.3298, + "step": 3822 + }, + { + "epoch": 2.5866035182679297, + "grad_norm": 1.1251060279128053, + "learning_rate": 5.675238477194389e-07, + "loss": 0.3068, + "step": 3823 + }, + { + "epoch": 2.5872801082543977, + "grad_norm": 1.1271727377453336, + "learning_rate": 5.657035007382822e-07, + "loss": 0.3118, + "step": 3824 + }, + { + "epoch": 2.587956698240866, + "grad_norm": 1.1354461790796073, + "learning_rate": 5.63885902781941e-07, + "loss": 0.3118, + "step": 3825 + }, + { + "epoch": 2.588633288227334, + "grad_norm": 1.1252400447339435, + "learning_rate": 5.620710549772295e-07, + "loss": 0.3097, + "step": 3826 + }, + { + "epoch": 2.589309878213802, + "grad_norm": 1.1291326986328731, + "learning_rate": 5.602589584492563e-07, + "loss": 0.3008, + "step": 3827 + }, + { + "epoch": 2.5899864682002707, + "grad_norm": 1.1347672955430506, + "learning_rate": 5.584496143214213e-07, + "loss": 0.3232, + "step": 3828 + }, + { + "epoch": 2.5906630581867387, + "grad_norm": 1.1218525800587003, + "learning_rate": 5.566430237154219e-07, + "loss": 0.3111, + "step": 3829 + }, + { + "epoch": 2.591339648173207, + "grad_norm": 1.1301765416375555, + "learning_rate": 5.548391877512471e-07, + "loss": 0.3009, + "step": 3830 + }, + { + "epoch": 2.592016238159675, + "grad_norm": 1.101603795212092, + "learning_rate": 5.530381075471775e-07, + "loss": 0.2977, + "step": 3831 + }, + { + "epoch": 2.592692828146143, + "grad_norm": 1.0931148059644717, + "learning_rate": 5.512397842197847e-07, + "loss": 0.3018, + "step": 3832 + }, + { + "epoch": 2.5933694181326117, + "grad_norm": 1.1639899007014718, + "learning_rate": 5.494442188839333e-07, + "loss": 0.3194, + "step": 3833 + }, + { + "epoch": 2.5940460081190797, + "grad_norm": 1.1339612664014729, + "learning_rate": 5.476514126527771e-07, + "loss": 0.3029, + "step": 3834 + }, + { + "epoch": 2.594722598105548, + "grad_norm": 1.068036944889056, + "learning_rate": 5.458613666377599e-07, + "loss": 0.2987, + "step": 3835 + }, + { + "epoch": 2.595399188092016, + "grad_norm": 1.1269405083440234, + "learning_rate": 5.440740819486123e-07, + "loss": 0.2916, + "step": 3836 + }, + { + "epoch": 2.596075778078484, + "grad_norm": 1.1489369451334235, + "learning_rate": 5.422895596933559e-07, + "loss": 0.3283, + "step": 3837 + }, + { + "epoch": 2.5967523680649527, + "grad_norm": 1.1596398422234107, + "learning_rate": 5.405078009782966e-07, + "loss": 0.3088, + "step": 3838 + }, + { + "epoch": 2.5974289580514207, + "grad_norm": 1.1886613346019737, + "learning_rate": 5.387288069080298e-07, + "loss": 0.3266, + "step": 3839 + }, + { + "epoch": 2.598105548037889, + "grad_norm": 1.1249537768973477, + "learning_rate": 5.369525785854368e-07, + "loss": 0.3092, + "step": 3840 + }, + { + "epoch": 2.598782138024357, + "grad_norm": 1.0910942396425627, + "learning_rate": 5.351791171116815e-07, + "loss": 0.2969, + "step": 3841 + }, + { + "epoch": 2.5994587280108252, + "grad_norm": 1.1308182243039568, + "learning_rate": 5.334084235862158e-07, + "loss": 0.2982, + "step": 3842 + }, + { + "epoch": 2.6001353179972937, + "grad_norm": 1.173235056178167, + "learning_rate": 5.316404991067747e-07, + "loss": 0.3276, + "step": 3843 + }, + { + "epoch": 2.6008119079837617, + "grad_norm": 1.1379719665613248, + "learning_rate": 5.29875344769375e-07, + "loss": 0.3106, + "step": 3844 + }, + { + "epoch": 2.60148849797023, + "grad_norm": 1.1492949648015884, + "learning_rate": 5.281129616683167e-07, + "loss": 0.3218, + "step": 3845 + }, + { + "epoch": 2.602165087956698, + "grad_norm": 1.183433053907029, + "learning_rate": 5.263533508961827e-07, + "loss": 0.3211, + "step": 3846 + }, + { + "epoch": 2.6028416779431662, + "grad_norm": 1.1355125151963343, + "learning_rate": 5.24596513543838e-07, + "loss": 0.3102, + "step": 3847 + }, + { + "epoch": 2.6035182679296347, + "grad_norm": 1.136279113327033, + "learning_rate": 5.228424507004265e-07, + "loss": 0.3139, + "step": 3848 + }, + { + "epoch": 2.6041948579161027, + "grad_norm": 1.117956201996482, + "learning_rate": 5.210911634533722e-07, + "loss": 0.2964, + "step": 3849 + }, + { + "epoch": 2.604871447902571, + "grad_norm": 1.1359805428603762, + "learning_rate": 5.193426528883788e-07, + "loss": 0.304, + "step": 3850 + }, + { + "epoch": 2.605548037889039, + "grad_norm": 1.146090712694019, + "learning_rate": 5.175969200894293e-07, + "loss": 0.3023, + "step": 3851 + }, + { + "epoch": 2.6062246278755072, + "grad_norm": 1.1620580556060454, + "learning_rate": 5.15853966138784e-07, + "loss": 0.3122, + "step": 3852 + }, + { + "epoch": 2.6069012178619757, + "grad_norm": 1.1298787014335747, + "learning_rate": 5.141137921169792e-07, + "loss": 0.3049, + "step": 3853 + }, + { + "epoch": 2.6075778078484437, + "grad_norm": 1.1399805341720364, + "learning_rate": 5.123763991028291e-07, + "loss": 0.3163, + "step": 3854 + }, + { + "epoch": 2.608254397834912, + "grad_norm": 1.1782093257327317, + "learning_rate": 5.106417881734244e-07, + "loss": 0.3225, + "step": 3855 + }, + { + "epoch": 2.60893098782138, + "grad_norm": 1.1557535236522072, + "learning_rate": 5.089099604041314e-07, + "loss": 0.3154, + "step": 3856 + }, + { + "epoch": 2.6096075778078482, + "grad_norm": 1.1067704143694668, + "learning_rate": 5.071809168685887e-07, + "loss": 0.3077, + "step": 3857 + }, + { + "epoch": 2.6102841677943167, + "grad_norm": 1.1208639217652279, + "learning_rate": 5.054546586387093e-07, + "loss": 0.3121, + "step": 3858 + }, + { + "epoch": 2.6109607577807847, + "grad_norm": 1.0972136590623127, + "learning_rate": 5.037311867846817e-07, + "loss": 0.2928, + "step": 3859 + }, + { + "epoch": 2.611637347767253, + "grad_norm": 1.1555705840082688, + "learning_rate": 5.020105023749644e-07, + "loss": 0.3045, + "step": 3860 + }, + { + "epoch": 2.6123139377537212, + "grad_norm": 1.1430388538618768, + "learning_rate": 5.002926064762908e-07, + "loss": 0.3076, + "step": 3861 + }, + { + "epoch": 2.6129905277401893, + "grad_norm": 1.1655175463541332, + "learning_rate": 4.985775001536619e-07, + "loss": 0.3173, + "step": 3862 + }, + { + "epoch": 2.6136671177266577, + "grad_norm": 1.1864998372109552, + "learning_rate": 4.968651844703514e-07, + "loss": 0.3313, + "step": 3863 + }, + { + "epoch": 2.6143437077131257, + "grad_norm": 1.160701416541282, + "learning_rate": 4.951556604879049e-07, + "loss": 0.3128, + "step": 3864 + }, + { + "epoch": 2.615020297699594, + "grad_norm": 1.1200346113836344, + "learning_rate": 4.934489292661326e-07, + "loss": 0.2989, + "step": 3865 + }, + { + "epoch": 2.6156968876860622, + "grad_norm": 1.1502479978378564, + "learning_rate": 4.917449918631162e-07, + "loss": 0.312, + "step": 3866 + }, + { + "epoch": 2.6163734776725303, + "grad_norm": 1.1481287995177303, + "learning_rate": 4.900438493352056e-07, + "loss": 0.3154, + "step": 3867 + }, + { + "epoch": 2.6170500676589987, + "grad_norm": 1.0868128739565612, + "learning_rate": 4.883455027370171e-07, + "loss": 0.3029, + "step": 3868 + }, + { + "epoch": 2.6177266576454667, + "grad_norm": 1.1093881770639489, + "learning_rate": 4.866499531214353e-07, + "loss": 0.3003, + "step": 3869 + }, + { + "epoch": 2.618403247631935, + "grad_norm": 1.1246919172934888, + "learning_rate": 4.849572015396081e-07, + "loss": 0.3161, + "step": 3870 + }, + { + "epoch": 2.6190798376184032, + "grad_norm": 1.0871026094196163, + "learning_rate": 4.832672490409513e-07, + "loss": 0.3095, + "step": 3871 + }, + { + "epoch": 2.6197564276048713, + "grad_norm": 1.152052591915853, + "learning_rate": 4.815800966731432e-07, + "loss": 0.3144, + "step": 3872 + }, + { + "epoch": 2.6204330175913397, + "grad_norm": 1.0746075049924178, + "learning_rate": 4.798957454821285e-07, + "loss": 0.2986, + "step": 3873 + }, + { + "epoch": 2.6211096075778078, + "grad_norm": 1.1605616867814241, + "learning_rate": 4.782141965121129e-07, + "loss": 0.3233, + "step": 3874 + }, + { + "epoch": 2.621786197564276, + "grad_norm": 1.1428749498555164, + "learning_rate": 4.7653545080556694e-07, + "loss": 0.3146, + "step": 3875 + }, + { + "epoch": 2.6224627875507442, + "grad_norm": 1.1420103428517616, + "learning_rate": 4.748595094032221e-07, + "loss": 0.3054, + "step": 3876 + }, + { + "epoch": 2.6231393775372123, + "grad_norm": 1.1193136035906281, + "learning_rate": 4.7318637334407335e-07, + "loss": 0.3024, + "step": 3877 + }, + { + "epoch": 2.6238159675236807, + "grad_norm": 1.1644387829831746, + "learning_rate": 4.715160436653732e-07, + "loss": 0.3276, + "step": 3878 + }, + { + "epoch": 2.6244925575101488, + "grad_norm": 1.1327022065160812, + "learning_rate": 4.698485214026349e-07, + "loss": 0.3087, + "step": 3879 + }, + { + "epoch": 2.6251691474966172, + "grad_norm": 1.1565690136087279, + "learning_rate": 4.6818380758963445e-07, + "loss": 0.3138, + "step": 3880 + }, + { + "epoch": 2.6258457374830853, + "grad_norm": 1.1361096516042588, + "learning_rate": 4.6652190325840396e-07, + "loss": 0.3013, + "step": 3881 + }, + { + "epoch": 2.6265223274695533, + "grad_norm": 1.0813213385441323, + "learning_rate": 4.6486280943923547e-07, + "loss": 0.3028, + "step": 3882 + }, + { + "epoch": 2.6271989174560217, + "grad_norm": 1.1901263997014517, + "learning_rate": 4.632065271606756e-07, + "loss": 0.3175, + "step": 3883 + }, + { + "epoch": 2.6278755074424898, + "grad_norm": 1.2003580096600932, + "learning_rate": 4.615530574495325e-07, + "loss": 0.3314, + "step": 3884 + }, + { + "epoch": 2.6285520974289582, + "grad_norm": 1.121270228678812, + "learning_rate": 4.5990240133086617e-07, + "loss": 0.3007, + "step": 3885 + }, + { + "epoch": 2.6292286874154263, + "grad_norm": 1.14675010293949, + "learning_rate": 4.582545598279964e-07, + "loss": 0.3079, + "step": 3886 + }, + { + "epoch": 2.6299052774018943, + "grad_norm": 1.162026647892871, + "learning_rate": 4.566095339624943e-07, + "loss": 0.3221, + "step": 3887 + }, + { + "epoch": 2.6305818673883627, + "grad_norm": 1.1839009765579254, + "learning_rate": 4.549673247541875e-07, + "loss": 0.3141, + "step": 3888 + }, + { + "epoch": 2.6312584573748308, + "grad_norm": 1.1185597241812426, + "learning_rate": 4.533279332211582e-07, + "loss": 0.3049, + "step": 3889 + }, + { + "epoch": 2.6319350473612992, + "grad_norm": 1.087754701605978, + "learning_rate": 4.516913603797407e-07, + "loss": 0.2988, + "step": 3890 + }, + { + "epoch": 2.6326116373477673, + "grad_norm": 1.138088046546554, + "learning_rate": 4.5005760724452173e-07, + "loss": 0.3194, + "step": 3891 + }, + { + "epoch": 2.6332882273342353, + "grad_norm": 1.1498375096555644, + "learning_rate": 4.484266748283389e-07, + "loss": 0.3211, + "step": 3892 + }, + { + "epoch": 2.6339648173207038, + "grad_norm": 1.161275366585453, + "learning_rate": 4.4679856414228394e-07, + "loss": 0.3146, + "step": 3893 + }, + { + "epoch": 2.634641407307172, + "grad_norm": 1.149623698273307, + "learning_rate": 4.4517327619569784e-07, + "loss": 0.3179, + "step": 3894 + }, + { + "epoch": 2.6353179972936402, + "grad_norm": 1.199573699525809, + "learning_rate": 4.435508119961701e-07, + "loss": 0.3232, + "step": 3895 + }, + { + "epoch": 2.6359945872801083, + "grad_norm": 1.1401399431685741, + "learning_rate": 4.4193117254954174e-07, + "loss": 0.3112, + "step": 3896 + }, + { + "epoch": 2.6366711772665763, + "grad_norm": 1.1588197843666872, + "learning_rate": 4.403143588599029e-07, + "loss": 0.3052, + "step": 3897 + }, + { + "epoch": 2.6373477672530448, + "grad_norm": 1.187777768827832, + "learning_rate": 4.387003719295896e-07, + "loss": 0.3195, + "step": 3898 + }, + { + "epoch": 2.638024357239513, + "grad_norm": 1.1078991330180872, + "learning_rate": 4.37089212759188e-07, + "loss": 0.3121, + "step": 3899 + }, + { + "epoch": 2.6387009472259813, + "grad_norm": 1.1890306336468357, + "learning_rate": 4.3548088234752814e-07, + "loss": 0.3202, + "step": 3900 + }, + { + "epoch": 2.6393775372124493, + "grad_norm": 1.1449029963949626, + "learning_rate": 4.3387538169168905e-07, + "loss": 0.3093, + "step": 3901 + }, + { + "epoch": 2.6400541271989173, + "grad_norm": 1.1606366928401124, + "learning_rate": 4.322727117869951e-07, + "loss": 0.3254, + "step": 3902 + }, + { + "epoch": 2.6407307171853858, + "grad_norm": 1.1356345185048866, + "learning_rate": 4.3067287362701606e-07, + "loss": 0.3151, + "step": 3903 + }, + { + "epoch": 2.641407307171854, + "grad_norm": 1.1381160631442553, + "learning_rate": 4.2907586820356337e-07, + "loss": 0.318, + "step": 3904 + }, + { + "epoch": 2.6420838971583223, + "grad_norm": 1.156409825974388, + "learning_rate": 4.2748169650669524e-07, + "loss": 0.3287, + "step": 3905 + }, + { + "epoch": 2.6427604871447903, + "grad_norm": 1.14925788312353, + "learning_rate": 4.258903595247116e-07, + "loss": 0.3123, + "step": 3906 + }, + { + "epoch": 2.6434370771312583, + "grad_norm": 1.1401115761497753, + "learning_rate": 4.2430185824415717e-07, + "loss": 0.3193, + "step": 3907 + }, + { + "epoch": 2.6441136671177268, + "grad_norm": 1.1116498482252897, + "learning_rate": 4.2271619364981474e-07, + "loss": 0.2982, + "step": 3908 + }, + { + "epoch": 2.644790257104195, + "grad_norm": 1.115715506840046, + "learning_rate": 4.211333667247125e-07, + "loss": 0.2977, + "step": 3909 + }, + { + "epoch": 2.6454668470906633, + "grad_norm": 1.0934925130360404, + "learning_rate": 4.195533784501177e-07, + "loss": 0.2971, + "step": 3910 + }, + { + "epoch": 2.6461434370771313, + "grad_norm": 1.1315352641263112, + "learning_rate": 4.179762298055384e-07, + "loss": 0.3123, + "step": 3911 + }, + { + "epoch": 2.6468200270635993, + "grad_norm": 1.1835451385787614, + "learning_rate": 4.164019217687215e-07, + "loss": 0.3256, + "step": 3912 + }, + { + "epoch": 2.647496617050068, + "grad_norm": 1.1684264719194448, + "learning_rate": 4.1483045531565183e-07, + "loss": 0.3154, + "step": 3913 + }, + { + "epoch": 2.648173207036536, + "grad_norm": 1.1456838214951601, + "learning_rate": 4.132618314205544e-07, + "loss": 0.3158, + "step": 3914 + }, + { + "epoch": 2.6488497970230043, + "grad_norm": 1.19382123557591, + "learning_rate": 4.1169605105589315e-07, + "loss": 0.3137, + "step": 3915 + }, + { + "epoch": 2.6495263870094723, + "grad_norm": 1.1475399291479902, + "learning_rate": 4.101331151923649e-07, + "loss": 0.3097, + "step": 3916 + }, + { + "epoch": 2.6502029769959403, + "grad_norm": 1.1420180852653452, + "learning_rate": 4.085730247989078e-07, + "loss": 0.3109, + "step": 3917 + }, + { + "epoch": 2.650879566982409, + "grad_norm": 1.0929143586978611, + "learning_rate": 4.070157808426928e-07, + "loss": 0.2943, + "step": 3918 + }, + { + "epoch": 2.651556156968877, + "grad_norm": 1.1198853637818, + "learning_rate": 4.0546138428912694e-07, + "loss": 0.3071, + "step": 3919 + }, + { + "epoch": 2.6522327469553453, + "grad_norm": 1.1754271856360132, + "learning_rate": 4.039098361018534e-07, + "loss": 0.3237, + "step": 3920 + }, + { + "epoch": 2.6529093369418133, + "grad_norm": 1.1545185588249969, + "learning_rate": 4.0236113724274716e-07, + "loss": 0.312, + "step": 3921 + }, + { + "epoch": 2.6535859269282813, + "grad_norm": 1.1252462422848621, + "learning_rate": 4.0081528867191854e-07, + "loss": 0.3031, + "step": 3922 + }, + { + "epoch": 2.65426251691475, + "grad_norm": 1.1312672578179293, + "learning_rate": 3.992722913477104e-07, + "loss": 0.3124, + "step": 3923 + }, + { + "epoch": 2.654939106901218, + "grad_norm": 1.1309409352072899, + "learning_rate": 3.9773214622669974e-07, + "loss": 0.3067, + "step": 3924 + }, + { + "epoch": 2.6556156968876863, + "grad_norm": 1.158834269778159, + "learning_rate": 3.9619485426369007e-07, + "loss": 0.3164, + "step": 3925 + }, + { + "epoch": 2.6562922868741543, + "grad_norm": 1.1211089267543728, + "learning_rate": 3.9466041641172126e-07, + "loss": 0.3017, + "step": 3926 + }, + { + "epoch": 2.6569688768606223, + "grad_norm": 1.1781346466914768, + "learning_rate": 3.9312883362206177e-07, + "loss": 0.3172, + "step": 3927 + }, + { + "epoch": 2.657645466847091, + "grad_norm": 1.1579686733853118, + "learning_rate": 3.916001068442116e-07, + "loss": 0.3061, + "step": 3928 + }, + { + "epoch": 2.658322056833559, + "grad_norm": 1.158083934820583, + "learning_rate": 3.90074237025897e-07, + "loss": 0.3276, + "step": 3929 + }, + { + "epoch": 2.6589986468200273, + "grad_norm": 1.2224644770566764, + "learning_rate": 3.885512251130763e-07, + "loss": 0.3301, + "step": 3930 + }, + { + "epoch": 2.6596752368064953, + "grad_norm": 1.154118992241007, + "learning_rate": 3.870310720499354e-07, + "loss": 0.3133, + "step": 3931 + }, + { + "epoch": 2.6603518267929633, + "grad_norm": 1.1285314214553759, + "learning_rate": 3.8551377877888487e-07, + "loss": 0.2973, + "step": 3932 + }, + { + "epoch": 2.661028416779432, + "grad_norm": 1.1350986384555009, + "learning_rate": 3.839993462405678e-07, + "loss": 0.3025, + "step": 3933 + }, + { + "epoch": 2.6617050067659, + "grad_norm": 1.1691896415739214, + "learning_rate": 3.8248777537384763e-07, + "loss": 0.3194, + "step": 3934 + }, + { + "epoch": 2.6623815967523683, + "grad_norm": 1.1226685200207052, + "learning_rate": 3.8097906711581864e-07, + "loss": 0.3002, + "step": 3935 + }, + { + "epoch": 2.6630581867388363, + "grad_norm": 1.0962648684689034, + "learning_rate": 3.794732224017994e-07, + "loss": 0.3012, + "step": 3936 + }, + { + "epoch": 2.6637347767253043, + "grad_norm": 1.1664936167721887, + "learning_rate": 3.7797024216533143e-07, + "loss": 0.3177, + "step": 3937 + }, + { + "epoch": 2.664411366711773, + "grad_norm": 1.1141677206329326, + "learning_rate": 3.764701273381799e-07, + "loss": 0.3, + "step": 3938 + }, + { + "epoch": 2.665087956698241, + "grad_norm": 1.1920885348497052, + "learning_rate": 3.7497287885033763e-07, + "loss": 0.3149, + "step": 3939 + }, + { + "epoch": 2.6657645466847093, + "grad_norm": 1.1332018548170577, + "learning_rate": 3.734784976300165e-07, + "loss": 0.3087, + "step": 3940 + }, + { + "epoch": 2.6664411366711773, + "grad_norm": 1.1736281956057233, + "learning_rate": 3.719869846036539e-07, + "loss": 0.3083, + "step": 3941 + }, + { + "epoch": 2.6671177266576453, + "grad_norm": 1.1631910018685614, + "learning_rate": 3.7049834069590507e-07, + "loss": 0.3161, + "step": 3942 + }, + { + "epoch": 2.667794316644114, + "grad_norm": 1.1391266383655867, + "learning_rate": 3.6901256682965123e-07, + "loss": 0.3114, + "step": 3943 + }, + { + "epoch": 2.668470906630582, + "grad_norm": 1.1383146039668341, + "learning_rate": 3.675296639259912e-07, + "loss": 0.3033, + "step": 3944 + }, + { + "epoch": 2.6691474966170503, + "grad_norm": 1.1379065654288862, + "learning_rate": 3.6604963290424453e-07, + "loss": 0.3279, + "step": 3945 + }, + { + "epoch": 2.6698240866035183, + "grad_norm": 1.112887933058892, + "learning_rate": 3.6457247468195233e-07, + "loss": 0.2973, + "step": 3946 + }, + { + "epoch": 2.6705006765899864, + "grad_norm": 1.0936236241153474, + "learning_rate": 3.6309819017487034e-07, + "loss": 0.2962, + "step": 3947 + }, + { + "epoch": 2.671177266576455, + "grad_norm": 1.1282324362379799, + "learning_rate": 3.6162678029697696e-07, + "loss": 0.3013, + "step": 3948 + }, + { + "epoch": 2.671853856562923, + "grad_norm": 1.1428927851246673, + "learning_rate": 3.60158245960468e-07, + "loss": 0.3158, + "step": 3949 + }, + { + "epoch": 2.6725304465493913, + "grad_norm": 1.0813084587548822, + "learning_rate": 3.5869258807575414e-07, + "loss": 0.295, + "step": 3950 + }, + { + "epoch": 2.6732070365358593, + "grad_norm": 1.166242524004221, + "learning_rate": 3.572298075514652e-07, + "loss": 0.305, + "step": 3951 + }, + { + "epoch": 2.6738836265223274, + "grad_norm": 1.1212011377351583, + "learning_rate": 3.557699052944447e-07, + "loss": 0.3134, + "step": 3952 + }, + { + "epoch": 2.674560216508796, + "grad_norm": 1.083384538000415, + "learning_rate": 3.5431288220975466e-07, + "loss": 0.2957, + "step": 3953 + }, + { + "epoch": 2.675236806495264, + "grad_norm": 1.180540152010333, + "learning_rate": 3.528587392006716e-07, + "loss": 0.3195, + "step": 3954 + }, + { + "epoch": 2.6759133964817323, + "grad_norm": 1.1121805932210829, + "learning_rate": 3.5140747716868375e-07, + "loss": 0.2995, + "step": 3955 + }, + { + "epoch": 2.6765899864682003, + "grad_norm": 1.1358071974123964, + "learning_rate": 3.499590970134964e-07, + "loss": 0.3166, + "step": 3956 + }, + { + "epoch": 2.6772665764546684, + "grad_norm": 1.146525898007564, + "learning_rate": 3.48513599633028e-07, + "loss": 0.3237, + "step": 3957 + }, + { + "epoch": 2.677943166441137, + "grad_norm": 1.154294420325088, + "learning_rate": 3.470709859234084e-07, + "loss": 0.3193, + "step": 3958 + }, + { + "epoch": 2.678619756427605, + "grad_norm": 1.1522985369452075, + "learning_rate": 3.4563125677897936e-07, + "loss": 0.3048, + "step": 3959 + }, + { + "epoch": 2.6792963464140733, + "grad_norm": 1.111198778038217, + "learning_rate": 3.4419441309229587e-07, + "loss": 0.306, + "step": 3960 + }, + { + "epoch": 2.6799729364005414, + "grad_norm": 1.1111159654198905, + "learning_rate": 3.427604557541242e-07, + "loss": 0.3118, + "step": 3961 + }, + { + "epoch": 2.6806495263870094, + "grad_norm": 1.130495969079568, + "learning_rate": 3.4132938565344054e-07, + "loss": 0.3022, + "step": 3962 + }, + { + "epoch": 2.6813261163734774, + "grad_norm": 1.1229295474030616, + "learning_rate": 3.3990120367743074e-07, + "loss": 0.2997, + "step": 3963 + }, + { + "epoch": 2.682002706359946, + "grad_norm": 1.1437934061736013, + "learning_rate": 3.38475910711491e-07, + "loss": 0.3105, + "step": 3964 + }, + { + "epoch": 2.6826792963464143, + "grad_norm": 1.1404437199848547, + "learning_rate": 3.370535076392256e-07, + "loss": 0.3138, + "step": 3965 + }, + { + "epoch": 2.6833558863328824, + "grad_norm": 1.184558328404599, + "learning_rate": 3.356339953424481e-07, + "loss": 0.3141, + "step": 3966 + }, + { + "epoch": 2.6840324763193504, + "grad_norm": 1.1706284914906093, + "learning_rate": 3.342173747011801e-07, + "loss": 0.3273, + "step": 3967 + }, + { + "epoch": 2.6847090663058184, + "grad_norm": 1.134593138015031, + "learning_rate": 3.3280364659364903e-07, + "loss": 0.3031, + "step": 3968 + }, + { + "epoch": 2.685385656292287, + "grad_norm": 1.1892073800544944, + "learning_rate": 3.313928118962906e-07, + "loss": 0.3234, + "step": 3969 + }, + { + "epoch": 2.6860622462787553, + "grad_norm": 1.1470210857615581, + "learning_rate": 3.299848714837473e-07, + "loss": 0.3112, + "step": 3970 + }, + { + "epoch": 2.6867388362652234, + "grad_norm": 1.1137745498498943, + "learning_rate": 3.285798262288653e-07, + "loss": 0.3043, + "step": 3971 + }, + { + "epoch": 2.6874154262516914, + "grad_norm": 1.1740980544937267, + "learning_rate": 3.271776770026963e-07, + "loss": 0.3137, + "step": 3972 + }, + { + "epoch": 2.6880920162381594, + "grad_norm": 1.1455006056831107, + "learning_rate": 3.2577842467449773e-07, + "loss": 0.3092, + "step": 3973 + }, + { + "epoch": 2.688768606224628, + "grad_norm": 1.149254969088589, + "learning_rate": 3.243820701117306e-07, + "loss": 0.3188, + "step": 3974 + }, + { + "epoch": 2.6894451962110963, + "grad_norm": 1.1212664459809358, + "learning_rate": 3.229886141800609e-07, + "loss": 0.3115, + "step": 3975 + }, + { + "epoch": 2.6901217861975644, + "grad_norm": 1.0991914124526325, + "learning_rate": 3.2159805774335364e-07, + "loss": 0.3069, + "step": 3976 + }, + { + "epoch": 2.6907983761840324, + "grad_norm": 1.1249068903027997, + "learning_rate": 3.2021040166368145e-07, + "loss": 0.3051, + "step": 3977 + }, + { + "epoch": 2.6914749661705004, + "grad_norm": 1.1283914367979384, + "learning_rate": 3.18825646801314e-07, + "loss": 0.3121, + "step": 3978 + }, + { + "epoch": 2.692151556156969, + "grad_norm": 1.118970879435845, + "learning_rate": 3.174437940147268e-07, + "loss": 0.3076, + "step": 3979 + }, + { + "epoch": 2.6928281461434374, + "grad_norm": 1.1018902269904114, + "learning_rate": 3.160648441605918e-07, + "loss": 0.299, + "step": 3980 + }, + { + "epoch": 2.6935047361299054, + "grad_norm": 1.1649370015384704, + "learning_rate": 3.146887980937852e-07, + "loss": 0.3148, + "step": 3981 + }, + { + "epoch": 2.6941813261163734, + "grad_norm": 1.1177723940804327, + "learning_rate": 3.133156566673806e-07, + "loss": 0.3054, + "step": 3982 + }, + { + "epoch": 2.6948579161028414, + "grad_norm": 1.1573227601445006, + "learning_rate": 3.119454207326533e-07, + "loss": 0.316, + "step": 3983 + }, + { + "epoch": 2.69553450608931, + "grad_norm": 1.1521274288041907, + "learning_rate": 3.105780911390738e-07, + "loss": 0.314, + "step": 3984 + }, + { + "epoch": 2.696211096075778, + "grad_norm": 1.1504808865411964, + "learning_rate": 3.0921366873431337e-07, + "loss": 0.3058, + "step": 3985 + }, + { + "epoch": 2.6968876860622464, + "grad_norm": 1.142037633162387, + "learning_rate": 3.0785215436423986e-07, + "loss": 0.3138, + "step": 3986 + }, + { + "epoch": 2.6975642760487144, + "grad_norm": 1.1975045807010414, + "learning_rate": 3.0649354887291927e-07, + "loss": 0.3181, + "step": 3987 + }, + { + "epoch": 2.6982408660351824, + "grad_norm": 1.1534965573568399, + "learning_rate": 3.05137853102615e-07, + "loss": 0.3243, + "step": 3988 + }, + { + "epoch": 2.698917456021651, + "grad_norm": 1.1211567734205146, + "learning_rate": 3.037850678937831e-07, + "loss": 0.3135, + "step": 3989 + }, + { + "epoch": 2.699594046008119, + "grad_norm": 1.1120909076464152, + "learning_rate": 3.0243519408507894e-07, + "loss": 0.2967, + "step": 3990 + }, + { + "epoch": 2.7002706359945874, + "grad_norm": 1.1371430477916977, + "learning_rate": 3.0108823251335183e-07, + "loss": 0.3094, + "step": 3991 + }, + { + "epoch": 2.7009472259810554, + "grad_norm": 1.1738354999672715, + "learning_rate": 2.997441840136445e-07, + "loss": 0.3186, + "step": 3992 + }, + { + "epoch": 2.7016238159675234, + "grad_norm": 1.0896630238956622, + "learning_rate": 2.984030494191942e-07, + "loss": 0.2936, + "step": 3993 + }, + { + "epoch": 2.702300405953992, + "grad_norm": 1.1838252999459855, + "learning_rate": 2.97064829561432e-07, + "loss": 0.3123, + "step": 3994 + }, + { + "epoch": 2.70297699594046, + "grad_norm": 1.1357917165707252, + "learning_rate": 2.957295252699832e-07, + "loss": 0.31, + "step": 3995 + }, + { + "epoch": 2.7036535859269284, + "grad_norm": 1.1507316397604836, + "learning_rate": 2.9439713737266504e-07, + "loss": 0.3156, + "step": 3996 + }, + { + "epoch": 2.7043301759133964, + "grad_norm": 1.1306261457857945, + "learning_rate": 2.930676666954846e-07, + "loss": 0.2991, + "step": 3997 + }, + { + "epoch": 2.7050067658998644, + "grad_norm": 1.1379277221353257, + "learning_rate": 2.917411140626425e-07, + "loss": 0.314, + "step": 3998 + }, + { + "epoch": 2.705683355886333, + "grad_norm": 1.116247752238892, + "learning_rate": 2.904174802965293e-07, + "loss": 0.2937, + "step": 3999 + }, + { + "epoch": 2.706359945872801, + "grad_norm": 1.1468361273310435, + "learning_rate": 2.8909676621772853e-07, + "loss": 0.3152, + "step": 4000 + }, + { + "epoch": 2.7070365358592694, + "grad_norm": 1.139665575219445, + "learning_rate": 2.877789726450092e-07, + "loss": 0.3176, + "step": 4001 + }, + { + "epoch": 2.7077131258457374, + "grad_norm": 1.166998089614663, + "learning_rate": 2.864641003953339e-07, + "loss": 0.3132, + "step": 4002 + }, + { + "epoch": 2.7083897158322054, + "grad_norm": 1.1376079208582652, + "learning_rate": 2.8515215028385223e-07, + "loss": 0.3099, + "step": 4003 + }, + { + "epoch": 2.709066305818674, + "grad_norm": 1.1625053920853172, + "learning_rate": 2.8384312312390306e-07, + "loss": 0.3088, + "step": 4004 + }, + { + "epoch": 2.709742895805142, + "grad_norm": 1.1248582279451893, + "learning_rate": 2.8253701972701275e-07, + "loss": 0.3066, + "step": 4005 + }, + { + "epoch": 2.7104194857916104, + "grad_norm": 1.1216692215532875, + "learning_rate": 2.8123384090289307e-07, + "loss": 0.2862, + "step": 4006 + }, + { + "epoch": 2.7110960757780784, + "grad_norm": 1.1556087009946527, + "learning_rate": 2.799335874594461e-07, + "loss": 0.3045, + "step": 4007 + }, + { + "epoch": 2.7117726657645465, + "grad_norm": 1.1589825412941182, + "learning_rate": 2.7863626020275867e-07, + "loss": 0.3166, + "step": 4008 + }, + { + "epoch": 2.712449255751015, + "grad_norm": 1.0787311167365927, + "learning_rate": 2.773418599371047e-07, + "loss": 0.2921, + "step": 4009 + }, + { + "epoch": 2.713125845737483, + "grad_norm": 1.1914457524471502, + "learning_rate": 2.7605038746494063e-07, + "loss": 0.3152, + "step": 4010 + }, + { + "epoch": 2.7138024357239514, + "grad_norm": 1.1592472178730937, + "learning_rate": 2.7476184358691206e-07, + "loss": 0.3131, + "step": 4011 + }, + { + "epoch": 2.7144790257104194, + "grad_norm": 1.166114882399594, + "learning_rate": 2.7347622910184445e-07, + "loss": 0.3199, + "step": 4012 + }, + { + "epoch": 2.7151556156968875, + "grad_norm": 1.1919275580767696, + "learning_rate": 2.7219354480675144e-07, + "loss": 0.3391, + "step": 4013 + }, + { + "epoch": 2.715832205683356, + "grad_norm": 1.1382464826637717, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.2977, + "step": 4014 + }, + { + "epoch": 2.716508795669824, + "grad_norm": 1.1860334518820388, + "learning_rate": 2.696369699654489e-07, + "loss": 0.3163, + "step": 4015 + }, + { + "epoch": 2.7171853856562924, + "grad_norm": 1.1305080812300905, + "learning_rate": 2.6836308100417874e-07, + "loss": 0.3179, + "step": 4016 + }, + { + "epoch": 2.7178619756427604, + "grad_norm": 1.1478005912251539, + "learning_rate": 2.670921254027592e-07, + "loss": 0.3179, + "step": 4017 + }, + { + "epoch": 2.7185385656292285, + "grad_norm": 1.249165660172556, + "learning_rate": 2.6582410394911327e-07, + "loss": 0.3037, + "step": 4018 + }, + { + "epoch": 2.719215155615697, + "grad_norm": 1.1812469984608873, + "learning_rate": 2.6455901742934556e-07, + "loss": 0.3291, + "step": 4019 + }, + { + "epoch": 2.719891745602165, + "grad_norm": 1.1394091565896398, + "learning_rate": 2.6329686662774247e-07, + "loss": 0.3102, + "step": 4020 + }, + { + "epoch": 2.7205683355886334, + "grad_norm": 1.172131940690211, + "learning_rate": 2.620376523267698e-07, + "loss": 0.3186, + "step": 4021 + }, + { + "epoch": 2.7212449255751014, + "grad_norm": 1.131089064937029, + "learning_rate": 2.6078137530707146e-07, + "loss": 0.3118, + "step": 4022 + }, + { + "epoch": 2.7219215155615695, + "grad_norm": 1.080660036279916, + "learning_rate": 2.595280363474717e-07, + "loss": 0.2894, + "step": 4023 + }, + { + "epoch": 2.722598105548038, + "grad_norm": 1.1520481513191763, + "learning_rate": 2.582776362249739e-07, + "loss": 0.3103, + "step": 4024 + }, + { + "epoch": 2.723274695534506, + "grad_norm": 1.1409844303404006, + "learning_rate": 2.5703017571475755e-07, + "loss": 0.3115, + "step": 4025 + }, + { + "epoch": 2.7239512855209744, + "grad_norm": 1.1487512510352567, + "learning_rate": 2.5578565559018276e-07, + "loss": 0.3052, + "step": 4026 + }, + { + "epoch": 2.7246278755074425, + "grad_norm": 1.1594837220607868, + "learning_rate": 2.545440766227825e-07, + "loss": 0.3128, + "step": 4027 + }, + { + "epoch": 2.7253044654939105, + "grad_norm": 1.1184293351288757, + "learning_rate": 2.5330543958227036e-07, + "loss": 0.3132, + "step": 4028 + }, + { + "epoch": 2.725981055480379, + "grad_norm": 1.1164129239627643, + "learning_rate": 2.520697452365345e-07, + "loss": 0.2977, + "step": 4029 + }, + { + "epoch": 2.726657645466847, + "grad_norm": 1.182607668698762, + "learning_rate": 2.508369943516387e-07, + "loss": 0.3246, + "step": 4030 + }, + { + "epoch": 2.7273342354533154, + "grad_norm": 1.1640272681730328, + "learning_rate": 2.4960718769182214e-07, + "loss": 0.3267, + "step": 4031 + }, + { + "epoch": 2.7280108254397835, + "grad_norm": 1.1600039951419086, + "learning_rate": 2.483803260194978e-07, + "loss": 0.3112, + "step": 4032 + }, + { + "epoch": 2.7286874154262515, + "grad_norm": 1.172072214532289, + "learning_rate": 2.4715641009525446e-07, + "loss": 0.3123, + "step": 4033 + }, + { + "epoch": 2.72936400541272, + "grad_norm": 1.1617898967194897, + "learning_rate": 2.459354406778547e-07, + "loss": 0.3123, + "step": 4034 + }, + { + "epoch": 2.730040595399188, + "grad_norm": 1.1424919602451964, + "learning_rate": 2.447174185242324e-07, + "loss": 0.3065, + "step": 4035 + }, + { + "epoch": 2.7307171853856564, + "grad_norm": 1.1260938582436268, + "learning_rate": 2.4350234438949625e-07, + "loss": 0.2956, + "step": 4036 + }, + { + "epoch": 2.7313937753721245, + "grad_norm": 1.1743016504532113, + "learning_rate": 2.4229021902692663e-07, + "loss": 0.3195, + "step": 4037 + }, + { + "epoch": 2.7320703653585925, + "grad_norm": 1.101385386007145, + "learning_rate": 2.4108104318797674e-07, + "loss": 0.2995, + "step": 4038 + }, + { + "epoch": 2.732746955345061, + "grad_norm": 1.1269255833849812, + "learning_rate": 2.3987481762226984e-07, + "loss": 0.3074, + "step": 4039 + }, + { + "epoch": 2.733423545331529, + "grad_norm": 1.125855092520463, + "learning_rate": 2.3867154307759986e-07, + "loss": 0.3037, + "step": 4040 + }, + { + "epoch": 2.7341001353179974, + "grad_norm": 1.100393441841296, + "learning_rate": 2.3747122029993296e-07, + "loss": 0.3047, + "step": 4041 + }, + { + "epoch": 2.7347767253044655, + "grad_norm": 1.160766957345758, + "learning_rate": 2.3627385003340552e-07, + "loss": 0.3086, + "step": 4042 + }, + { + "epoch": 2.7354533152909335, + "grad_norm": 1.1635387600353782, + "learning_rate": 2.3507943302032045e-07, + "loss": 0.3233, + "step": 4043 + }, + { + "epoch": 2.736129905277402, + "grad_norm": 1.1728497369695257, + "learning_rate": 2.3388797000115427e-07, + "loss": 0.3234, + "step": 4044 + }, + { + "epoch": 2.73680649526387, + "grad_norm": 1.1728148975131911, + "learning_rate": 2.3269946171454727e-07, + "loss": 0.3157, + "step": 4045 + }, + { + "epoch": 2.7374830852503385, + "grad_norm": 1.1095476346839723, + "learning_rate": 2.3151390889731285e-07, + "loss": 0.3072, + "step": 4046 + }, + { + "epoch": 2.7381596752368065, + "grad_norm": 1.0998786334559265, + "learning_rate": 2.3033131228442863e-07, + "loss": 0.2985, + "step": 4047 + }, + { + "epoch": 2.7388362652232745, + "grad_norm": 1.1056490613710948, + "learning_rate": 2.2915167260904092e-07, + "loss": 0.302, + "step": 4048 + }, + { + "epoch": 2.739512855209743, + "grad_norm": 1.1118695733654114, + "learning_rate": 2.2797499060246253e-07, + "loss": 0.3082, + "step": 4049 + }, + { + "epoch": 2.740189445196211, + "grad_norm": 1.1508815373411923, + "learning_rate": 2.2680126699417383e-07, + "loss": 0.3112, + "step": 4050 + }, + { + "epoch": 2.7408660351826795, + "grad_norm": 1.1552472996586618, + "learning_rate": 2.256305025118194e-07, + "loss": 0.3103, + "step": 4051 + }, + { + "epoch": 2.7415426251691475, + "grad_norm": 1.1217024972889118, + "learning_rate": 2.244626978812109e-07, + "loss": 0.3135, + "step": 4052 + }, + { + "epoch": 2.7422192151556155, + "grad_norm": 1.1745396521279396, + "learning_rate": 2.2329785382632253e-07, + "loss": 0.3235, + "step": 4053 + }, + { + "epoch": 2.742895805142084, + "grad_norm": 1.1040598537793167, + "learning_rate": 2.2213597106929608e-07, + "loss": 0.3004, + "step": 4054 + }, + { + "epoch": 2.743572395128552, + "grad_norm": 1.185141353204137, + "learning_rate": 2.2097705033043703e-07, + "loss": 0.3203, + "step": 4055 + }, + { + "epoch": 2.7442489851150205, + "grad_norm": 1.1892370342215453, + "learning_rate": 2.198210923282118e-07, + "loss": 0.3119, + "step": 4056 + }, + { + "epoch": 2.7449255751014885, + "grad_norm": 1.1374531970626292, + "learning_rate": 2.1866809777925323e-07, + "loss": 0.3144, + "step": 4057 + }, + { + "epoch": 2.7456021650879565, + "grad_norm": 1.1202972350254548, + "learning_rate": 2.1751806739835624e-07, + "loss": 0.2989, + "step": 4058 + }, + { + "epoch": 2.746278755074425, + "grad_norm": 1.1168695709129022, + "learning_rate": 2.163710018984766e-07, + "loss": 0.3048, + "step": 4059 + }, + { + "epoch": 2.746955345060893, + "grad_norm": 1.2040695708185165, + "learning_rate": 2.1522690199073382e-07, + "loss": 0.3256, + "step": 4060 + }, + { + "epoch": 2.7476319350473615, + "grad_norm": 1.1507883712785076, + "learning_rate": 2.140857683844072e-07, + "loss": 0.3148, + "step": 4061 + }, + { + "epoch": 2.7483085250338295, + "grad_norm": 1.0805625359510496, + "learning_rate": 2.1294760178693918e-07, + "loss": 0.2851, + "step": 4062 + }, + { + "epoch": 2.7489851150202975, + "grad_norm": 1.0821339895994082, + "learning_rate": 2.118124029039309e-07, + "loss": 0.2893, + "step": 4063 + }, + { + "epoch": 2.749661705006766, + "grad_norm": 1.1465099231279592, + "learning_rate": 2.1068017243914663e-07, + "loss": 0.3247, + "step": 4064 + }, + { + "epoch": 2.750338294993234, + "grad_norm": 1.1445379537695655, + "learning_rate": 2.0955091109450488e-07, + "loss": 0.3118, + "step": 4065 + }, + { + "epoch": 2.7510148849797025, + "grad_norm": 1.1380250051116825, + "learning_rate": 2.0842461957008841e-07, + "loss": 0.2958, + "step": 4066 + }, + { + "epoch": 2.7516914749661705, + "grad_norm": 1.136764005997454, + "learning_rate": 2.0730129856413705e-07, + "loss": 0.3002, + "step": 4067 + }, + { + "epoch": 2.7523680649526385, + "grad_norm": 1.1440071360904445, + "learning_rate": 2.061809487730504e-07, + "loss": 0.3051, + "step": 4068 + }, + { + "epoch": 2.753044654939107, + "grad_norm": 1.1675675818474753, + "learning_rate": 2.050635708913834e-07, + "loss": 0.3153, + "step": 4069 + }, + { + "epoch": 2.753721244925575, + "grad_norm": 1.131607828217165, + "learning_rate": 2.0394916561185085e-07, + "loss": 0.3136, + "step": 4070 + }, + { + "epoch": 2.7543978349120435, + "grad_norm": 1.1720738901947765, + "learning_rate": 2.0283773362532455e-07, + "loss": 0.3218, + "step": 4071 + }, + { + "epoch": 2.7550744248985115, + "grad_norm": 1.166079943261266, + "learning_rate": 2.0172927562083056e-07, + "loss": 0.3156, + "step": 4072 + }, + { + "epoch": 2.7557510148849795, + "grad_norm": 1.0881044885961269, + "learning_rate": 2.006237922855553e-07, + "loss": 0.2975, + "step": 4073 + }, + { + "epoch": 2.756427604871448, + "grad_norm": 1.132434513163992, + "learning_rate": 1.9952128430483718e-07, + "loss": 0.3097, + "step": 4074 + }, + { + "epoch": 2.757104194857916, + "grad_norm": 1.1962221017978052, + "learning_rate": 1.9842175236217176e-07, + "loss": 0.3232, + "step": 4075 + }, + { + "epoch": 2.7577807848443845, + "grad_norm": 1.1314845406806597, + "learning_rate": 1.973251971392115e-07, + "loss": 0.3042, + "step": 4076 + }, + { + "epoch": 2.7584573748308525, + "grad_norm": 1.153865224777945, + "learning_rate": 1.962316193157593e-07, + "loss": 0.3152, + "step": 4077 + }, + { + "epoch": 2.7591339648173205, + "grad_norm": 1.1281332265264288, + "learning_rate": 1.9514101956977617e-07, + "loss": 0.3109, + "step": 4078 + }, + { + "epoch": 2.759810554803789, + "grad_norm": 1.1427027397151777, + "learning_rate": 1.9405339857737348e-07, + "loss": 0.2993, + "step": 4079 + }, + { + "epoch": 2.760487144790257, + "grad_norm": 1.1206618432879296, + "learning_rate": 1.9296875701281858e-07, + "loss": 0.3061, + "step": 4080 + }, + { + "epoch": 2.7611637347767255, + "grad_norm": 1.1269631300621363, + "learning_rate": 1.9188709554853137e-07, + "loss": 0.2995, + "step": 4081 + }, + { + "epoch": 2.7618403247631935, + "grad_norm": 1.1635517077821298, + "learning_rate": 1.9080841485508205e-07, + "loss": 0.3119, + "step": 4082 + }, + { + "epoch": 2.7625169147496615, + "grad_norm": 1.1708843569601985, + "learning_rate": 1.8973271560119576e-07, + "loss": 0.3186, + "step": 4083 + }, + { + "epoch": 2.76319350473613, + "grad_norm": 1.1710324106069068, + "learning_rate": 1.8865999845374794e-07, + "loss": 0.3067, + "step": 4084 + }, + { + "epoch": 2.763870094722598, + "grad_norm": 1.1225365775772305, + "learning_rate": 1.8759026407776605e-07, + "loss": 0.3107, + "step": 4085 + }, + { + "epoch": 2.7645466847090665, + "grad_norm": 1.0651871022090058, + "learning_rate": 1.8652351313642568e-07, + "loss": 0.2868, + "step": 4086 + }, + { + "epoch": 2.7652232746955345, + "grad_norm": 1.116776176522001, + "learning_rate": 1.8545974629105624e-07, + "loss": 0.3085, + "step": 4087 + }, + { + "epoch": 2.7658998646820026, + "grad_norm": 1.1364118546211295, + "learning_rate": 1.8439896420113569e-07, + "loss": 0.3015, + "step": 4088 + }, + { + "epoch": 2.766576454668471, + "grad_norm": 1.227473079179004, + "learning_rate": 1.8334116752429243e-07, + "loss": 0.3084, + "step": 4089 + }, + { + "epoch": 2.767253044654939, + "grad_norm": 1.1578007786831845, + "learning_rate": 1.8228635691630191e-07, + "loss": 0.312, + "step": 4090 + }, + { + "epoch": 2.7679296346414075, + "grad_norm": 1.1333879687887103, + "learning_rate": 1.812345330310916e-07, + "loss": 0.3163, + "step": 4091 + }, + { + "epoch": 2.7686062246278755, + "grad_norm": 1.1771525934157423, + "learning_rate": 1.801856965207338e-07, + "loss": 0.318, + "step": 4092 + }, + { + "epoch": 2.7692828146143436, + "grad_norm": 1.1258856212722625, + "learning_rate": 1.791398480354517e-07, + "loss": 0.3053, + "step": 4093 + }, + { + "epoch": 2.769959404600812, + "grad_norm": 1.1185884402081516, + "learning_rate": 1.78096988223615e-07, + "loss": 0.2996, + "step": 4094 + }, + { + "epoch": 2.77063599458728, + "grad_norm": 1.1276149308890613, + "learning_rate": 1.770571177317404e-07, + "loss": 0.3137, + "step": 4095 + }, + { + "epoch": 2.7713125845737485, + "grad_norm": 1.1124886369718174, + "learning_rate": 1.7602023720449114e-07, + "loss": 0.2968, + "step": 4096 + }, + { + "epoch": 2.7719891745602165, + "grad_norm": 1.1753353901708072, + "learning_rate": 1.74986347284678e-07, + "loss": 0.3122, + "step": 4097 + }, + { + "epoch": 2.7726657645466846, + "grad_norm": 1.1020443639067645, + "learning_rate": 1.7395544861325718e-07, + "loss": 0.2969, + "step": 4098 + }, + { + "epoch": 2.773342354533153, + "grad_norm": 1.180845965506115, + "learning_rate": 1.7292754182932914e-07, + "loss": 0.3105, + "step": 4099 + }, + { + "epoch": 2.774018944519621, + "grad_norm": 1.1508204490656577, + "learning_rate": 1.7190262757014076e-07, + "loss": 0.3149, + "step": 4100 + }, + { + "epoch": 2.7746955345060895, + "grad_norm": 1.1444752421633846, + "learning_rate": 1.7088070647108433e-07, + "loss": 0.3135, + "step": 4101 + }, + { + "epoch": 2.7753721244925575, + "grad_norm": 1.103582666802079, + "learning_rate": 1.6986177916569646e-07, + "loss": 0.3024, + "step": 4102 + }, + { + "epoch": 2.7760487144790256, + "grad_norm": 1.1541460657030975, + "learning_rate": 1.688458462856557e-07, + "loss": 0.3144, + "step": 4103 + }, + { + "epoch": 2.776725304465494, + "grad_norm": 1.1641081686268657, + "learning_rate": 1.6783290846078714e-07, + "loss": 0.3279, + "step": 4104 + }, + { + "epoch": 2.777401894451962, + "grad_norm": 1.0654124923045716, + "learning_rate": 1.6682296631905626e-07, + "loss": 0.2975, + "step": 4105 + }, + { + "epoch": 2.7780784844384305, + "grad_norm": 1.1435681623690126, + "learning_rate": 1.6581602048657387e-07, + "loss": 0.3078, + "step": 4106 + }, + { + "epoch": 2.7787550744248986, + "grad_norm": 1.157534135780399, + "learning_rate": 1.648120715875906e-07, + "loss": 0.3024, + "step": 4107 + }, + { + "epoch": 2.7794316644113666, + "grad_norm": 1.1346443620652469, + "learning_rate": 1.6381112024450196e-07, + "loss": 0.3096, + "step": 4108 + }, + { + "epoch": 2.780108254397835, + "grad_norm": 1.1859527271884611, + "learning_rate": 1.6281316707784377e-07, + "loss": 0.3219, + "step": 4109 + }, + { + "epoch": 2.780784844384303, + "grad_norm": 1.1083880407225266, + "learning_rate": 1.618182127062934e-07, + "loss": 0.3003, + "step": 4110 + }, + { + "epoch": 2.7814614343707715, + "grad_norm": 1.1909965957014754, + "learning_rate": 1.6082625774666793e-07, + "loss": 0.3149, + "step": 4111 + }, + { + "epoch": 2.7821380243572396, + "grad_norm": 1.1109292979066097, + "learning_rate": 1.5983730281392663e-07, + "loss": 0.3072, + "step": 4112 + }, + { + "epoch": 2.7828146143437076, + "grad_norm": 1.1317810099916719, + "learning_rate": 1.588513485211679e-07, + "loss": 0.3235, + "step": 4113 + }, + { + "epoch": 2.783491204330176, + "grad_norm": 1.1060728758762992, + "learning_rate": 1.5786839547963008e-07, + "loss": 0.3014, + "step": 4114 + }, + { + "epoch": 2.784167794316644, + "grad_norm": 1.151548816364964, + "learning_rate": 1.5688844429869232e-07, + "loss": 0.3159, + "step": 4115 + }, + { + "epoch": 2.7848443843031125, + "grad_norm": 1.1795173923464533, + "learning_rate": 1.5591149558587037e-07, + "loss": 0.3236, + "step": 4116 + }, + { + "epoch": 2.7855209742895806, + "grad_norm": 1.1230554910539787, + "learning_rate": 1.5493754994681977e-07, + "loss": 0.3006, + "step": 4117 + }, + { + "epoch": 2.7861975642760486, + "grad_norm": 1.1274673045913184, + "learning_rate": 1.539666079853358e-07, + "loss": 0.3051, + "step": 4118 + }, + { + "epoch": 2.786874154262517, + "grad_norm": 1.093550164196601, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.3016, + "step": 4119 + }, + { + "epoch": 2.787550744248985, + "grad_norm": 1.1354627153320578, + "learning_rate": 1.5203373750092676e-07, + "loss": 0.2982, + "step": 4120 + }, + { + "epoch": 2.7882273342354535, + "grad_norm": 1.1272910263619973, + "learning_rate": 1.5107181017627813e-07, + "loss": 0.2953, + "step": 4121 + }, + { + "epoch": 2.7889039242219216, + "grad_norm": 1.1757951011000707, + "learning_rate": 1.5011288892574526e-07, + "loss": 0.3242, + "step": 4122 + }, + { + "epoch": 2.7895805142083896, + "grad_norm": 1.1516008767284027, + "learning_rate": 1.4915697434380816e-07, + "loss": 0.3157, + "step": 4123 + }, + { + "epoch": 2.790257104194858, + "grad_norm": 1.1053151641072043, + "learning_rate": 1.4820406702308165e-07, + "loss": 0.3041, + "step": 4124 + }, + { + "epoch": 2.790933694181326, + "grad_norm": 1.1537120198382154, + "learning_rate": 1.4725416755431655e-07, + "loss": 0.3159, + "step": 4125 + }, + { + "epoch": 2.7916102841677946, + "grad_norm": 1.1182513124253504, + "learning_rate": 1.463072765264001e-07, + "loss": 0.3161, + "step": 4126 + }, + { + "epoch": 2.7922868741542626, + "grad_norm": 1.1111189281912415, + "learning_rate": 1.4536339452635385e-07, + "loss": 0.3024, + "step": 4127 + }, + { + "epoch": 2.7929634641407306, + "grad_norm": 1.0917773822743122, + "learning_rate": 1.444225221393325e-07, + "loss": 0.2948, + "step": 4128 + }, + { + "epoch": 2.793640054127199, + "grad_norm": 1.1243183403401535, + "learning_rate": 1.4348465994862782e-07, + "loss": 0.3141, + "step": 4129 + }, + { + "epoch": 2.794316644113667, + "grad_norm": 1.1228307517491027, + "learning_rate": 1.4254980853566248e-07, + "loss": 0.3115, + "step": 4130 + }, + { + "epoch": 2.7949932341001356, + "grad_norm": 1.1239836066876274, + "learning_rate": 1.4161796847999566e-07, + "loss": 0.2951, + "step": 4131 + }, + { + "epoch": 2.7956698240866036, + "grad_norm": 1.2078843049461978, + "learning_rate": 1.4068914035931635e-07, + "loss": 0.3128, + "step": 4132 + }, + { + "epoch": 2.7963464140730716, + "grad_norm": 1.0981294995686268, + "learning_rate": 1.3976332474944842e-07, + "loss": 0.2987, + "step": 4133 + }, + { + "epoch": 2.79702300405954, + "grad_norm": 1.1587213188731176, + "learning_rate": 1.388405222243472e-07, + "loss": 0.3156, + "step": 4134 + }, + { + "epoch": 2.797699594046008, + "grad_norm": 1.1846858692026618, + "learning_rate": 1.3792073335610111e-07, + "loss": 0.3175, + "step": 4135 + }, + { + "epoch": 2.7983761840324766, + "grad_norm": 1.1258815737012846, + "learning_rate": 1.3700395871493023e-07, + "loss": 0.3107, + "step": 4136 + }, + { + "epoch": 2.7990527740189446, + "grad_norm": 1.11739619207689, + "learning_rate": 1.360901988691843e-07, + "loss": 0.3015, + "step": 4137 + }, + { + "epoch": 2.7997293640054126, + "grad_norm": 1.1153167164708462, + "learning_rate": 1.3517945438534629e-07, + "loss": 0.3036, + "step": 4138 + }, + { + "epoch": 2.800405953991881, + "grad_norm": 1.148846921294776, + "learning_rate": 1.342717258280274e-07, + "loss": 0.3078, + "step": 4139 + }, + { + "epoch": 2.801082543978349, + "grad_norm": 1.1362173734992982, + "learning_rate": 1.333670137599713e-07, + "loss": 0.3078, + "step": 4140 + }, + { + "epoch": 2.8017591339648176, + "grad_norm": 1.145705430890723, + "learning_rate": 1.3246531874204994e-07, + "loss": 0.3106, + "step": 4141 + }, + { + "epoch": 2.8024357239512856, + "grad_norm": 1.1355565467776323, + "learning_rate": 1.3156664133326614e-07, + "loss": 0.3106, + "step": 4142 + }, + { + "epoch": 2.8031123139377536, + "grad_norm": 1.0968762107052787, + "learning_rate": 1.3067098209075202e-07, + "loss": 0.2931, + "step": 4143 + }, + { + "epoch": 2.803788903924222, + "grad_norm": 1.181016506230049, + "learning_rate": 1.2977834156976733e-07, + "loss": 0.3183, + "step": 4144 + }, + { + "epoch": 2.80446549391069, + "grad_norm": 1.168940594706016, + "learning_rate": 1.2888872032370103e-07, + "loss": 0.3142, + "step": 4145 + }, + { + "epoch": 2.8051420838971586, + "grad_norm": 1.1187113158210438, + "learning_rate": 1.280021189040709e-07, + "loss": 0.3065, + "step": 4146 + }, + { + "epoch": 2.8058186738836266, + "grad_norm": 1.1074702361941224, + "learning_rate": 1.2711853786052108e-07, + "loss": 0.2937, + "step": 4147 + }, + { + "epoch": 2.8064952638700946, + "grad_norm": 1.1815011731635956, + "learning_rate": 1.2623797774082514e-07, + "loss": 0.3145, + "step": 4148 + }, + { + "epoch": 2.8071718538565626, + "grad_norm": 1.1651985673228868, + "learning_rate": 1.253604390908819e-07, + "loss": 0.322, + "step": 4149 + }, + { + "epoch": 2.807848443843031, + "grad_norm": 1.1611545199311877, + "learning_rate": 1.2448592245471903e-07, + "loss": 0.3175, + "step": 4150 + }, + { + "epoch": 2.8085250338294996, + "grad_norm": 1.1423329228733659, + "learning_rate": 1.2361442837449e-07, + "loss": 0.3126, + "step": 4151 + }, + { + "epoch": 2.8092016238159676, + "grad_norm": 1.094420894467688, + "learning_rate": 1.2274595739047267e-07, + "loss": 0.2927, + "step": 4152 + }, + { + "epoch": 2.8098782138024356, + "grad_norm": 1.1482770593230631, + "learning_rate": 1.2188051004107305e-07, + "loss": 0.311, + "step": 4153 + }, + { + "epoch": 2.8105548037889037, + "grad_norm": 1.1870469052793697, + "learning_rate": 1.210180868628219e-07, + "loss": 0.3095, + "step": 4154 + }, + { + "epoch": 2.811231393775372, + "grad_norm": 1.1079696291691443, + "learning_rate": 1.2015868839037492e-07, + "loss": 0.3073, + "step": 4155 + }, + { + "epoch": 2.8119079837618406, + "grad_norm": 1.151093810299083, + "learning_rate": 1.1930231515651313e-07, + "loss": 0.3084, + "step": 4156 + }, + { + "epoch": 2.8125845737483086, + "grad_norm": 1.1479766364856205, + "learning_rate": 1.1844896769214187e-07, + "loss": 0.306, + "step": 4157 + }, + { + "epoch": 2.8132611637347766, + "grad_norm": 1.194479911283576, + "learning_rate": 1.1759864652629072e-07, + "loss": 0.3264, + "step": 4158 + }, + { + "epoch": 2.8139377537212447, + "grad_norm": 1.142204995609788, + "learning_rate": 1.1675135218611188e-07, + "loss": 0.3127, + "step": 4159 + }, + { + "epoch": 2.814614343707713, + "grad_norm": 1.136898076727429, + "learning_rate": 1.1590708519688243e-07, + "loss": 0.3113, + "step": 4160 + }, + { + "epoch": 2.8152909336941816, + "grad_norm": 1.1563326571565744, + "learning_rate": 1.1506584608200366e-07, + "loss": 0.325, + "step": 4161 + }, + { + "epoch": 2.8159675236806496, + "grad_norm": 1.1685954336342999, + "learning_rate": 1.142276353629973e-07, + "loss": 0.3256, + "step": 4162 + }, + { + "epoch": 2.8166441136671176, + "grad_norm": 1.1592850359604618, + "learning_rate": 1.1339245355950934e-07, + "loss": 0.3117, + "step": 4163 + }, + { + "epoch": 2.8173207036535857, + "grad_norm": 1.1657630261390621, + "learning_rate": 1.1256030118930727e-07, + "loss": 0.3068, + "step": 4164 + }, + { + "epoch": 2.817997293640054, + "grad_norm": 1.1398606878363755, + "learning_rate": 1.1173117876828066e-07, + "loss": 0.3019, + "step": 4165 + }, + { + "epoch": 2.8186738836265226, + "grad_norm": 1.1695017410378856, + "learning_rate": 1.1090508681044055e-07, + "loss": 0.3173, + "step": 4166 + }, + { + "epoch": 2.8193504736129906, + "grad_norm": 1.1425891144598073, + "learning_rate": 1.1008202582792005e-07, + "loss": 0.296, + "step": 4167 + }, + { + "epoch": 2.8200270635994586, + "grad_norm": 1.1568237647767319, + "learning_rate": 1.0926199633097156e-07, + "loss": 0.3043, + "step": 4168 + }, + { + "epoch": 2.8207036535859267, + "grad_norm": 1.1625379556250555, + "learning_rate": 1.0844499882797011e-07, + "loss": 0.3052, + "step": 4169 + }, + { + "epoch": 2.821380243572395, + "grad_norm": 1.1568963498525002, + "learning_rate": 1.0763103382541052e-07, + "loss": 0.312, + "step": 4170 + }, + { + "epoch": 2.822056833558863, + "grad_norm": 1.146058362233935, + "learning_rate": 1.0682010182790637e-07, + "loss": 0.3071, + "step": 4171 + }, + { + "epoch": 2.8227334235453316, + "grad_norm": 1.1541650012035944, + "learning_rate": 1.0601220333819162e-07, + "loss": 0.307, + "step": 4172 + }, + { + "epoch": 2.8234100135317997, + "grad_norm": 1.1650657993777809, + "learning_rate": 1.0520733885712008e-07, + "loss": 0.3219, + "step": 4173 + }, + { + "epoch": 2.8240866035182677, + "grad_norm": 1.189695651515764, + "learning_rate": 1.0440550888366485e-07, + "loss": 0.3261, + "step": 4174 + }, + { + "epoch": 2.824763193504736, + "grad_norm": 1.1410751873083764, + "learning_rate": 1.0360671391491606e-07, + "loss": 0.3048, + "step": 4175 + }, + { + "epoch": 2.825439783491204, + "grad_norm": 1.181271198859864, + "learning_rate": 1.0281095444608425e-07, + "loss": 0.3228, + "step": 4176 + }, + { + "epoch": 2.8261163734776726, + "grad_norm": 1.0996530026726692, + "learning_rate": 1.0201823097049812e-07, + "loss": 0.3094, + "step": 4177 + }, + { + "epoch": 2.8267929634641407, + "grad_norm": 1.1427832540400786, + "learning_rate": 1.0122854397960292e-07, + "loss": 0.306, + "step": 4178 + }, + { + "epoch": 2.8274695534506087, + "grad_norm": 1.1551453432551007, + "learning_rate": 1.0044189396296144e-07, + "loss": 0.3134, + "step": 4179 + }, + { + "epoch": 2.828146143437077, + "grad_norm": 1.142606923697455, + "learning_rate": 9.965828140825529e-08, + "loss": 0.3079, + "step": 4180 + }, + { + "epoch": 2.828822733423545, + "grad_norm": 1.095129495826332, + "learning_rate": 9.887770680128083e-08, + "loss": 0.2974, + "step": 4181 + }, + { + "epoch": 2.8294993234100136, + "grad_norm": 1.15517421321211, + "learning_rate": 9.810017062595322e-08, + "loss": 0.3167, + "step": 4182 + }, + { + "epoch": 2.8301759133964817, + "grad_norm": 1.1648548269503585, + "learning_rate": 9.732567336430298e-08, + "loss": 0.3122, + "step": 4183 + }, + { + "epoch": 2.8308525033829497, + "grad_norm": 1.1182338854160827, + "learning_rate": 9.655421549647603e-08, + "loss": 0.3177, + "step": 4184 + }, + { + "epoch": 2.831529093369418, + "grad_norm": 1.1405031371526768, + "learning_rate": 9.57857975007348e-08, + "loss": 0.3172, + "step": 4185 + }, + { + "epoch": 2.832205683355886, + "grad_norm": 1.1440733212189316, + "learning_rate": 9.502041985345766e-08, + "loss": 0.3128, + "step": 4186 + }, + { + "epoch": 2.8328822733423547, + "grad_norm": 1.1361539508525604, + "learning_rate": 9.42580830291373e-08, + "loss": 0.3085, + "step": 4187 + }, + { + "epoch": 2.8335588633288227, + "grad_norm": 1.1509590013050146, + "learning_rate": 9.349878750038067e-08, + "loss": 0.2973, + "step": 4188 + }, + { + "epoch": 2.8342354533152907, + "grad_norm": 1.1329417952007672, + "learning_rate": 9.274253373791064e-08, + "loss": 0.3182, + "step": 4189 + }, + { + "epoch": 2.834912043301759, + "grad_norm": 1.1376827475730935, + "learning_rate": 9.198932221056333e-08, + "loss": 0.3124, + "step": 4190 + }, + { + "epoch": 2.835588633288227, + "grad_norm": 1.1393405169013877, + "learning_rate": 9.123915338529132e-08, + "loss": 0.3027, + "step": 4191 + }, + { + "epoch": 2.8362652232746957, + "grad_norm": 1.098878898100899, + "learning_rate": 9.049202772715593e-08, + "loss": 0.2994, + "step": 4192 + }, + { + "epoch": 2.8369418132611637, + "grad_norm": 1.158227816546315, + "learning_rate": 8.974794569933609e-08, + "loss": 0.3149, + "step": 4193 + }, + { + "epoch": 2.8376184032476317, + "grad_norm": 1.1326083176134083, + "learning_rate": 8.900690776312282e-08, + "loss": 0.3083, + "step": 4194 + }, + { + "epoch": 2.8382949932341, + "grad_norm": 1.1070998118615791, + "learning_rate": 8.826891437791974e-08, + "loss": 0.3114, + "step": 4195 + }, + { + "epoch": 2.838971583220568, + "grad_norm": 1.1564043460596638, + "learning_rate": 8.753396600124254e-08, + "loss": 0.3241, + "step": 4196 + }, + { + "epoch": 2.8396481732070367, + "grad_norm": 1.1348179095928945, + "learning_rate": 8.680206308871953e-08, + "loss": 0.3148, + "step": 4197 + }, + { + "epoch": 2.8403247631935047, + "grad_norm": 1.2031507738983054, + "learning_rate": 8.607320609409165e-08, + "loss": 0.3157, + "step": 4198 + }, + { + "epoch": 2.8410013531799727, + "grad_norm": 1.1558767804874057, + "learning_rate": 8.534739546921023e-08, + "loss": 0.3131, + "step": 4199 + }, + { + "epoch": 2.841677943166441, + "grad_norm": 1.1134903575631645, + "learning_rate": 8.462463166403978e-08, + "loss": 0.303, + "step": 4200 + }, + { + "epoch": 2.842354533152909, + "grad_norm": 1.14888777254408, + "learning_rate": 8.390491512665355e-08, + "loss": 0.3129, + "step": 4201 + }, + { + "epoch": 2.8430311231393777, + "grad_norm": 1.1558301195406901, + "learning_rate": 8.318824630323741e-08, + "loss": 0.3096, + "step": 4202 + }, + { + "epoch": 2.8437077131258457, + "grad_norm": 1.151349654889666, + "learning_rate": 8.247462563808816e-08, + "loss": 0.321, + "step": 4203 + }, + { + "epoch": 2.8443843031123137, + "grad_norm": 1.1615776626594305, + "learning_rate": 8.176405357361194e-08, + "loss": 0.3128, + "step": 4204 + }, + { + "epoch": 2.845060893098782, + "grad_norm": 1.1550974909648855, + "learning_rate": 8.105653055032415e-08, + "loss": 0.3149, + "step": 4205 + }, + { + "epoch": 2.84573748308525, + "grad_norm": 1.1331678946526835, + "learning_rate": 8.035205700685167e-08, + "loss": 0.3181, + "step": 4206 + }, + { + "epoch": 2.8464140730717187, + "grad_norm": 1.1529725830179045, + "learning_rate": 7.965063337993018e-08, + "loss": 0.3099, + "step": 4207 + }, + { + "epoch": 2.8470906630581867, + "grad_norm": 1.167392401492405, + "learning_rate": 7.89522601044046e-08, + "loss": 0.3123, + "step": 4208 + }, + { + "epoch": 2.8477672530446547, + "grad_norm": 1.117294557253064, + "learning_rate": 7.825693761322861e-08, + "loss": 0.3133, + "step": 4209 + }, + { + "epoch": 2.848443843031123, + "grad_norm": 1.206116459835629, + "learning_rate": 7.756466633746407e-08, + "loss": 0.3229, + "step": 4210 + }, + { + "epoch": 2.849120433017591, + "grad_norm": 1.1468698271960456, + "learning_rate": 7.687544670628267e-08, + "loss": 0.3129, + "step": 4211 + }, + { + "epoch": 2.8497970230040597, + "grad_norm": 1.1055694300739007, + "learning_rate": 7.618927914696372e-08, + "loss": 0.3044, + "step": 4212 + }, + { + "epoch": 2.8504736129905277, + "grad_norm": 1.1838200930266098, + "learning_rate": 7.550616408489253e-08, + "loss": 0.3287, + "step": 4213 + }, + { + "epoch": 2.8511502029769957, + "grad_norm": 1.1044320622923602, + "learning_rate": 7.482610194356477e-08, + "loss": 0.3069, + "step": 4214 + }, + { + "epoch": 2.851826792963464, + "grad_norm": 1.1455420346697114, + "learning_rate": 7.414909314458263e-08, + "loss": 0.3084, + "step": 4215 + }, + { + "epoch": 2.852503382949932, + "grad_norm": 1.1684253547397263, + "learning_rate": 7.347513810765427e-08, + "loss": 0.3204, + "step": 4216 + }, + { + "epoch": 2.8531799729364007, + "grad_norm": 1.1195624570775045, + "learning_rate": 7.280423725059604e-08, + "loss": 0.3079, + "step": 4217 + }, + { + "epoch": 2.8538565629228687, + "grad_norm": 1.1385479529534084, + "learning_rate": 7.213639098933022e-08, + "loss": 0.3084, + "step": 4218 + }, + { + "epoch": 2.8545331529093367, + "grad_norm": 1.1331069585572051, + "learning_rate": 7.147159973788508e-08, + "loss": 0.3079, + "step": 4219 + }, + { + "epoch": 2.855209742895805, + "grad_norm": 1.1465147205919115, + "learning_rate": 7.080986390839539e-08, + "loss": 0.3184, + "step": 4220 + }, + { + "epoch": 2.8558863328822732, + "grad_norm": 1.1791051770711483, + "learning_rate": 7.015118391110299e-08, + "loss": 0.3359, + "step": 4221 + }, + { + "epoch": 2.8565629228687417, + "grad_norm": 1.1773610438147977, + "learning_rate": 6.949556015435178e-08, + "loss": 0.324, + "step": 4222 + }, + { + "epoch": 2.8572395128552097, + "grad_norm": 1.1946754374606703, + "learning_rate": 6.884299304459497e-08, + "loss": 0.3171, + "step": 4223 + }, + { + "epoch": 2.8579161028416777, + "grad_norm": 1.211898989104555, + "learning_rate": 6.819348298638839e-08, + "loss": 0.3238, + "step": 4224 + }, + { + "epoch": 2.858592692828146, + "grad_norm": 1.1204644983851304, + "learning_rate": 6.75470303823933e-08, + "loss": 0.306, + "step": 4225 + }, + { + "epoch": 2.8592692828146142, + "grad_norm": 1.1183598532251589, + "learning_rate": 6.690363563337466e-08, + "loss": 0.3043, + "step": 4226 + }, + { + "epoch": 2.8599458728010827, + "grad_norm": 1.1104493994712024, + "learning_rate": 6.626329913820339e-08, + "loss": 0.3217, + "step": 4227 + }, + { + "epoch": 2.8606224627875507, + "grad_norm": 1.1383910628787934, + "learning_rate": 6.562602129385365e-08, + "loss": 0.3103, + "step": 4228 + }, + { + "epoch": 2.8612990527740187, + "grad_norm": 1.1456320872057297, + "learning_rate": 6.499180249540382e-08, + "loss": 0.3264, + "step": 4229 + }, + { + "epoch": 2.861975642760487, + "grad_norm": 1.0972800260668512, + "learning_rate": 6.436064313603385e-08, + "loss": 0.2902, + "step": 4230 + }, + { + "epoch": 2.8626522327469552, + "grad_norm": 1.1457624918306197, + "learning_rate": 6.373254360703019e-08, + "loss": 0.3108, + "step": 4231 + }, + { + "epoch": 2.8633288227334237, + "grad_norm": 1.1740680025880672, + "learning_rate": 6.310750429777912e-08, + "loss": 0.3209, + "step": 4232 + }, + { + "epoch": 2.8640054127198917, + "grad_norm": 1.156124442542606, + "learning_rate": 6.248552559577292e-08, + "loss": 0.3127, + "step": 4233 + }, + { + "epoch": 2.8646820027063598, + "grad_norm": 1.111208197867936, + "learning_rate": 6.186660788660315e-08, + "loss": 0.3053, + "step": 4234 + }, + { + "epoch": 2.865358592692828, + "grad_norm": 1.1541823175218264, + "learning_rate": 6.125075155396675e-08, + "loss": 0.3137, + "step": 4235 + }, + { + "epoch": 2.8660351826792962, + "grad_norm": 1.1556398548370799, + "learning_rate": 6.063795697966057e-08, + "loss": 0.3099, + "step": 4236 + }, + { + "epoch": 2.8667117726657647, + "grad_norm": 1.1474723616040594, + "learning_rate": 6.00282245435857e-08, + "loss": 0.3032, + "step": 4237 + }, + { + "epoch": 2.8673883626522327, + "grad_norm": 1.1636513063535923, + "learning_rate": 5.9421554623742e-08, + "loss": 0.3204, + "step": 4238 + }, + { + "epoch": 2.8680649526387008, + "grad_norm": 1.151528450500134, + "learning_rate": 5.881794759623194e-08, + "loss": 0.3102, + "step": 4239 + }, + { + "epoch": 2.8687415426251692, + "grad_norm": 1.1227225002615908, + "learning_rate": 5.8217403835260086e-08, + "loss": 0.3077, + "step": 4240 + }, + { + "epoch": 2.8694181326116373, + "grad_norm": 1.1314870892298636, + "learning_rate": 5.7619923713130857e-08, + "loss": 0.314, + "step": 4241 + }, + { + "epoch": 2.8700947225981057, + "grad_norm": 1.1341287369437187, + "learning_rate": 5.7025507600250165e-08, + "loss": 0.3124, + "step": 4242 + }, + { + "epoch": 2.8707713125845737, + "grad_norm": 1.1221479027133288, + "learning_rate": 5.643415586512324e-08, + "loss": 0.3027, + "step": 4243 + }, + { + "epoch": 2.8714479025710418, + "grad_norm": 1.1445270725231738, + "learning_rate": 5.584586887435739e-08, + "loss": 0.3194, + "step": 4244 + }, + { + "epoch": 2.8721244925575102, + "grad_norm": 1.106224431618639, + "learning_rate": 5.526064699265754e-08, + "loss": 0.3021, + "step": 4245 + }, + { + "epoch": 2.8728010825439783, + "grad_norm": 1.144408374954783, + "learning_rate": 5.4678490582830704e-08, + "loss": 0.308, + "step": 4246 + }, + { + "epoch": 2.8734776725304467, + "grad_norm": 1.1519235955199802, + "learning_rate": 5.409940000578207e-08, + "loss": 0.3254, + "step": 4247 + }, + { + "epoch": 2.8741542625169147, + "grad_norm": 1.154899227868188, + "learning_rate": 5.352337562051613e-08, + "loss": 0.3223, + "step": 4248 + }, + { + "epoch": 2.8748308525033828, + "grad_norm": 1.162412920809482, + "learning_rate": 5.2950417784137785e-08, + "loss": 0.3165, + "step": 4249 + }, + { + "epoch": 2.8755074424898512, + "grad_norm": 1.1324873167755383, + "learning_rate": 5.2380526851850135e-08, + "loss": 0.3004, + "step": 4250 + }, + { + "epoch": 2.8761840324763193, + "grad_norm": 1.1099522242217954, + "learning_rate": 5.181370317695389e-08, + "loss": 0.3053, + "step": 4251 + }, + { + "epoch": 2.8768606224627877, + "grad_norm": 1.1489724733135365, + "learning_rate": 5.124994711084963e-08, + "loss": 0.3036, + "step": 4252 + }, + { + "epoch": 2.8775372124492558, + "grad_norm": 1.115590005852624, + "learning_rate": 5.0689259003035566e-08, + "loss": 0.3109, + "step": 4253 + }, + { + "epoch": 2.878213802435724, + "grad_norm": 1.1517167262264398, + "learning_rate": 5.013163920110864e-08, + "loss": 0.3099, + "step": 4254 + }, + { + "epoch": 2.8788903924221922, + "grad_norm": 1.1240083042703695, + "learning_rate": 4.9577088050762337e-08, + "loss": 0.3031, + "step": 4255 + }, + { + "epoch": 2.8795669824086603, + "grad_norm": 1.1318330550373505, + "learning_rate": 4.9025605895788867e-08, + "loss": 0.3164, + "step": 4256 + }, + { + "epoch": 2.8802435723951287, + "grad_norm": 1.239141698502644, + "learning_rate": 4.847719307807752e-08, + "loss": 0.3306, + "step": 4257 + }, + { + "epoch": 2.8809201623815968, + "grad_norm": 1.134785816446629, + "learning_rate": 4.793184993761468e-08, + "loss": 0.2994, + "step": 4258 + }, + { + "epoch": 2.881596752368065, + "grad_norm": 1.1317951031414402, + "learning_rate": 4.73895768124838e-08, + "loss": 0.3067, + "step": 4259 + }, + { + "epoch": 2.8822733423545333, + "grad_norm": 1.1435087101163084, + "learning_rate": 4.685037403886483e-08, + "loss": 0.3139, + "step": 4260 + }, + { + "epoch": 2.8829499323410013, + "grad_norm": 1.1418542042945439, + "learning_rate": 4.631424195103373e-08, + "loss": 0.3044, + "step": 4261 + }, + { + "epoch": 2.8836265223274697, + "grad_norm": 1.195661538243185, + "learning_rate": 4.578118088136463e-08, + "loss": 0.3119, + "step": 4262 + }, + { + "epoch": 2.8843031123139378, + "grad_norm": 1.099512951186891, + "learning_rate": 4.52511911603265e-08, + "loss": 0.2981, + "step": 4263 + }, + { + "epoch": 2.884979702300406, + "grad_norm": 1.1341522644029463, + "learning_rate": 4.4724273116483754e-08, + "loss": 0.3063, + "step": 4264 + }, + { + "epoch": 2.8856562922868743, + "grad_norm": 1.1028914095176316, + "learning_rate": 4.42004270764973e-08, + "loss": 0.3045, + "step": 4265 + }, + { + "epoch": 2.8863328822733423, + "grad_norm": 1.1768256494467735, + "learning_rate": 4.367965336512403e-08, + "loss": 0.3246, + "step": 4266 + }, + { + "epoch": 2.8870094722598107, + "grad_norm": 1.1216950161209396, + "learning_rate": 4.316195230521514e-08, + "loss": 0.3068, + "step": 4267 + }, + { + "epoch": 2.8876860622462788, + "grad_norm": 1.1218076507246368, + "learning_rate": 4.264732421771722e-08, + "loss": 0.3044, + "step": 4268 + }, + { + "epoch": 2.888362652232747, + "grad_norm": 1.159660336650667, + "learning_rate": 4.21357694216723e-08, + "loss": 0.3052, + "step": 4269 + }, + { + "epoch": 2.8890392422192153, + "grad_norm": 1.1531078466110107, + "learning_rate": 4.162728823421669e-08, + "loss": 0.3077, + "step": 4270 + }, + { + "epoch": 2.8897158322056833, + "grad_norm": 1.140202671945342, + "learning_rate": 4.112188097058156e-08, + "loss": 0.3162, + "step": 4271 + }, + { + "epoch": 2.8903924221921518, + "grad_norm": 1.1292119945054573, + "learning_rate": 4.061954794409184e-08, + "loss": 0.3163, + "step": 4272 + }, + { + "epoch": 2.89106901217862, + "grad_norm": 1.1191609652179115, + "learning_rate": 4.0120289466166754e-08, + "loss": 0.3037, + "step": 4273 + }, + { + "epoch": 2.891745602165088, + "grad_norm": 1.2125190697405077, + "learning_rate": 3.9624105846319813e-08, + "loss": 0.3298, + "step": 4274 + }, + { + "epoch": 2.8924221921515563, + "grad_norm": 1.1198285600941194, + "learning_rate": 3.9130997392157756e-08, + "loss": 0.3079, + "step": 4275 + }, + { + "epoch": 2.8930987821380243, + "grad_norm": 1.1031892741599705, + "learning_rate": 3.86409644093827e-08, + "loss": 0.2921, + "step": 4276 + }, + { + "epoch": 2.8937753721244928, + "grad_norm": 1.1909455252020806, + "learning_rate": 3.8154007201787194e-08, + "loss": 0.3146, + "step": 4277 + }, + { + "epoch": 2.894451962110961, + "grad_norm": 1.1639128358691913, + "learning_rate": 3.7670126071259194e-08, + "loss": 0.3294, + "step": 4278 + }, + { + "epoch": 2.895128552097429, + "grad_norm": 1.1423374219976659, + "learning_rate": 3.718932131777819e-08, + "loss": 0.3059, + "step": 4279 + }, + { + "epoch": 2.8958051420838973, + "grad_norm": 1.1339918716128774, + "learning_rate": 3.6711593239417976e-08, + "loss": 0.3018, + "step": 4280 + }, + { + "epoch": 2.8964817320703653, + "grad_norm": 1.144619379367415, + "learning_rate": 3.62369421323433e-08, + "loss": 0.3066, + "step": 4281 + }, + { + "epoch": 2.8971583220568338, + "grad_norm": 1.157453603164695, + "learning_rate": 3.576536829081323e-08, + "loss": 0.3232, + "step": 4282 + }, + { + "epoch": 2.897834912043302, + "grad_norm": 1.1663898548339142, + "learning_rate": 3.52968720071778e-08, + "loss": 0.3121, + "step": 4283 + }, + { + "epoch": 2.89851150202977, + "grad_norm": 1.151140435547747, + "learning_rate": 3.483145357187967e-08, + "loss": 0.3109, + "step": 4284 + }, + { + "epoch": 2.8991880920162383, + "grad_norm": 1.1582013388566426, + "learning_rate": 3.436911327345305e-08, + "loss": 0.2984, + "step": 4285 + }, + { + "epoch": 2.8998646820027063, + "grad_norm": 1.1412644655425839, + "learning_rate": 3.3909851398523654e-08, + "loss": 0.308, + "step": 4286 + }, + { + "epoch": 2.9005412719891748, + "grad_norm": 1.1645084434101423, + "learning_rate": 3.345366823180929e-08, + "loss": 0.3206, + "step": 4287 + }, + { + "epoch": 2.901217861975643, + "grad_norm": 1.149063910594397, + "learning_rate": 3.300056405611873e-08, + "loss": 0.321, + "step": 4288 + }, + { + "epoch": 2.901894451962111, + "grad_norm": 1.1572085115744013, + "learning_rate": 3.2550539152352845e-08, + "loss": 0.3093, + "step": 4289 + }, + { + "epoch": 2.9025710419485793, + "grad_norm": 1.1516695356643691, + "learning_rate": 3.2103593799501786e-08, + "loss": 0.3087, + "step": 4290 + }, + { + "epoch": 2.9032476319350473, + "grad_norm": 1.1729018449936988, + "learning_rate": 3.165972827464892e-08, + "loss": 0.309, + "step": 4291 + }, + { + "epoch": 2.903924221921516, + "grad_norm": 1.123975545818059, + "learning_rate": 3.1218942852965226e-08, + "loss": 0.3037, + "step": 4292 + }, + { + "epoch": 2.904600811907984, + "grad_norm": 1.1420041050641399, + "learning_rate": 3.078123780771602e-08, + "loss": 0.3076, + "step": 4293 + }, + { + "epoch": 2.905277401894452, + "grad_norm": 1.186079651681517, + "learning_rate": 3.034661341025258e-08, + "loss": 0.321, + "step": 4294 + }, + { + "epoch": 2.9059539918809203, + "grad_norm": 1.1526191444306315, + "learning_rate": 2.9915069930019914e-08, + "loss": 0.3091, + "step": 4295 + }, + { + "epoch": 2.9066305818673883, + "grad_norm": 1.1565920873433775, + "learning_rate": 2.94866076345518e-08, + "loss": 0.3118, + "step": 4296 + }, + { + "epoch": 2.907307171853857, + "grad_norm": 1.1453368313546646, + "learning_rate": 2.9061226789471874e-08, + "loss": 0.314, + "step": 4297 + }, + { + "epoch": 2.907983761840325, + "grad_norm": 1.1340941402221867, + "learning_rate": 2.863892765849252e-08, + "loss": 0.3064, + "step": 4298 + }, + { + "epoch": 2.908660351826793, + "grad_norm": 1.171677798973917, + "learning_rate": 2.8219710503416543e-08, + "loss": 0.3268, + "step": 4299 + }, + { + "epoch": 2.9093369418132613, + "grad_norm": 1.1474207981438231, + "learning_rate": 2.78035755841366e-08, + "loss": 0.2955, + "step": 4300 + }, + { + "epoch": 2.9100135317997293, + "grad_norm": 1.1276142697099485, + "learning_rate": 2.7390523158633552e-08, + "loss": 0.3141, + "step": 4301 + }, + { + "epoch": 2.910690121786198, + "grad_norm": 1.134055949016168, + "learning_rate": 2.6980553482977566e-08, + "loss": 0.3136, + "step": 4302 + }, + { + "epoch": 2.911366711772666, + "grad_norm": 1.1643805303472392, + "learning_rate": 2.657366681132756e-08, + "loss": 0.319, + "step": 4303 + }, + { + "epoch": 2.912043301759134, + "grad_norm": 1.160709431692687, + "learning_rate": 2.6169863395932304e-08, + "loss": 0.3146, + "step": 4304 + }, + { + "epoch": 2.9127198917456023, + "grad_norm": 1.1510206817058268, + "learning_rate": 2.5769143487127113e-08, + "loss": 0.3083, + "step": 4305 + }, + { + "epoch": 2.9133964817320703, + "grad_norm": 1.1919954516471527, + "learning_rate": 2.5371507333337153e-08, + "loss": 0.3149, + "step": 4306 + }, + { + "epoch": 2.914073071718539, + "grad_norm": 1.166119466811266, + "learning_rate": 2.497695518107579e-08, + "loss": 0.3185, + "step": 4307 + }, + { + "epoch": 2.914749661705007, + "grad_norm": 1.1456687779806747, + "learning_rate": 2.4585487274942922e-08, + "loss": 0.3206, + "step": 4308 + }, + { + "epoch": 2.915426251691475, + "grad_norm": 1.1323663586397024, + "learning_rate": 2.4197103857628858e-08, + "loss": 0.3102, + "step": 4309 + }, + { + "epoch": 2.9161028416779433, + "grad_norm": 1.1033632191595115, + "learning_rate": 2.381180516990933e-08, + "loss": 0.3029, + "step": 4310 + }, + { + "epoch": 2.9167794316644113, + "grad_norm": 1.1557013695686562, + "learning_rate": 2.3429591450649934e-08, + "loss": 0.3131, + "step": 4311 + }, + { + "epoch": 2.91745602165088, + "grad_norm": 1.131307734017743, + "learning_rate": 2.305046293680113e-08, + "loss": 0.3084, + "step": 4312 + }, + { + "epoch": 2.918132611637348, + "grad_norm": 1.1290122102509739, + "learning_rate": 2.267441986340324e-08, + "loss": 0.3136, + "step": 4313 + }, + { + "epoch": 2.918809201623816, + "grad_norm": 1.1217149604078114, + "learning_rate": 2.230146246358256e-08, + "loss": 0.2991, + "step": 4314 + }, + { + "epoch": 2.9194857916102843, + "grad_norm": 1.15371748816963, + "learning_rate": 2.193159096855191e-08, + "loss": 0.3202, + "step": 4315 + }, + { + "epoch": 2.9201623815967523, + "grad_norm": 1.1426381301277648, + "learning_rate": 2.1564805607612317e-08, + "loss": 0.3074, + "step": 4316 + }, + { + "epoch": 2.920838971583221, + "grad_norm": 1.1490085281905362, + "learning_rate": 2.120110660815078e-08, + "loss": 0.3042, + "step": 4317 + }, + { + "epoch": 2.921515561569689, + "grad_norm": 1.166632241591695, + "learning_rate": 2.0840494195641382e-08, + "loss": 0.3216, + "step": 4318 + }, + { + "epoch": 2.922192151556157, + "grad_norm": 1.1147929426270626, + "learning_rate": 2.0482968593643625e-08, + "loss": 0.3073, + "step": 4319 + }, + { + "epoch": 2.9228687415426253, + "grad_norm": 1.1575877238634333, + "learning_rate": 2.012853002380466e-08, + "loss": 0.3004, + "step": 4320 + }, + { + "epoch": 2.9235453315290933, + "grad_norm": 1.1135864591516014, + "learning_rate": 1.97771787058576e-08, + "loss": 0.3066, + "step": 4321 + }, + { + "epoch": 2.924221921515562, + "grad_norm": 1.1322713277733414, + "learning_rate": 1.942891485762044e-08, + "loss": 0.3082, + "step": 4322 + }, + { + "epoch": 2.92489851150203, + "grad_norm": 1.1568277979539667, + "learning_rate": 1.9083738694998798e-08, + "loss": 0.3214, + "step": 4323 + }, + { + "epoch": 2.925575101488498, + "grad_norm": 1.0867872016253546, + "learning_rate": 1.8741650431982615e-08, + "loss": 0.2924, + "step": 4324 + }, + { + "epoch": 2.9262516914749663, + "grad_norm": 1.1458071757826465, + "learning_rate": 1.8402650280648916e-08, + "loss": 0.3112, + "step": 4325 + }, + { + "epoch": 2.9269282814614344, + "grad_norm": 1.0777452737503344, + "learning_rate": 1.8066738451159028e-08, + "loss": 0.2902, + "step": 4326 + }, + { + "epoch": 2.927604871447903, + "grad_norm": 1.1488526020025729, + "learning_rate": 1.773391515176026e-08, + "loss": 0.3093, + "step": 4327 + }, + { + "epoch": 2.928281461434371, + "grad_norm": 1.1562641729153602, + "learning_rate": 1.740418058878479e-08, + "loss": 0.3121, + "step": 4328 + }, + { + "epoch": 2.928958051420839, + "grad_norm": 1.1746849763786804, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.3068, + "step": 4329 + }, + { + "epoch": 2.9296346414073073, + "grad_norm": 1.1733716856475764, + "learning_rate": 1.6753978487860645e-08, + "loss": 0.3254, + "step": 4330 + }, + { + "epoch": 2.9303112313937754, + "grad_norm": 1.1052256437749923, + "learning_rate": 1.6433511353002863e-08, + "loss": 0.29, + "step": 4331 + }, + { + "epoch": 2.930987821380244, + "grad_norm": 1.0968689832486906, + "learning_rate": 1.6116133760747944e-08, + "loss": 0.3048, + "step": 4332 + }, + { + "epoch": 2.931664411366712, + "grad_norm": 1.1139674432670768, + "learning_rate": 1.5801845907854606e-08, + "loss": 0.3056, + "step": 4333 + }, + { + "epoch": 2.93234100135318, + "grad_norm": 1.0980939854228233, + "learning_rate": 1.549064798916311e-08, + "loss": 0.3006, + "step": 4334 + }, + { + "epoch": 2.933017591339648, + "grad_norm": 1.1462567878563057, + "learning_rate": 1.5182540197600237e-08, + "loss": 0.3251, + "step": 4335 + }, + { + "epoch": 2.9336941813261164, + "grad_norm": 1.1233414413943121, + "learning_rate": 1.4877522724175974e-08, + "loss": 0.3155, + "step": 4336 + }, + { + "epoch": 2.934370771312585, + "grad_norm": 1.1694248394037319, + "learning_rate": 1.4575595757985172e-08, + "loss": 0.3128, + "step": 4337 + }, + { + "epoch": 2.935047361299053, + "grad_norm": 1.1554047473605722, + "learning_rate": 1.4276759486205328e-08, + "loss": 0.3214, + "step": 4338 + }, + { + "epoch": 2.935723951285521, + "grad_norm": 1.1722479445058565, + "learning_rate": 1.3981014094099354e-08, + "loss": 0.3129, + "step": 4339 + }, + { + "epoch": 2.936400541271989, + "grad_norm": 1.1309022463962741, + "learning_rate": 1.368835976501337e-08, + "loss": 0.308, + "step": 4340 + }, + { + "epoch": 2.9370771312584574, + "grad_norm": 1.1506270946384547, + "learning_rate": 1.3398796680377245e-08, + "loss": 0.3101, + "step": 4341 + }, + { + "epoch": 2.937753721244926, + "grad_norm": 1.1441809234601779, + "learning_rate": 1.3112325019704608e-08, + "loss": 0.3084, + "step": 4342 + }, + { + "epoch": 2.938430311231394, + "grad_norm": 1.129286698477764, + "learning_rate": 1.2828944960592837e-08, + "loss": 0.3036, + "step": 4343 + }, + { + "epoch": 2.939106901217862, + "grad_norm": 1.13276012436253, + "learning_rate": 1.2548656678721404e-08, + "loss": 0.3151, + "step": 4344 + }, + { + "epoch": 2.93978349120433, + "grad_norm": 1.1405521841051598, + "learning_rate": 1.2271460347854091e-08, + "loss": 0.3028, + "step": 4345 + }, + { + "epoch": 2.9404600811907984, + "grad_norm": 1.161498100591003, + "learning_rate": 1.1997356139838434e-08, + "loss": 0.3122, + "step": 4346 + }, + { + "epoch": 2.941136671177267, + "grad_norm": 1.1523731008191445, + "learning_rate": 1.1726344224603504e-08, + "loss": 0.2991, + "step": 4347 + }, + { + "epoch": 2.941813261163735, + "grad_norm": 1.1710962219756096, + "learning_rate": 1.145842477016268e-08, + "loss": 0.3295, + "step": 4348 + }, + { + "epoch": 2.942489851150203, + "grad_norm": 1.1348381341768248, + "learning_rate": 1.119359794261088e-08, + "loss": 0.3043, + "step": 4349 + }, + { + "epoch": 2.943166441136671, + "grad_norm": 1.119796205001435, + "learning_rate": 1.0931863906127327e-08, + "loss": 0.3114, + "step": 4350 + }, + { + "epoch": 2.9438430311231394, + "grad_norm": 1.1489488407168515, + "learning_rate": 1.0673222822972229e-08, + "loss": 0.3168, + "step": 4351 + }, + { + "epoch": 2.944519621109608, + "grad_norm": 1.110150794084272, + "learning_rate": 1.0417674853489545e-08, + "loss": 0.2983, + "step": 4352 + }, + { + "epoch": 2.945196211096076, + "grad_norm": 1.1683257612732658, + "learning_rate": 1.0165220156105326e-08, + "loss": 0.3234, + "step": 4353 + }, + { + "epoch": 2.945872801082544, + "grad_norm": 1.109813392704684, + "learning_rate": 9.915858887327157e-09, + "loss": 0.2923, + "step": 4354 + }, + { + "epoch": 2.946549391069012, + "grad_norm": 1.196883300808739, + "learning_rate": 9.669591201746375e-09, + "loss": 0.3164, + "step": 4355 + }, + { + "epoch": 2.9472259810554804, + "grad_norm": 1.1456394101942902, + "learning_rate": 9.426417252035858e-09, + "loss": 0.3086, + "step": 4356 + }, + { + "epoch": 2.9479025710419484, + "grad_norm": 1.1331249915686332, + "learning_rate": 9.186337188949456e-09, + "loss": 0.307, + "step": 4357 + }, + { + "epoch": 2.948579161028417, + "grad_norm": 1.2310369862585375, + "learning_rate": 8.949351161324227e-09, + "loss": 0.3324, + "step": 4358 + }, + { + "epoch": 2.949255751014885, + "grad_norm": 1.1533518432060552, + "learning_rate": 8.715459316078756e-09, + "loss": 0.3172, + "step": 4359 + }, + { + "epoch": 2.949932341001353, + "grad_norm": 1.1373796418007207, + "learning_rate": 8.484661798213723e-09, + "loss": 0.3084, + "step": 4360 + }, + { + "epoch": 2.9506089309878214, + "grad_norm": 1.117765115517499, + "learning_rate": 8.256958750810784e-09, + "loss": 0.2974, + "step": 4361 + }, + { + "epoch": 2.9512855209742894, + "grad_norm": 1.156417502178171, + "learning_rate": 8.032350315033688e-09, + "loss": 0.3067, + "step": 4362 + }, + { + "epoch": 2.951962110960758, + "grad_norm": 1.1304037754137621, + "learning_rate": 7.810836630127717e-09, + "loss": 0.307, + "step": 4363 + }, + { + "epoch": 2.952638700947226, + "grad_norm": 1.138766354028754, + "learning_rate": 7.59241783341913e-09, + "loss": 0.3139, + "step": 4364 + }, + { + "epoch": 2.953315290933694, + "grad_norm": 1.1439120799276494, + "learning_rate": 7.377094060315726e-09, + "loss": 0.3162, + "step": 4365 + }, + { + "epoch": 2.9539918809201624, + "grad_norm": 1.1107857012151061, + "learning_rate": 7.164865444306834e-09, + "loss": 0.2997, + "step": 4366 + }, + { + "epoch": 2.9546684709066304, + "grad_norm": 1.1851367102888233, + "learning_rate": 6.9557321169622105e-09, + "loss": 0.3235, + "step": 4367 + }, + { + "epoch": 2.955345060893099, + "grad_norm": 1.1398386073520308, + "learning_rate": 6.7496942079342546e-09, + "loss": 0.3102, + "step": 4368 + }, + { + "epoch": 2.956021650879567, + "grad_norm": 1.1828735338696095, + "learning_rate": 6.546751844955235e-09, + "loss": 0.3164, + "step": 4369 + }, + { + "epoch": 2.956698240866035, + "grad_norm": 1.1264165638073909, + "learning_rate": 6.346905153837291e-09, + "loss": 0.3194, + "step": 4370 + }, + { + "epoch": 2.9573748308525034, + "grad_norm": 1.116024016414339, + "learning_rate": 6.150154258476315e-09, + "loss": 0.2981, + "step": 4371 + }, + { + "epoch": 2.9580514208389714, + "grad_norm": 1.0984584846507182, + "learning_rate": 5.956499280845851e-09, + "loss": 0.3011, + "step": 4372 + }, + { + "epoch": 2.95872801082544, + "grad_norm": 1.1119161048589243, + "learning_rate": 5.765940341002085e-09, + "loss": 0.3099, + "step": 4373 + }, + { + "epoch": 2.959404600811908, + "grad_norm": 1.1637728366343703, + "learning_rate": 5.578477557081074e-09, + "loss": 0.3202, + "step": 4374 + }, + { + "epoch": 2.960081190798376, + "grad_norm": 1.1234941277175854, + "learning_rate": 5.394111045299855e-09, + "loss": 0.3018, + "step": 4375 + }, + { + "epoch": 2.9607577807848444, + "grad_norm": 1.1741162886088914, + "learning_rate": 5.212840919955886e-09, + "loss": 0.3299, + "step": 4376 + }, + { + "epoch": 2.9614343707713124, + "grad_norm": 1.1440273787574575, + "learning_rate": 5.034667293427053e-09, + "loss": 0.3133, + "step": 4377 + }, + { + "epoch": 2.962110960757781, + "grad_norm": 1.124868236628393, + "learning_rate": 4.859590276170556e-09, + "loss": 0.3146, + "step": 4378 + }, + { + "epoch": 2.962787550744249, + "grad_norm": 1.1031333776064192, + "learning_rate": 4.687609976725127e-09, + "loss": 0.2942, + "step": 4379 + }, + { + "epoch": 2.963464140730717, + "grad_norm": 1.1387413183402901, + "learning_rate": 4.51872650170937e-09, + "loss": 0.3078, + "step": 4380 + }, + { + "epoch": 2.9641407307171854, + "grad_norm": 1.0601478577033863, + "learning_rate": 4.352939955822311e-09, + "loss": 0.2847, + "step": 4381 + }, + { + "epoch": 2.9648173207036534, + "grad_norm": 1.1175635320768091, + "learning_rate": 4.190250441841737e-09, + "loss": 0.3004, + "step": 4382 + }, + { + "epoch": 2.965493910690122, + "grad_norm": 1.15277509463722, + "learning_rate": 4.030658060626969e-09, + "loss": 0.3106, + "step": 4383 + }, + { + "epoch": 2.96617050067659, + "grad_norm": 1.1296122280853835, + "learning_rate": 3.874162911117196e-09, + "loss": 0.3107, + "step": 4384 + }, + { + "epoch": 2.966847090663058, + "grad_norm": 1.1385531535031743, + "learning_rate": 3.7207650903298143e-09, + "loss": 0.2958, + "step": 4385 + }, + { + "epoch": 2.9675236806495264, + "grad_norm": 1.1876739786980457, + "learning_rate": 3.570464693364306e-09, + "loss": 0.3181, + "step": 4386 + }, + { + "epoch": 2.9682002706359945, + "grad_norm": 1.1089642461299507, + "learning_rate": 3.4232618133978044e-09, + "loss": 0.2913, + "step": 4387 + }, + { + "epoch": 2.968876860622463, + "grad_norm": 1.1483076805904084, + "learning_rate": 3.279156541688422e-09, + "loss": 0.313, + "step": 4388 + }, + { + "epoch": 2.969553450608931, + "grad_norm": 1.0951418894610812, + "learning_rate": 3.1381489675746946e-09, + "loss": 0.3035, + "step": 4389 + }, + { + "epoch": 2.970230040595399, + "grad_norm": 1.1389919207006642, + "learning_rate": 3.000239178472253e-09, + "loss": 0.3259, + "step": 4390 + }, + { + "epoch": 2.9709066305818674, + "grad_norm": 1.1079950747464211, + "learning_rate": 2.8654272598788167e-09, + "loss": 0.3055, + "step": 4391 + }, + { + "epoch": 2.9715832205683355, + "grad_norm": 1.1359130773351196, + "learning_rate": 2.7337132953697555e-09, + "loss": 0.3095, + "step": 4392 + }, + { + "epoch": 2.972259810554804, + "grad_norm": 1.1636770999387485, + "learning_rate": 2.605097366601417e-09, + "loss": 0.3192, + "step": 4393 + }, + { + "epoch": 2.972936400541272, + "grad_norm": 1.1317737119272322, + "learning_rate": 2.479579553307798e-09, + "loss": 0.31, + "step": 4394 + }, + { + "epoch": 2.97361299052774, + "grad_norm": 1.1862242556681604, + "learning_rate": 2.3571599333038765e-09, + "loss": 0.3254, + "step": 4395 + }, + { + "epoch": 2.9742895805142084, + "grad_norm": 1.1509284888371165, + "learning_rate": 2.237838582483387e-09, + "loss": 0.3043, + "step": 4396 + }, + { + "epoch": 2.9749661705006765, + "grad_norm": 1.1426201542871595, + "learning_rate": 2.12161557481827e-09, + "loss": 0.3213, + "step": 4397 + }, + { + "epoch": 2.975642760487145, + "grad_norm": 1.1252494999790472, + "learning_rate": 2.008490982360889e-09, + "loss": 0.2965, + "step": 4398 + }, + { + "epoch": 2.976319350473613, + "grad_norm": 1.1204951487459502, + "learning_rate": 1.8984648752429222e-09, + "loss": 0.2978, + "step": 4399 + }, + { + "epoch": 2.976995940460081, + "grad_norm": 1.1695668395379417, + "learning_rate": 1.7915373216742527e-09, + "loss": 0.3115, + "step": 4400 + }, + { + "epoch": 2.9776725304465494, + "grad_norm": 1.119696402677367, + "learning_rate": 1.687708387944076e-09, + "loss": 0.2968, + "step": 4401 + }, + { + "epoch": 2.9783491204330175, + "grad_norm": 1.1047732478283987, + "learning_rate": 1.5869781384203475e-09, + "loss": 0.3021, + "step": 4402 + }, + { + "epoch": 2.979025710419486, + "grad_norm": 1.1364559701770647, + "learning_rate": 1.4893466355514474e-09, + "loss": 0.3064, + "step": 4403 + }, + { + "epoch": 2.979702300405954, + "grad_norm": 1.1148899633374867, + "learning_rate": 1.3948139398628492e-09, + "loss": 0.3093, + "step": 4404 + }, + { + "epoch": 2.980378890392422, + "grad_norm": 1.12969911338335, + "learning_rate": 1.3033801099598954e-09, + "loss": 0.3005, + "step": 4405 + }, + { + "epoch": 2.9810554803788905, + "grad_norm": 1.1212118242990712, + "learning_rate": 1.215045202527243e-09, + "loss": 0.3044, + "step": 4406 + }, + { + "epoch": 2.9817320703653585, + "grad_norm": 1.117365484245686, + "learning_rate": 1.1298092723266429e-09, + "loss": 0.3001, + "step": 4407 + }, + { + "epoch": 2.982408660351827, + "grad_norm": 1.0904502565464471, + "learning_rate": 1.0476723722002702e-09, + "loss": 0.3028, + "step": 4408 + }, + { + "epoch": 2.983085250338295, + "grad_norm": 1.145864536129481, + "learning_rate": 9.686345530690589e-10, + "loss": 0.3101, + "step": 4409 + }, + { + "epoch": 2.983761840324763, + "grad_norm": 1.116492308235731, + "learning_rate": 8.926958639315919e-10, + "loss": 0.3075, + "step": 4410 + }, + { + "epoch": 2.9844384303112315, + "grad_norm": 1.15049666392336, + "learning_rate": 8.198563518657665e-10, + "loss": 0.3058, + "step": 4411 + }, + { + "epoch": 2.9851150202976995, + "grad_norm": 1.0942773671352757, + "learning_rate": 7.50116062028794e-10, + "loss": 0.2933, + "step": 4412 + }, + { + "epoch": 2.985791610284168, + "grad_norm": 1.1922095560073203, + "learning_rate": 6.834750376549793e-10, + "loss": 0.3336, + "step": 4413 + }, + { + "epoch": 2.986468200270636, + "grad_norm": 1.1380529184851402, + "learning_rate": 6.199333200590519e-10, + "loss": 0.3169, + "step": 4414 + }, + { + "epoch": 2.987144790257104, + "grad_norm": 1.1053295877919185, + "learning_rate": 5.594909486328348e-10, + "loss": 0.2981, + "step": 4415 + }, + { + "epoch": 2.9878213802435725, + "grad_norm": 1.1197045497023874, + "learning_rate": 5.021479608474655e-10, + "loss": 0.3141, + "step": 4416 + }, + { + "epoch": 2.9884979702300405, + "grad_norm": 1.1186182970793708, + "learning_rate": 4.4790439225284034e-10, + "loss": 0.3034, + "step": 4417 + }, + { + "epoch": 2.989174560216509, + "grad_norm": 1.1731055612079, + "learning_rate": 3.967602764770595e-10, + "loss": 0.3054, + "step": 4418 + }, + { + "epoch": 2.989851150202977, + "grad_norm": 1.121528681282999, + "learning_rate": 3.487156452258722e-10, + "loss": 0.2953, + "step": 4419 + }, + { + "epoch": 2.990527740189445, + "grad_norm": 1.1943003012372644, + "learning_rate": 3.0377052828489684e-10, + "loss": 0.3188, + "step": 4420 + }, + { + "epoch": 2.9912043301759135, + "grad_norm": 1.1217151073214495, + "learning_rate": 2.6192495351795576e-10, + "loss": 0.2999, + "step": 4421 + }, + { + "epoch": 2.9918809201623815, + "grad_norm": 1.1422225996726174, + "learning_rate": 2.231789468670753e-10, + "loss": 0.2982, + "step": 4422 + }, + { + "epoch": 2.99255751014885, + "grad_norm": 1.1261024883457165, + "learning_rate": 1.8753253235248568e-10, + "loss": 0.2966, + "step": 4423 + }, + { + "epoch": 2.993234100135318, + "grad_norm": 1.175016815498028, + "learning_rate": 1.5498573207262112e-10, + "loss": 0.3207, + "step": 4424 + }, + { + "epoch": 2.993910690121786, + "grad_norm": 1.142588901617758, + "learning_rate": 1.2553856620522997e-10, + "loss": 0.3187, + "step": 4425 + }, + { + "epoch": 2.9945872801082545, + "grad_norm": 1.1462723401963262, + "learning_rate": 9.919105300570941e-11, + "loss": 0.3166, + "step": 4426 + }, + { + "epoch": 2.9952638700947225, + "grad_norm": 1.134317040194697, + "learning_rate": 7.59432088082157e-11, + "loss": 0.3095, + "step": 4427 + }, + { + "epoch": 2.995940460081191, + "grad_norm": 1.1462902638304833, + "learning_rate": 5.579504802566416e-11, + "loss": 0.3164, + "step": 4428 + }, + { + "epoch": 2.996617050067659, + "grad_norm": 1.1741380922743976, + "learning_rate": 3.8746583148063786e-11, + "loss": 0.3096, + "step": 4429 + }, + { + "epoch": 2.997293640054127, + "grad_norm": 1.1566532627269621, + "learning_rate": 2.4797824744737797e-11, + "loss": 0.3142, + "step": 4430 + }, + { + "epoch": 2.9979702300405955, + "grad_norm": 1.185825918050989, + "learning_rate": 1.3948781463213324e-11, + "loss": 0.3286, + "step": 4431 + }, + { + "epoch": 2.9986468200270635, + "grad_norm": 1.1129394991701362, + "learning_rate": 6.199460029221449e-12, + "loss": 0.2988, + "step": 4432 + }, + { + "epoch": 2.999323410013532, + "grad_norm": 1.1564904886316638, + "learning_rate": 1.549865247807425e-12, + "loss": 0.3123, + "step": 4433 + }, + { + "epoch": 3.0, + "grad_norm": 1.068601292043698, + "learning_rate": 0.0, + "loss": 0.2835, + "step": 4434 + }, + { + "epoch": 3.0, + "eval_loss": 0.39919164776802063, + "eval_runtime": 431.5509, + "eval_samples_per_second": 23.068, + "eval_steps_per_second": 0.723, + "step": 4434 + }, + { + "epoch": 3.0, + "step": 4434, + "total_flos": 1.6090113089910866e+18, + "train_loss": 0.69218219782406, + "train_runtime": 92434.0859, + "train_samples_per_second": 6.138, + "train_steps_per_second": 0.048 + } + ], + "logging_steps": 1, + "max_steps": 4434, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6090113089910866e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}