{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020755500207555002, "grad_norm": 11.311776762787964, "learning_rate": 3.9999996173438937e-07, "loss": 1.8103, "step": 1 }, { "epoch": 0.00041511000415110004, "grad_norm": 9.353463242142924, "learning_rate": 3.9999984693757373e-07, "loss": 1.8375, "step": 2 }, { "epoch": 0.0006226650062266501, "grad_norm": 8.67336800381065, "learning_rate": 3.999996556096019e-07, "loss": 1.8509, "step": 3 }, { "epoch": 0.0008302200083022001, "grad_norm": 8.863000043142469, "learning_rate": 3.999993877505552e-07, "loss": 1.8109, "step": 4 }, { "epoch": 0.0010377750103777502, "grad_norm": 8.92142828494341, "learning_rate": 3.9999904336054757e-07, "loss": 1.8316, "step": 5 }, { "epoch": 0.0012453300124533001, "grad_norm": 8.923191987375874, "learning_rate": 3.999986224397254e-07, "loss": 1.8512, "step": 6 }, { "epoch": 0.0014528850145288502, "grad_norm": 10.559992136409935, "learning_rate": 3.999981249882676e-07, "loss": 1.8537, "step": 7 }, { "epoch": 0.0016604400166044002, "grad_norm": 7.709226021472745, "learning_rate": 3.999975510063859e-07, "loss": 1.7617, "step": 8 }, { "epoch": 0.0018679950186799503, "grad_norm": 7.602312931282997, "learning_rate": 3.9999690049432405e-07, "loss": 1.8194, "step": 9 }, { "epoch": 0.0020755500207555004, "grad_norm": 7.326323018777289, "learning_rate": 3.9999617345235876e-07, "loss": 1.9101, "step": 10 }, { "epoch": 0.00228310502283105, "grad_norm": 7.598220784049708, "learning_rate": 3.9999536988079914e-07, "loss": 1.8226, "step": 11 }, { "epoch": 0.0024906600249066002, "grad_norm": 4.995333047136985, "learning_rate": 3.9999448977998685e-07, "loss": 1.7141, "step": 12 }, { "epoch": 0.0026982150269821504, "grad_norm": 4.388908625648589, "learning_rate": 3.999935331502961e-07, "loss": 1.7453, "step": 13 }, { "epoch": 0.0029057700290577005, "grad_norm": 3.655298270617133, "learning_rate": 3.9999249999213364e-07, "loss": 1.7598, "step": 14 }, { "epoch": 0.00311332503113325, "grad_norm": 3.767616537060441, "learning_rate": 3.999913903059387e-07, "loss": 1.7484, "step": 15 }, { "epoch": 0.0033208800332088003, "grad_norm": 4.390789350050781, "learning_rate": 3.999902040921831e-07, "loss": 1.7198, "step": 16 }, { "epoch": 0.0035284350352843504, "grad_norm": 3.1143857817107343, "learning_rate": 3.999889413513712e-07, "loss": 1.6737, "step": 17 }, { "epoch": 0.0037359900373599006, "grad_norm": 13.135689335510376, "learning_rate": 3.999876020840398e-07, "loss": 1.666, "step": 18 }, { "epoch": 0.00394354503943545, "grad_norm": 2.7414767134828546, "learning_rate": 3.9998618629075846e-07, "loss": 1.6352, "step": 19 }, { "epoch": 0.004151100041511001, "grad_norm": 2.7089328082011424, "learning_rate": 3.9998469397212906e-07, "loss": 1.714, "step": 20 }, { "epoch": 0.0043586550435865505, "grad_norm": 4.299280097728705, "learning_rate": 3.999831251287861e-07, "loss": 1.6463, "step": 21 }, { "epoch": 0.0045662100456621, "grad_norm": 2.9172203605477294, "learning_rate": 3.999814797613966e-07, "loss": 1.8079, "step": 22 }, { "epoch": 0.004773765047737651, "grad_norm": 3.830109900005944, "learning_rate": 3.999797578706602e-07, "loss": 1.7054, "step": 23 }, { "epoch": 0.0049813200498132005, "grad_norm": 6.031012034118779, "learning_rate": 3.9997795945730887e-07, "loss": 1.7954, "step": 24 }, { "epoch": 0.00518887505188875, "grad_norm": 7.305625803127848, "learning_rate": 3.9997608452210734e-07, "loss": 1.6476, "step": 25 }, { "epoch": 0.005396430053964301, "grad_norm": 3.199858149858899, "learning_rate": 3.9997413306585275e-07, "loss": 1.6505, "step": 26 }, { "epoch": 0.00560398505603985, "grad_norm": 3.1464883611285788, "learning_rate": 3.999721050893749e-07, "loss": 1.6867, "step": 27 }, { "epoch": 0.005811540058115401, "grad_norm": 3.2747257371112384, "learning_rate": 3.9997000059353595e-07, "loss": 1.7287, "step": 28 }, { "epoch": 0.006019095060190951, "grad_norm": 2.970788897794001, "learning_rate": 3.999678195792306e-07, "loss": 1.7611, "step": 29 }, { "epoch": 0.0062266500622665, "grad_norm": 5.449631086335021, "learning_rate": 3.999655620473863e-07, "loss": 1.6526, "step": 30 }, { "epoch": 0.006434205064342051, "grad_norm": 1.6293671283027087, "learning_rate": 3.999632279989628e-07, "loss": 1.571, "step": 31 }, { "epoch": 0.006641760066417601, "grad_norm": 1.6360904002070937, "learning_rate": 3.9996081743495247e-07, "loss": 1.6968, "step": 32 }, { "epoch": 0.00684931506849315, "grad_norm": 1.5499493046506658, "learning_rate": 3.9995833035638034e-07, "loss": 1.6265, "step": 33 }, { "epoch": 0.007056870070568701, "grad_norm": 1.627232202949144, "learning_rate": 3.9995576676430375e-07, "loss": 1.6673, "step": 34 }, { "epoch": 0.0072644250726442506, "grad_norm": 1.1414425435382647, "learning_rate": 3.999531266598126e-07, "loss": 1.6336, "step": 35 }, { "epoch": 0.007471980074719801, "grad_norm": 1.2406746982833923, "learning_rate": 3.999504100440296e-07, "loss": 1.6717, "step": 36 }, { "epoch": 0.007679535076795351, "grad_norm": 1.1874081408656214, "learning_rate": 3.9994761691810956e-07, "loss": 1.6095, "step": 37 }, { "epoch": 0.0078870900788709, "grad_norm": 1.5188163338014686, "learning_rate": 3.999447472832402e-07, "loss": 1.6461, "step": 38 }, { "epoch": 0.00809464508094645, "grad_norm": 1.33634941500503, "learning_rate": 3.999418011406415e-07, "loss": 1.6274, "step": 39 }, { "epoch": 0.008302200083022002, "grad_norm": 1.2824476876179023, "learning_rate": 3.999387784915662e-07, "loss": 1.6986, "step": 40 }, { "epoch": 0.008509755085097551, "grad_norm": 1.1997067875355065, "learning_rate": 3.9993567933729933e-07, "loss": 1.6037, "step": 41 }, { "epoch": 0.008717310087173101, "grad_norm": 1.371312130776846, "learning_rate": 3.9993250367915873e-07, "loss": 1.6133, "step": 42 }, { "epoch": 0.00892486508924865, "grad_norm": 1.4148239664038342, "learning_rate": 3.999292515184944e-07, "loss": 1.6403, "step": 43 }, { "epoch": 0.0091324200913242, "grad_norm": 1.0705459177594618, "learning_rate": 3.9992592285668916e-07, "loss": 1.5651, "step": 44 }, { "epoch": 0.00933997509339975, "grad_norm": 1.141698744909979, "learning_rate": 3.9992251769515837e-07, "loss": 1.6246, "step": 45 }, { "epoch": 0.009547530095475302, "grad_norm": 0.9921870431298467, "learning_rate": 3.9991903603534964e-07, "loss": 1.6022, "step": 46 }, { "epoch": 0.009755085097550851, "grad_norm": 0.9010365335437199, "learning_rate": 3.9991547787874343e-07, "loss": 1.6159, "step": 47 }, { "epoch": 0.009962640099626401, "grad_norm": 0.9693950271808062, "learning_rate": 3.999118432268525e-07, "loss": 1.6386, "step": 48 }, { "epoch": 0.01017019510170195, "grad_norm": 1.0199944001952896, "learning_rate": 3.9990813208122224e-07, "loss": 1.6736, "step": 49 }, { "epoch": 0.0103777501037775, "grad_norm": 1.2436120571389748, "learning_rate": 3.999043444434305e-07, "loss": 1.6482, "step": 50 }, { "epoch": 0.010585305105853052, "grad_norm": 1.4153279715371405, "learning_rate": 3.9990048031508765e-07, "loss": 1.6776, "step": 51 }, { "epoch": 0.010792860107928601, "grad_norm": 1.4925781849984703, "learning_rate": 3.998965396978367e-07, "loss": 1.6113, "step": 52 }, { "epoch": 0.011000415110004151, "grad_norm": 1.0123982309527828, "learning_rate": 3.99892522593353e-07, "loss": 1.6865, "step": 53 }, { "epoch": 0.0112079701120797, "grad_norm": 1.0459941412707985, "learning_rate": 3.998884290033446e-07, "loss": 1.6034, "step": 54 }, { "epoch": 0.01141552511415525, "grad_norm": 0.9406488418358278, "learning_rate": 3.99884258929552e-07, "loss": 1.5439, "step": 55 }, { "epoch": 0.011623080116230802, "grad_norm": 0.9041137465808828, "learning_rate": 3.9988001237374804e-07, "loss": 1.6299, "step": 56 }, { "epoch": 0.011830635118306352, "grad_norm": 1.112009836123954, "learning_rate": 3.9987568933773844e-07, "loss": 1.7235, "step": 57 }, { "epoch": 0.012038190120381901, "grad_norm": 1.047373763538019, "learning_rate": 3.9987128982336114e-07, "loss": 1.6787, "step": 58 }, { "epoch": 0.012245745122457451, "grad_norm": 0.8835602431501268, "learning_rate": 3.998668138324867e-07, "loss": 1.6616, "step": 59 }, { "epoch": 0.012453300124533, "grad_norm": 1.6835123820339188, "learning_rate": 3.998622613670183e-07, "loss": 1.6134, "step": 60 }, { "epoch": 0.01266085512660855, "grad_norm": 0.9534183321700821, "learning_rate": 3.998576324288914e-07, "loss": 1.6385, "step": 61 }, { "epoch": 0.012868410128684102, "grad_norm": 0.9104143262122223, "learning_rate": 3.998529270200741e-07, "loss": 1.6014, "step": 62 }, { "epoch": 0.013075965130759652, "grad_norm": 1.530542587667927, "learning_rate": 3.9984814514256715e-07, "loss": 1.5427, "step": 63 }, { "epoch": 0.013283520132835201, "grad_norm": 1.2104918255889494, "learning_rate": 3.9984328679840343e-07, "loss": 1.6357, "step": 64 }, { "epoch": 0.013491075134910751, "grad_norm": 0.8108098711016526, "learning_rate": 3.9983835198964885e-07, "loss": 1.5577, "step": 65 }, { "epoch": 0.0136986301369863, "grad_norm": 0.8894370000259616, "learning_rate": 3.9983334071840135e-07, "loss": 1.6112, "step": 66 }, { "epoch": 0.013906185139061852, "grad_norm": 0.7398465159411312, "learning_rate": 3.9982825298679176e-07, "loss": 1.6371, "step": 67 }, { "epoch": 0.014113740141137402, "grad_norm": 1.4236875339228963, "learning_rate": 3.9982308879698317e-07, "loss": 1.619, "step": 68 }, { "epoch": 0.014321295143212951, "grad_norm": 3.265087745899655, "learning_rate": 3.998178481511712e-07, "loss": 1.6064, "step": 69 }, { "epoch": 0.014528850145288501, "grad_norm": 0.7568948390200323, "learning_rate": 3.998125310515841e-07, "loss": 1.5552, "step": 70 }, { "epoch": 0.01473640514736405, "grad_norm": 0.7756456280737289, "learning_rate": 3.998071375004826e-07, "loss": 1.6534, "step": 71 }, { "epoch": 0.014943960149439602, "grad_norm": 3.5014949988448847, "learning_rate": 3.9980166750015975e-07, "loss": 1.5392, "step": 72 }, { "epoch": 0.015151515151515152, "grad_norm": 0.8739187478417162, "learning_rate": 3.9979612105294144e-07, "loss": 1.5773, "step": 73 }, { "epoch": 0.015359070153590702, "grad_norm": 1.3102361826816729, "learning_rate": 3.997904981611857e-07, "loss": 1.5842, "step": 74 }, { "epoch": 0.015566625155666251, "grad_norm": 0.8691694903148511, "learning_rate": 3.9978479882728335e-07, "loss": 1.6076, "step": 75 }, { "epoch": 0.0157741801577418, "grad_norm": 0.7842186698846596, "learning_rate": 3.997790230536575e-07, "loss": 1.5706, "step": 76 }, { "epoch": 0.01598173515981735, "grad_norm": 0.8596725339526274, "learning_rate": 3.99773170842764e-07, "loss": 1.5754, "step": 77 }, { "epoch": 0.0161892901618929, "grad_norm": 1.0555561024172766, "learning_rate": 3.9976724219709095e-07, "loss": 1.5816, "step": 78 }, { "epoch": 0.01639684516396845, "grad_norm": 0.6843759261931969, "learning_rate": 3.9976123711915897e-07, "loss": 1.6009, "step": 79 }, { "epoch": 0.016604400166044003, "grad_norm": 1.2404904928062643, "learning_rate": 3.9975515561152145e-07, "loss": 1.6722, "step": 80 }, { "epoch": 0.016811955168119553, "grad_norm": 1.0796349325750587, "learning_rate": 3.9974899767676395e-07, "loss": 1.6735, "step": 81 }, { "epoch": 0.017019510170195103, "grad_norm": 0.744744231983327, "learning_rate": 3.997427633175047e-07, "loss": 1.6355, "step": 82 }, { "epoch": 0.017227065172270652, "grad_norm": 0.8136623677227462, "learning_rate": 3.997364525363944e-07, "loss": 1.5393, "step": 83 }, { "epoch": 0.017434620174346202, "grad_norm": 0.7060002367144068, "learning_rate": 3.997300653361162e-07, "loss": 1.6214, "step": 84 }, { "epoch": 0.01764217517642175, "grad_norm": 0.8792853840506358, "learning_rate": 3.997236017193858e-07, "loss": 1.5898, "step": 85 }, { "epoch": 0.0178497301784973, "grad_norm": 0.8053031716414784, "learning_rate": 3.9971706168895136e-07, "loss": 1.5168, "step": 86 }, { "epoch": 0.01805728518057285, "grad_norm": 0.8358084053196052, "learning_rate": 3.9971044524759344e-07, "loss": 1.597, "step": 87 }, { "epoch": 0.0182648401826484, "grad_norm": 1.8858433136361286, "learning_rate": 3.9970375239812525e-07, "loss": 1.6603, "step": 88 }, { "epoch": 0.01847239518472395, "grad_norm": 0.7772154947227216, "learning_rate": 3.996969831433925e-07, "loss": 1.6533, "step": 89 }, { "epoch": 0.0186799501867995, "grad_norm": 0.7203965146552309, "learning_rate": 3.996901374862731e-07, "loss": 1.559, "step": 90 }, { "epoch": 0.018887505188875053, "grad_norm": 0.9141428370702639, "learning_rate": 3.996832154296778e-07, "loss": 1.5923, "step": 91 }, { "epoch": 0.019095060190950603, "grad_norm": 0.8730542345016723, "learning_rate": 3.9967621697654955e-07, "loss": 1.6517, "step": 92 }, { "epoch": 0.019302615193026153, "grad_norm": 0.700994912781778, "learning_rate": 3.996691421298641e-07, "loss": 1.5687, "step": 93 }, { "epoch": 0.019510170195101702, "grad_norm": 0.8546352866257564, "learning_rate": 3.996619908926292e-07, "loss": 1.635, "step": 94 }, { "epoch": 0.019717725197177252, "grad_norm": 0.7118268331423449, "learning_rate": 3.9965476326788563e-07, "loss": 1.5886, "step": 95 }, { "epoch": 0.019925280199252802, "grad_norm": 0.9067268803639301, "learning_rate": 3.9964745925870626e-07, "loss": 1.5618, "step": 96 }, { "epoch": 0.02013283520132835, "grad_norm": 7.3842092978252305, "learning_rate": 3.9964007886819656e-07, "loss": 1.5876, "step": 97 }, { "epoch": 0.0203403902034039, "grad_norm": 0.978924255031244, "learning_rate": 3.996326220994945e-07, "loss": 1.6045, "step": 98 }, { "epoch": 0.02054794520547945, "grad_norm": 0.822612571727339, "learning_rate": 3.996250889557706e-07, "loss": 1.629, "step": 99 }, { "epoch": 0.020755500207555, "grad_norm": 0.8769641944628137, "learning_rate": 3.996174794402276e-07, "loss": 1.6471, "step": 100 }, { "epoch": 0.020963055209630554, "grad_norm": 0.8126776014287214, "learning_rate": 3.9960979355610085e-07, "loss": 1.6068, "step": 101 }, { "epoch": 0.021170610211706103, "grad_norm": 0.7784589415169838, "learning_rate": 3.9960203130665823e-07, "loss": 1.6294, "step": 102 }, { "epoch": 0.021378165213781653, "grad_norm": 1.199036492044139, "learning_rate": 3.9959419269520013e-07, "loss": 1.5832, "step": 103 }, { "epoch": 0.021585720215857203, "grad_norm": 0.7984232635569228, "learning_rate": 3.9958627772505924e-07, "loss": 1.5183, "step": 104 }, { "epoch": 0.021793275217932753, "grad_norm": 0.9139096875631566, "learning_rate": 3.9957828639960083e-07, "loss": 1.5612, "step": 105 }, { "epoch": 0.022000830220008302, "grad_norm": 1.0650085114380075, "learning_rate": 3.995702187222225e-07, "loss": 1.5779, "step": 106 }, { "epoch": 0.022208385222083852, "grad_norm": 0.8002619026048111, "learning_rate": 3.9956207469635454e-07, "loss": 1.5826, "step": 107 }, { "epoch": 0.0224159402241594, "grad_norm": 1.0383859991179343, "learning_rate": 3.995538543254595e-07, "loss": 1.5817, "step": 108 }, { "epoch": 0.02262349522623495, "grad_norm": 0.7189329429408791, "learning_rate": 3.995455576130325e-07, "loss": 1.5694, "step": 109 }, { "epoch": 0.0228310502283105, "grad_norm": 0.7622695324143753, "learning_rate": 3.9953718456260113e-07, "loss": 1.6204, "step": 110 }, { "epoch": 0.02303860523038605, "grad_norm": 0.7427754575751896, "learning_rate": 3.9952873517772524e-07, "loss": 1.5273, "step": 111 }, { "epoch": 0.023246160232461604, "grad_norm": 1.3000202135831596, "learning_rate": 3.995202094619974e-07, "loss": 1.6408, "step": 112 }, { "epoch": 0.023453715234537154, "grad_norm": 0.9593720967609558, "learning_rate": 3.995116074190424e-07, "loss": 1.6065, "step": 113 }, { "epoch": 0.023661270236612703, "grad_norm": 0.6955278941718566, "learning_rate": 3.995029290525178e-07, "loss": 1.4961, "step": 114 }, { "epoch": 0.023868825238688253, "grad_norm": 0.9759198828584535, "learning_rate": 3.9949417436611325e-07, "loss": 1.5576, "step": 115 }, { "epoch": 0.024076380240763803, "grad_norm": 0.7554845221022716, "learning_rate": 3.994853433635511e-07, "loss": 1.5302, "step": 116 }, { "epoch": 0.024283935242839352, "grad_norm": 0.7163127910598674, "learning_rate": 3.99476436048586e-07, "loss": 1.5539, "step": 117 }, { "epoch": 0.024491490244914902, "grad_norm": 1.1803030168547048, "learning_rate": 3.9946745242500507e-07, "loss": 1.546, "step": 118 }, { "epoch": 0.02469904524699045, "grad_norm": 1.107046921929907, "learning_rate": 3.99458392496628e-07, "loss": 1.5175, "step": 119 }, { "epoch": 0.024906600249066, "grad_norm": 0.7044935713830988, "learning_rate": 3.9944925626730676e-07, "loss": 1.652, "step": 120 }, { "epoch": 0.02511415525114155, "grad_norm": 1.4134203081620509, "learning_rate": 3.994400437409259e-07, "loss": 1.6474, "step": 121 }, { "epoch": 0.0253217102532171, "grad_norm": 0.8450789118515795, "learning_rate": 3.9943075492140234e-07, "loss": 1.4921, "step": 122 }, { "epoch": 0.025529265255292654, "grad_norm": 0.8784976315552246, "learning_rate": 3.9942138981268536e-07, "loss": 1.5586, "step": 123 }, { "epoch": 0.025736820257368204, "grad_norm": 1.8578157384463314, "learning_rate": 3.9941194841875676e-07, "loss": 1.6001, "step": 124 }, { "epoch": 0.025944375259443753, "grad_norm": 0.8477921362158418, "learning_rate": 3.994024307436309e-07, "loss": 1.5293, "step": 125 }, { "epoch": 0.026151930261519303, "grad_norm": 0.8270019911234011, "learning_rate": 3.993928367913543e-07, "loss": 1.5722, "step": 126 }, { "epoch": 0.026359485263594853, "grad_norm": 2.1323843964417595, "learning_rate": 3.99383166566006e-07, "loss": 1.5979, "step": 127 }, { "epoch": 0.026567040265670402, "grad_norm": 0.8831376361302666, "learning_rate": 3.9937342007169777e-07, "loss": 1.5814, "step": 128 }, { "epoch": 0.026774595267745952, "grad_norm": 0.7713679154410665, "learning_rate": 3.993635973125734e-07, "loss": 1.6059, "step": 129 }, { "epoch": 0.026982150269821502, "grad_norm": 1.3485856894156136, "learning_rate": 3.9935369829280924e-07, "loss": 1.5848, "step": 130 }, { "epoch": 0.02718970527189705, "grad_norm": 0.7679506123644734, "learning_rate": 3.9934372301661416e-07, "loss": 1.5869, "step": 131 }, { "epoch": 0.0273972602739726, "grad_norm": 1.4099730664181218, "learning_rate": 3.9933367148822936e-07, "loss": 1.609, "step": 132 }, { "epoch": 0.027604815276048154, "grad_norm": 0.8527046271639593, "learning_rate": 3.993235437119285e-07, "loss": 1.6518, "step": 133 }, { "epoch": 0.027812370278123704, "grad_norm": 0.9876695238073084, "learning_rate": 3.993133396920176e-07, "loss": 1.5732, "step": 134 }, { "epoch": 0.028019925280199254, "grad_norm": 0.8848818713241431, "learning_rate": 3.993030594328352e-07, "loss": 1.619, "step": 135 }, { "epoch": 0.028227480282274803, "grad_norm": 0.8445329910933679, "learning_rate": 3.9929270293875204e-07, "loss": 1.6159, "step": 136 }, { "epoch": 0.028435035284350353, "grad_norm": 0.7036153934743418, "learning_rate": 3.992822702141717e-07, "loss": 1.5072, "step": 137 }, { "epoch": 0.028642590286425903, "grad_norm": 0.7875600395024726, "learning_rate": 3.992717612635296e-07, "loss": 1.684, "step": 138 }, { "epoch": 0.028850145288501453, "grad_norm": 0.6610570426998603, "learning_rate": 3.992611760912941e-07, "loss": 1.5804, "step": 139 }, { "epoch": 0.029057700290577002, "grad_norm": 1.0213832385611197, "learning_rate": 3.992505147019656e-07, "loss": 1.5811, "step": 140 }, { "epoch": 0.029265255292652552, "grad_norm": 1.0008489967572651, "learning_rate": 3.9923977710007705e-07, "loss": 1.4963, "step": 141 }, { "epoch": 0.0294728102947281, "grad_norm": 0.6649639875346893, "learning_rate": 3.992289632901939e-07, "loss": 1.5339, "step": 142 }, { "epoch": 0.02968036529680365, "grad_norm": 2.2653111740366585, "learning_rate": 3.9921807327691375e-07, "loss": 1.6083, "step": 143 }, { "epoch": 0.029887920298879204, "grad_norm": 1.1445291033894631, "learning_rate": 3.992071070648668e-07, "loss": 1.5568, "step": 144 }, { "epoch": 0.030095475300954754, "grad_norm": 2.245761561167164, "learning_rate": 3.9919606465871565e-07, "loss": 1.5667, "step": 145 }, { "epoch": 0.030303030303030304, "grad_norm": 0.9655237026435531, "learning_rate": 3.991849460631552e-07, "loss": 1.5883, "step": 146 }, { "epoch": 0.030510585305105854, "grad_norm": 0.8122892559125585, "learning_rate": 3.9917375128291276e-07, "loss": 1.6374, "step": 147 }, { "epoch": 0.030718140307181403, "grad_norm": 1.1617726676187454, "learning_rate": 3.9916248032274807e-07, "loss": 1.5375, "step": 148 }, { "epoch": 0.030925695309256953, "grad_norm": 0.6676490919385982, "learning_rate": 3.9915113318745327e-07, "loss": 1.6109, "step": 149 }, { "epoch": 0.031133250311332503, "grad_norm": 0.700265676155718, "learning_rate": 3.9913970988185274e-07, "loss": 1.5653, "step": 150 }, { "epoch": 0.031340805313408056, "grad_norm": 0.6914303208551976, "learning_rate": 3.9912821041080353e-07, "loss": 1.5813, "step": 151 }, { "epoch": 0.0315483603154836, "grad_norm": 1.5483990604207802, "learning_rate": 3.9911663477919483e-07, "loss": 1.6318, "step": 152 }, { "epoch": 0.031755915317559155, "grad_norm": 2.426454657016846, "learning_rate": 3.9910498299194825e-07, "loss": 1.6519, "step": 153 }, { "epoch": 0.0319634703196347, "grad_norm": 2.8984298572473453, "learning_rate": 3.9909325505401795e-07, "loss": 1.6033, "step": 154 }, { "epoch": 0.032171025321710255, "grad_norm": 1.1062149773615026, "learning_rate": 3.990814509703902e-07, "loss": 1.5272, "step": 155 }, { "epoch": 0.0323785803237858, "grad_norm": 1.2325479409853999, "learning_rate": 3.9906957074608384e-07, "loss": 1.5379, "step": 156 }, { "epoch": 0.032586135325861354, "grad_norm": 0.6306153405028786, "learning_rate": 3.9905761438615004e-07, "loss": 1.5455, "step": 157 }, { "epoch": 0.0327936903279369, "grad_norm": 0.8282588567870623, "learning_rate": 3.990455818956723e-07, "loss": 1.5969, "step": 158 }, { "epoch": 0.03300124533001245, "grad_norm": 0.8312314136498741, "learning_rate": 3.990334732797665e-07, "loss": 1.5551, "step": 159 }, { "epoch": 0.033208800332088007, "grad_norm": 0.8127972376519234, "learning_rate": 3.99021288543581e-07, "loss": 1.5505, "step": 160 }, { "epoch": 0.03341635533416355, "grad_norm": 0.7182138887399596, "learning_rate": 3.990090276922963e-07, "loss": 1.5305, "step": 161 }, { "epoch": 0.033623910336239106, "grad_norm": 0.8211628140749623, "learning_rate": 3.9899669073112546e-07, "loss": 1.5352, "step": 162 }, { "epoch": 0.03383146533831465, "grad_norm": 1.1408798537766063, "learning_rate": 3.9898427766531383e-07, "loss": 1.6034, "step": 163 }, { "epoch": 0.034039020340390205, "grad_norm": 0.8372043440866829, "learning_rate": 3.9897178850013913e-07, "loss": 1.6699, "step": 164 }, { "epoch": 0.03424657534246575, "grad_norm": 0.700689461293022, "learning_rate": 3.989592232409113e-07, "loss": 1.5614, "step": 165 }, { "epoch": 0.034454130344541305, "grad_norm": 0.6977281072027869, "learning_rate": 3.9894658189297294e-07, "loss": 1.6408, "step": 166 }, { "epoch": 0.03466168534661685, "grad_norm": 1.094797373432667, "learning_rate": 3.9893386446169863e-07, "loss": 1.5868, "step": 167 }, { "epoch": 0.034869240348692404, "grad_norm": 0.6908473634514661, "learning_rate": 3.989210709524957e-07, "loss": 1.5045, "step": 168 }, { "epoch": 0.03507679535076795, "grad_norm": 1.1944367559596936, "learning_rate": 3.9890820137080334e-07, "loss": 1.5161, "step": 169 }, { "epoch": 0.0352843503528435, "grad_norm": 0.773620165920913, "learning_rate": 3.9889525572209363e-07, "loss": 1.5781, "step": 170 }, { "epoch": 0.03549190535491906, "grad_norm": 0.7063459328098319, "learning_rate": 3.9888223401187047e-07, "loss": 1.5963, "step": 171 }, { "epoch": 0.0356994603569946, "grad_norm": 0.7325877920986794, "learning_rate": 3.9886913624567054e-07, "loss": 1.5681, "step": 172 }, { "epoch": 0.035907015359070156, "grad_norm": 0.7487252631547822, "learning_rate": 3.988559624290625e-07, "loss": 1.6309, "step": 173 }, { "epoch": 0.0361145703611457, "grad_norm": 0.7801430432423838, "learning_rate": 3.988427125676477e-07, "loss": 1.657, "step": 174 }, { "epoch": 0.036322125363221255, "grad_norm": 1.0691624441098393, "learning_rate": 3.988293866670595e-07, "loss": 1.5523, "step": 175 }, { "epoch": 0.0365296803652968, "grad_norm": 0.9498431902389803, "learning_rate": 3.9881598473296367e-07, "loss": 1.562, "step": 176 }, { "epoch": 0.036737235367372355, "grad_norm": 0.6660373100674684, "learning_rate": 3.9880250677105847e-07, "loss": 1.6115, "step": 177 }, { "epoch": 0.0369447903694479, "grad_norm": 0.7341769246356358, "learning_rate": 3.987889527870743e-07, "loss": 1.5515, "step": 178 }, { "epoch": 0.037152345371523454, "grad_norm": 0.8412797012709253, "learning_rate": 3.9877532278677396e-07, "loss": 1.5441, "step": 179 }, { "epoch": 0.037359900373599, "grad_norm": 1.6902112570845282, "learning_rate": 3.9876161677595263e-07, "loss": 1.4765, "step": 180 }, { "epoch": 0.037567455375674554, "grad_norm": 0.8407033758598549, "learning_rate": 3.987478347604377e-07, "loss": 1.5988, "step": 181 }, { "epoch": 0.03777501037775011, "grad_norm": 0.7351067608061718, "learning_rate": 3.9873397674608895e-07, "loss": 1.5781, "step": 182 }, { "epoch": 0.03798256537982565, "grad_norm": 0.8740413274445975, "learning_rate": 3.9872004273879834e-07, "loss": 1.5492, "step": 183 }, { "epoch": 0.038190120381901206, "grad_norm": 2.059436882263371, "learning_rate": 3.987060327444904e-07, "loss": 1.5662, "step": 184 }, { "epoch": 0.03839767538397675, "grad_norm": 2.4231207058645037, "learning_rate": 3.9869194676912164e-07, "loss": 1.5922, "step": 185 }, { "epoch": 0.038605230386052306, "grad_norm": 1.0766649990188597, "learning_rate": 3.9867778481868114e-07, "loss": 1.6493, "step": 186 }, { "epoch": 0.03881278538812785, "grad_norm": 1.020653464015153, "learning_rate": 3.986635468991901e-07, "loss": 1.4228, "step": 187 }, { "epoch": 0.039020340390203405, "grad_norm": 0.7514655130406892, "learning_rate": 3.986492330167022e-07, "loss": 1.5397, "step": 188 }, { "epoch": 0.03922789539227895, "grad_norm": 0.6943121513169542, "learning_rate": 3.986348431773033e-07, "loss": 1.5768, "step": 189 }, { "epoch": 0.039435450394354504, "grad_norm": 0.7628930984074667, "learning_rate": 3.986203773871115e-07, "loss": 1.5766, "step": 190 }, { "epoch": 0.03964300539643005, "grad_norm": 1.0381762765052955, "learning_rate": 3.9860583565227744e-07, "loss": 1.5486, "step": 191 }, { "epoch": 0.039850560398505604, "grad_norm": 0.7754563764940113, "learning_rate": 3.985912179789838e-07, "loss": 1.5418, "step": 192 }, { "epoch": 0.04005811540058116, "grad_norm": 0.8452461811861927, "learning_rate": 3.985765243734455e-07, "loss": 1.5459, "step": 193 }, { "epoch": 0.0402656704026567, "grad_norm": 0.756668308136434, "learning_rate": 3.9856175484191004e-07, "loss": 1.5337, "step": 194 }, { "epoch": 0.040473225404732256, "grad_norm": 0.6655494883376677, "learning_rate": 3.9854690939065693e-07, "loss": 1.589, "step": 195 }, { "epoch": 0.0406807804068078, "grad_norm": 1.311028732233012, "learning_rate": 3.9853198802599806e-07, "loss": 1.5563, "step": 196 }, { "epoch": 0.040888335408883356, "grad_norm": 0.8465929555909902, "learning_rate": 3.985169907542777e-07, "loss": 1.6205, "step": 197 }, { "epoch": 0.0410958904109589, "grad_norm": 2.1440242597231065, "learning_rate": 3.9850191758187214e-07, "loss": 1.5954, "step": 198 }, { "epoch": 0.041303445413034455, "grad_norm": 0.7041663115618453, "learning_rate": 3.984867685151903e-07, "loss": 1.5534, "step": 199 }, { "epoch": 0.04151100041511, "grad_norm": 0.8029009177790535, "learning_rate": 3.98471543560673e-07, "loss": 1.544, "step": 200 }, { "epoch": 0.041718555417185554, "grad_norm": 0.8166296318967385, "learning_rate": 3.984562427247935e-07, "loss": 1.6173, "step": 201 }, { "epoch": 0.04192611041926111, "grad_norm": 0.8011846342714176, "learning_rate": 3.9844086601405734e-07, "loss": 1.607, "step": 202 }, { "epoch": 0.042133665421336654, "grad_norm": 0.6950052511381265, "learning_rate": 3.9842541343500233e-07, "loss": 1.5547, "step": 203 }, { "epoch": 0.04234122042341221, "grad_norm": 0.9926840980550315, "learning_rate": 3.9840988499419844e-07, "loss": 1.5576, "step": 204 }, { "epoch": 0.04254877542548775, "grad_norm": 1.0280956372225183, "learning_rate": 3.9839428069824793e-07, "loss": 1.518, "step": 205 }, { "epoch": 0.042756330427563306, "grad_norm": 0.7912577642963848, "learning_rate": 3.983786005537854e-07, "loss": 1.535, "step": 206 }, { "epoch": 0.04296388542963885, "grad_norm": 1.2668204255261601, "learning_rate": 3.9836284456747753e-07, "loss": 1.6266, "step": 207 }, { "epoch": 0.043171440431714406, "grad_norm": 0.8746201750394597, "learning_rate": 3.983470127460235e-07, "loss": 1.5874, "step": 208 }, { "epoch": 0.04337899543378995, "grad_norm": 1.3221081900743643, "learning_rate": 3.9833110509615447e-07, "loss": 1.5472, "step": 209 }, { "epoch": 0.043586550435865505, "grad_norm": 0.7677515453124257, "learning_rate": 3.9831512162463393e-07, "loss": 1.5622, "step": 210 }, { "epoch": 0.04379410543794105, "grad_norm": 0.7388518036238274, "learning_rate": 3.982990623382577e-07, "loss": 1.5878, "step": 211 }, { "epoch": 0.044001660440016604, "grad_norm": 0.8349483578418847, "learning_rate": 3.982829272438538e-07, "loss": 1.6191, "step": 212 }, { "epoch": 0.04420921544209216, "grad_norm": 0.7866287359865883, "learning_rate": 3.982667163482823e-07, "loss": 1.5913, "step": 213 }, { "epoch": 0.044416770444167704, "grad_norm": 0.7337542585933059, "learning_rate": 3.9825042965843574e-07, "loss": 1.6263, "step": 214 }, { "epoch": 0.04462432544624326, "grad_norm": 0.8087773209608627, "learning_rate": 3.9823406718123876e-07, "loss": 1.5302, "step": 215 }, { "epoch": 0.0448318804483188, "grad_norm": 1.6289052960820165, "learning_rate": 3.9821762892364824e-07, "loss": 1.503, "step": 216 }, { "epoch": 0.045039435450394356, "grad_norm": 0.7539242918102785, "learning_rate": 3.9820111489265337e-07, "loss": 1.4897, "step": 217 }, { "epoch": 0.0452469904524699, "grad_norm": 0.7939260472145644, "learning_rate": 3.981845250952754e-07, "loss": 1.5782, "step": 218 }, { "epoch": 0.045454545454545456, "grad_norm": 1.7727611690481937, "learning_rate": 3.981678595385679e-07, "loss": 1.5492, "step": 219 }, { "epoch": 0.045662100456621, "grad_norm": 0.8533572039520807, "learning_rate": 3.9815111822961653e-07, "loss": 1.6221, "step": 220 }, { "epoch": 0.045869655458696555, "grad_norm": 0.7203401381369927, "learning_rate": 3.9813430117553944e-07, "loss": 1.5412, "step": 221 }, { "epoch": 0.0460772104607721, "grad_norm": 0.8959047237613945, "learning_rate": 3.9811740838348664e-07, "loss": 1.6073, "step": 222 }, { "epoch": 0.046284765462847655, "grad_norm": 0.7759094459704792, "learning_rate": 3.9810043986064053e-07, "loss": 1.5732, "step": 223 }, { "epoch": 0.04649232046492321, "grad_norm": 1.3792835863498725, "learning_rate": 3.980833956142157e-07, "loss": 1.6014, "step": 224 }, { "epoch": 0.046699875466998754, "grad_norm": 1.1790585519053556, "learning_rate": 3.9806627565145887e-07, "loss": 1.5806, "step": 225 }, { "epoch": 0.04690743046907431, "grad_norm": 0.8267158587237875, "learning_rate": 3.9804907997964907e-07, "loss": 1.5894, "step": 226 }, { "epoch": 0.04711498547114985, "grad_norm": 1.4237674190785736, "learning_rate": 3.9803180860609736e-07, "loss": 1.5743, "step": 227 }, { "epoch": 0.047322540473225407, "grad_norm": 0.7671641452665827, "learning_rate": 3.980144615381472e-07, "loss": 1.6446, "step": 228 }, { "epoch": 0.04753009547530095, "grad_norm": 0.7846547235437324, "learning_rate": 3.979970387831739e-07, "loss": 1.6352, "step": 229 }, { "epoch": 0.047737650477376506, "grad_norm": 1.051907504842811, "learning_rate": 3.9797954034858534e-07, "loss": 1.4974, "step": 230 }, { "epoch": 0.04794520547945205, "grad_norm": 0.9643123621085294, "learning_rate": 3.9796196624182127e-07, "loss": 1.6384, "step": 231 }, { "epoch": 0.048152760481527605, "grad_norm": 0.8182110474596903, "learning_rate": 3.979443164703538e-07, "loss": 1.5885, "step": 232 }, { "epoch": 0.04836031548360315, "grad_norm": 1.8122984015829537, "learning_rate": 3.9792659104168703e-07, "loss": 1.5165, "step": 233 }, { "epoch": 0.048567870485678705, "grad_norm": 0.9606591271592947, "learning_rate": 3.9790878996335757e-07, "loss": 1.5418, "step": 234 }, { "epoch": 0.04877542548775426, "grad_norm": 1.097135309008659, "learning_rate": 3.978909132429337e-07, "loss": 1.5506, "step": 235 }, { "epoch": 0.048982980489829804, "grad_norm": 1.2352878041003887, "learning_rate": 3.9787296088801636e-07, "loss": 1.5445, "step": 236 }, { "epoch": 0.04919053549190536, "grad_norm": 1.5772240564416988, "learning_rate": 3.9785493290623825e-07, "loss": 1.5373, "step": 237 }, { "epoch": 0.0493980904939809, "grad_norm": 0.7534201022763537, "learning_rate": 3.9783682930526443e-07, "loss": 1.6307, "step": 238 }, { "epoch": 0.04960564549605646, "grad_norm": 0.7530367285207137, "learning_rate": 3.9781865009279217e-07, "loss": 1.5668, "step": 239 }, { "epoch": 0.049813200498132, "grad_norm": 0.9391472494019696, "learning_rate": 3.978003952765506e-07, "loss": 1.585, "step": 240 }, { "epoch": 0.050020755500207556, "grad_norm": 0.774852143433553, "learning_rate": 3.977820648643014e-07, "loss": 1.6155, "step": 241 }, { "epoch": 0.0502283105022831, "grad_norm": 0.745677374247135, "learning_rate": 3.97763658863838e-07, "loss": 1.5293, "step": 242 }, { "epoch": 0.050435865504358655, "grad_norm": 1.1837885959870564, "learning_rate": 3.977451772829862e-07, "loss": 1.5139, "step": 243 }, { "epoch": 0.0506434205064342, "grad_norm": 0.7017706704601079, "learning_rate": 3.977266201296039e-07, "loss": 1.5863, "step": 244 }, { "epoch": 0.050850975508509755, "grad_norm": 0.6737548118654552, "learning_rate": 3.9770798741158113e-07, "loss": 1.5699, "step": 245 }, { "epoch": 0.05105853051058531, "grad_norm": 0.869460773178111, "learning_rate": 3.976892791368399e-07, "loss": 1.5654, "step": 246 }, { "epoch": 0.051266085512660854, "grad_norm": 0.7162890058971442, "learning_rate": 3.976704953133347e-07, "loss": 1.5381, "step": 247 }, { "epoch": 0.05147364051473641, "grad_norm": 0.8775845731013817, "learning_rate": 3.976516359490517e-07, "loss": 1.5739, "step": 248 }, { "epoch": 0.051681195516811954, "grad_norm": 3.5312141103184507, "learning_rate": 3.976327010520094e-07, "loss": 1.5124, "step": 249 }, { "epoch": 0.05188875051888751, "grad_norm": 0.734015116526283, "learning_rate": 3.976136906302586e-07, "loss": 1.6033, "step": 250 }, { "epoch": 0.05209630552096305, "grad_norm": 0.8160873165015806, "learning_rate": 3.975946046918819e-07, "loss": 1.5691, "step": 251 }, { "epoch": 0.052303860523038606, "grad_norm": 0.779206255074589, "learning_rate": 3.9757544324499415e-07, "loss": 1.5603, "step": 252 }, { "epoch": 0.05251141552511415, "grad_norm": 0.7708855397759548, "learning_rate": 3.9755620629774227e-07, "loss": 1.5041, "step": 253 }, { "epoch": 0.052718970527189705, "grad_norm": 0.7890497455375947, "learning_rate": 3.9753689385830537e-07, "loss": 1.5939, "step": 254 }, { "epoch": 0.05292652552926526, "grad_norm": 0.9441825672726503, "learning_rate": 3.975175059348945e-07, "loss": 1.5454, "step": 255 }, { "epoch": 0.053134080531340805, "grad_norm": 0.7639909825332569, "learning_rate": 3.974980425357529e-07, "loss": 1.6438, "step": 256 }, { "epoch": 0.05334163553341636, "grad_norm": 1.692814812927853, "learning_rate": 3.97478503669156e-07, "loss": 1.5218, "step": 257 }, { "epoch": 0.053549190535491904, "grad_norm": 0.7696333481095123, "learning_rate": 3.9745888934341104e-07, "loss": 1.5503, "step": 258 }, { "epoch": 0.05375674553756746, "grad_norm": 0.9242829647779888, "learning_rate": 3.9743919956685763e-07, "loss": 1.5549, "step": 259 }, { "epoch": 0.053964300539643004, "grad_norm": 0.8261988373601983, "learning_rate": 3.974194343478673e-07, "loss": 1.574, "step": 260 }, { "epoch": 0.05417185554171856, "grad_norm": 0.8776349223600801, "learning_rate": 3.9739959369484374e-07, "loss": 1.5281, "step": 261 }, { "epoch": 0.0543794105437941, "grad_norm": 0.7375384171292336, "learning_rate": 3.973796776162226e-07, "loss": 1.5369, "step": 262 }, { "epoch": 0.054586965545869656, "grad_norm": 0.7404636618103695, "learning_rate": 3.973596861204717e-07, "loss": 1.59, "step": 263 }, { "epoch": 0.0547945205479452, "grad_norm": 0.8365598934567327, "learning_rate": 3.973396192160909e-07, "loss": 1.5726, "step": 264 }, { "epoch": 0.055002075550020756, "grad_norm": 0.9914600821937776, "learning_rate": 3.9731947691161213e-07, "loss": 1.5677, "step": 265 }, { "epoch": 0.05520963055209631, "grad_norm": 0.8716730545567861, "learning_rate": 3.972992592155993e-07, "loss": 1.5474, "step": 266 }, { "epoch": 0.055417185554171855, "grad_norm": 3.258720642013415, "learning_rate": 3.972789661366485e-07, "loss": 1.4957, "step": 267 }, { "epoch": 0.05562474055624741, "grad_norm": 1.0773273816958533, "learning_rate": 3.9725859768338776e-07, "loss": 1.5619, "step": 268 }, { "epoch": 0.055832295558322954, "grad_norm": 0.9024714694961539, "learning_rate": 3.9723815386447727e-07, "loss": 1.5378, "step": 269 }, { "epoch": 0.05603985056039851, "grad_norm": 1.3831821553342059, "learning_rate": 3.972176346886092e-07, "loss": 1.5821, "step": 270 }, { "epoch": 0.056247405562474054, "grad_norm": 0.952469940854049, "learning_rate": 3.9719704016450766e-07, "loss": 1.5554, "step": 271 }, { "epoch": 0.05645496056454961, "grad_norm": 1.4725563826659172, "learning_rate": 3.9717637030092897e-07, "loss": 1.5343, "step": 272 }, { "epoch": 0.05666251556662515, "grad_norm": 0.8363626557190338, "learning_rate": 3.9715562510666136e-07, "loss": 1.6163, "step": 273 }, { "epoch": 0.056870070568700706, "grad_norm": 1.4087040072857615, "learning_rate": 3.9713480459052524e-07, "loss": 1.6336, "step": 274 }, { "epoch": 0.05707762557077625, "grad_norm": 1.1491760517128764, "learning_rate": 3.971139087613728e-07, "loss": 1.5442, "step": 275 }, { "epoch": 0.057285180572851806, "grad_norm": 0.688465467984676, "learning_rate": 3.9709293762808846e-07, "loss": 1.5556, "step": 276 }, { "epoch": 0.05749273557492736, "grad_norm": 0.7734717471695348, "learning_rate": 3.970718911995887e-07, "loss": 1.5608, "step": 277 }, { "epoch": 0.057700290577002905, "grad_norm": 0.9435481776922412, "learning_rate": 3.970507694848217e-07, "loss": 1.5769, "step": 278 }, { "epoch": 0.05790784557907846, "grad_norm": 0.8203070725625297, "learning_rate": 3.970295724927679e-07, "loss": 1.5072, "step": 279 }, { "epoch": 0.058115400581154004, "grad_norm": 0.7146359364444479, "learning_rate": 3.970083002324399e-07, "loss": 1.605, "step": 280 }, { "epoch": 0.05832295558322956, "grad_norm": 1.3831084295560547, "learning_rate": 3.9698695271288185e-07, "loss": 1.5278, "step": 281 }, { "epoch": 0.058530510585305104, "grad_norm": 0.9856642286692111, "learning_rate": 3.9696552994317025e-07, "loss": 1.6086, "step": 282 }, { "epoch": 0.05873806558738066, "grad_norm": 0.7068077938134235, "learning_rate": 3.9694403193241346e-07, "loss": 1.6054, "step": 283 }, { "epoch": 0.0589456205894562, "grad_norm": 1.627631425485338, "learning_rate": 3.969224586897519e-07, "loss": 1.5314, "step": 284 }, { "epoch": 0.059153175591531756, "grad_norm": 0.7062886743411992, "learning_rate": 3.9690081022435795e-07, "loss": 1.5861, "step": 285 }, { "epoch": 0.0593607305936073, "grad_norm": 0.789525351577236, "learning_rate": 3.968790865454359e-07, "loss": 1.5406, "step": 286 }, { "epoch": 0.059568285595682856, "grad_norm": 0.7393779288397713, "learning_rate": 3.968572876622222e-07, "loss": 1.5273, "step": 287 }, { "epoch": 0.05977584059775841, "grad_norm": 0.6496148935943981, "learning_rate": 3.96835413583985e-07, "loss": 1.486, "step": 288 }, { "epoch": 0.059983395599833955, "grad_norm": 0.964319646479861, "learning_rate": 3.968134643200247e-07, "loss": 1.5263, "step": 289 }, { "epoch": 0.06019095060190951, "grad_norm": 0.7009271072255719, "learning_rate": 3.967914398796735e-07, "loss": 1.6022, "step": 290 }, { "epoch": 0.060398505603985055, "grad_norm": 0.9633499931880358, "learning_rate": 3.9676934027229564e-07, "loss": 1.6112, "step": 291 }, { "epoch": 0.06060606060606061, "grad_norm": 1.131949634852106, "learning_rate": 3.967471655072872e-07, "loss": 1.5812, "step": 292 }, { "epoch": 0.060813615608136154, "grad_norm": 0.7650738850666946, "learning_rate": 3.9672491559407636e-07, "loss": 1.6406, "step": 293 }, { "epoch": 0.06102117061021171, "grad_norm": 0.8482872049035906, "learning_rate": 3.967025905421232e-07, "loss": 1.5406, "step": 294 }, { "epoch": 0.06122872561228725, "grad_norm": 1.8054256569571165, "learning_rate": 3.966801903609197e-07, "loss": 1.5008, "step": 295 }, { "epoch": 0.061436280614362807, "grad_norm": 0.7331796700490091, "learning_rate": 3.966577150599899e-07, "loss": 1.6281, "step": 296 }, { "epoch": 0.06164383561643835, "grad_norm": 1.039524787278702, "learning_rate": 3.966351646488896e-07, "loss": 1.519, "step": 297 }, { "epoch": 0.061851390618513906, "grad_norm": 0.9016996870861953, "learning_rate": 3.9661253913720684e-07, "loss": 1.525, "step": 298 }, { "epoch": 0.06205894562058946, "grad_norm": 0.6794441374069633, "learning_rate": 3.965898385345611e-07, "loss": 1.6152, "step": 299 }, { "epoch": 0.062266500622665005, "grad_norm": 1.3089635118220553, "learning_rate": 3.965670628506042e-07, "loss": 1.6183, "step": 300 }, { "epoch": 0.06247405562474056, "grad_norm": 1.2221544240377789, "learning_rate": 3.965442120950198e-07, "loss": 1.6397, "step": 301 }, { "epoch": 0.06268161062681611, "grad_norm": 0.6463045173637169, "learning_rate": 3.9652128627752337e-07, "loss": 1.5291, "step": 302 }, { "epoch": 0.06288916562889166, "grad_norm": 0.9212512453584831, "learning_rate": 3.9649828540786247e-07, "loss": 1.5862, "step": 303 }, { "epoch": 0.0630967206309672, "grad_norm": 0.9701094676987546, "learning_rate": 3.964752094958163e-07, "loss": 1.5255, "step": 304 }, { "epoch": 0.06330427563304275, "grad_norm": 0.8050839790809957, "learning_rate": 3.964520585511962e-07, "loss": 1.566, "step": 305 }, { "epoch": 0.06351183063511831, "grad_norm": 1.787638326017031, "learning_rate": 3.964288325838454e-07, "loss": 1.6107, "step": 306 }, { "epoch": 0.06371938563719386, "grad_norm": 1.514367611958326, "learning_rate": 3.964055316036388e-07, "loss": 1.5261, "step": 307 }, { "epoch": 0.0639269406392694, "grad_norm": 0.7560770150906774, "learning_rate": 3.9638215562048355e-07, "loss": 1.5291, "step": 308 }, { "epoch": 0.06413449564134496, "grad_norm": 0.7726530494604988, "learning_rate": 3.9635870464431837e-07, "loss": 1.6066, "step": 309 }, { "epoch": 0.06434205064342051, "grad_norm": 1.3009761711129673, "learning_rate": 3.9633517868511407e-07, "loss": 1.5895, "step": 310 }, { "epoch": 0.06454960564549606, "grad_norm": 1.4546554518052999, "learning_rate": 3.963115777528732e-07, "loss": 1.5441, "step": 311 }, { "epoch": 0.0647571606475716, "grad_norm": 1.5454337067593822, "learning_rate": 3.962879018576303e-07, "loss": 1.5033, "step": 312 }, { "epoch": 0.06496471564964716, "grad_norm": 0.8111037531155808, "learning_rate": 3.962641510094517e-07, "loss": 1.5749, "step": 313 }, { "epoch": 0.06517227065172271, "grad_norm": 2.096194509213919, "learning_rate": 3.9624032521843563e-07, "loss": 1.5651, "step": 314 }, { "epoch": 0.06537982565379825, "grad_norm": 0.925326041461294, "learning_rate": 3.962164244947122e-07, "loss": 1.5303, "step": 315 }, { "epoch": 0.0655873806558738, "grad_norm": 0.8005797991044269, "learning_rate": 3.9619244884844335e-07, "loss": 1.664, "step": 316 }, { "epoch": 0.06579493565794936, "grad_norm": 1.1962393224185432, "learning_rate": 3.9616839828982285e-07, "loss": 1.6106, "step": 317 }, { "epoch": 0.0660024906600249, "grad_norm": 0.7546956822237012, "learning_rate": 3.9614427282907647e-07, "loss": 1.5852, "step": 318 }, { "epoch": 0.06621004566210045, "grad_norm": 0.8854439260997882, "learning_rate": 3.961200724764616e-07, "loss": 1.5192, "step": 319 }, { "epoch": 0.06641760066417601, "grad_norm": 0.7476814369693596, "learning_rate": 3.9609579724226763e-07, "loss": 1.5597, "step": 320 }, { "epoch": 0.06662515566625156, "grad_norm": 1.220053966221941, "learning_rate": 3.960714471368158e-07, "loss": 1.5834, "step": 321 }, { "epoch": 0.0668327106683271, "grad_norm": 0.7171016406798384, "learning_rate": 3.9604702217045903e-07, "loss": 1.4922, "step": 322 }, { "epoch": 0.06704026567040265, "grad_norm": 1.1864436162880359, "learning_rate": 3.9602252235358227e-07, "loss": 1.5939, "step": 323 }, { "epoch": 0.06724782067247821, "grad_norm": 0.7851629580390745, "learning_rate": 3.9599794769660214e-07, "loss": 1.5338, "step": 324 }, { "epoch": 0.06745537567455376, "grad_norm": 0.7731731941124776, "learning_rate": 3.9597329820996704e-07, "loss": 1.5185, "step": 325 }, { "epoch": 0.0676629306766293, "grad_norm": 1.1481780697317034, "learning_rate": 3.9594857390415744e-07, "loss": 1.6273, "step": 326 }, { "epoch": 0.06787048567870485, "grad_norm": 1.231706771563639, "learning_rate": 3.9592377478968537e-07, "loss": 1.5355, "step": 327 }, { "epoch": 0.06807804068078041, "grad_norm": 0.791385321452644, "learning_rate": 3.9589890087709475e-07, "loss": 1.5941, "step": 328 }, { "epoch": 0.06828559568285596, "grad_norm": 0.9820296859582639, "learning_rate": 3.958739521769614e-07, "loss": 1.4661, "step": 329 }, { "epoch": 0.0684931506849315, "grad_norm": 0.9345534758361532, "learning_rate": 3.958489286998927e-07, "loss": 1.5782, "step": 330 }, { "epoch": 0.06870070568700706, "grad_norm": 0.994239478069683, "learning_rate": 3.958238304565281e-07, "loss": 1.5355, "step": 331 }, { "epoch": 0.06890826068908261, "grad_norm": 0.9422649938094205, "learning_rate": 3.9579865745753854e-07, "loss": 1.5736, "step": 332 }, { "epoch": 0.06911581569115816, "grad_norm": 0.9246351674417703, "learning_rate": 3.957734097136271e-07, "loss": 1.4708, "step": 333 }, { "epoch": 0.0693233706932337, "grad_norm": 2.569793437141846, "learning_rate": 3.9574808723552834e-07, "loss": 1.4702, "step": 334 }, { "epoch": 0.06953092569530926, "grad_norm": 0.6471573546153861, "learning_rate": 3.9572269003400876e-07, "loss": 1.578, "step": 335 }, { "epoch": 0.06973848069738481, "grad_norm": 0.7277938493644751, "learning_rate": 3.9569721811986654e-07, "loss": 1.5473, "step": 336 }, { "epoch": 0.06994603569946035, "grad_norm": 0.783551313956181, "learning_rate": 3.9567167150393163e-07, "loss": 1.5488, "step": 337 }, { "epoch": 0.0701535907015359, "grad_norm": 0.7109255168982866, "learning_rate": 3.9564605019706586e-07, "loss": 1.5575, "step": 338 }, { "epoch": 0.07036114570361146, "grad_norm": 0.7915619295057836, "learning_rate": 3.956203542101627e-07, "loss": 1.6001, "step": 339 }, { "epoch": 0.070568700705687, "grad_norm": 0.9391306518808272, "learning_rate": 3.9559458355414734e-07, "loss": 1.5346, "step": 340 }, { "epoch": 0.07077625570776255, "grad_norm": 1.0878389625543508, "learning_rate": 3.955687382399769e-07, "loss": 1.5859, "step": 341 }, { "epoch": 0.07098381070983811, "grad_norm": 2.529640080811518, "learning_rate": 3.955428182786399e-07, "loss": 1.5574, "step": 342 }, { "epoch": 0.07119136571191366, "grad_norm": 0.7024340448196, "learning_rate": 3.9551682368115706e-07, "loss": 1.6081, "step": 343 }, { "epoch": 0.0713989207139892, "grad_norm": 0.9818175237257888, "learning_rate": 3.954907544585805e-07, "loss": 1.57, "step": 344 }, { "epoch": 0.07160647571606475, "grad_norm": 1.4630432972328837, "learning_rate": 3.954646106219942e-07, "loss": 1.5334, "step": 345 }, { "epoch": 0.07181403071814031, "grad_norm": 1.3627602266113426, "learning_rate": 3.9543839218251367e-07, "loss": 1.4364, "step": 346 }, { "epoch": 0.07202158572021586, "grad_norm": 0.7473269943068941, "learning_rate": 3.954120991512865e-07, "loss": 1.5877, "step": 347 }, { "epoch": 0.0722291407222914, "grad_norm": 0.8405582475412623, "learning_rate": 3.9538573153949166e-07, "loss": 1.4467, "step": 348 }, { "epoch": 0.07243669572436695, "grad_norm": 1.1594318553688343, "learning_rate": 3.9535928935834e-07, "loss": 1.6009, "step": 349 }, { "epoch": 0.07264425072644251, "grad_norm": 1.2595260213243793, "learning_rate": 3.9533277261907407e-07, "loss": 1.5722, "step": 350 }, { "epoch": 0.07285180572851806, "grad_norm": 1.0483692983589734, "learning_rate": 3.9530618133296804e-07, "loss": 1.6056, "step": 351 }, { "epoch": 0.0730593607305936, "grad_norm": 1.878051520630954, "learning_rate": 3.952795155113277e-07, "loss": 1.5047, "step": 352 }, { "epoch": 0.07326691573266916, "grad_norm": 0.8285069119449421, "learning_rate": 3.9525277516549087e-07, "loss": 1.5505, "step": 353 }, { "epoch": 0.07347447073474471, "grad_norm": 1.0179567502745943, "learning_rate": 3.952259603068267e-07, "loss": 1.5815, "step": 354 }, { "epoch": 0.07368202573682026, "grad_norm": 0.8437086108640751, "learning_rate": 3.951990709467363e-07, "loss": 1.6478, "step": 355 }, { "epoch": 0.0738895807388958, "grad_norm": 0.7744403790360274, "learning_rate": 3.951721070966521e-07, "loss": 1.654, "step": 356 }, { "epoch": 0.07409713574097136, "grad_norm": 0.9946033853448627, "learning_rate": 3.9514506876803854e-07, "loss": 1.5852, "step": 357 }, { "epoch": 0.07430469074304691, "grad_norm": 1.2992465449230606, "learning_rate": 3.9511795597239155e-07, "loss": 1.6225, "step": 358 }, { "epoch": 0.07451224574512245, "grad_norm": 0.6611632078314491, "learning_rate": 3.9509076872123887e-07, "loss": 1.5222, "step": 359 }, { "epoch": 0.074719800747198, "grad_norm": 0.8554859218864674, "learning_rate": 3.9506350702613966e-07, "loss": 1.6026, "step": 360 }, { "epoch": 0.07492735574927356, "grad_norm": 0.6806192227148197, "learning_rate": 3.9503617089868496e-07, "loss": 1.5226, "step": 361 }, { "epoch": 0.07513491075134911, "grad_norm": 0.9228050332082446, "learning_rate": 3.9500876035049735e-07, "loss": 1.6104, "step": 362 }, { "epoch": 0.07534246575342465, "grad_norm": 1.0163593089389478, "learning_rate": 3.9498127539323105e-07, "loss": 1.5641, "step": 363 }, { "epoch": 0.07555002075550021, "grad_norm": 1.4912559905551543, "learning_rate": 3.9495371603857193e-07, "loss": 1.56, "step": 364 }, { "epoch": 0.07575757575757576, "grad_norm": 0.8892116712870853, "learning_rate": 3.9492608229823753e-07, "loss": 1.5432, "step": 365 }, { "epoch": 0.0759651307596513, "grad_norm": 0.9629533423498724, "learning_rate": 3.9489837418397693e-07, "loss": 1.5888, "step": 366 }, { "epoch": 0.07617268576172685, "grad_norm": 0.655194629821129, "learning_rate": 3.948705917075709e-07, "loss": 1.5458, "step": 367 }, { "epoch": 0.07638024076380241, "grad_norm": 1.7481375792049714, "learning_rate": 3.9484273488083186e-07, "loss": 1.5989, "step": 368 }, { "epoch": 0.07658779576587796, "grad_norm": 0.695241734120242, "learning_rate": 3.9481480371560375e-07, "loss": 1.5241, "step": 369 }, { "epoch": 0.0767953507679535, "grad_norm": 0.9343488497549051, "learning_rate": 3.9478679822376216e-07, "loss": 1.5785, "step": 370 }, { "epoch": 0.07700290577002905, "grad_norm": 0.7423150462414291, "learning_rate": 3.947587184172143e-07, "loss": 1.6286, "step": 371 }, { "epoch": 0.07721046077210461, "grad_norm": 0.7747788070171346, "learning_rate": 3.9473056430789893e-07, "loss": 1.5631, "step": 372 }, { "epoch": 0.07741801577418016, "grad_norm": 0.7845084852813077, "learning_rate": 3.947023359077865e-07, "loss": 1.5857, "step": 373 }, { "epoch": 0.0776255707762557, "grad_norm": 1.3942838974429645, "learning_rate": 3.946740332288789e-07, "loss": 1.5031, "step": 374 }, { "epoch": 0.07783312577833126, "grad_norm": 0.6363998599574435, "learning_rate": 3.9464565628320967e-07, "loss": 1.5853, "step": 375 }, { "epoch": 0.07804068078040681, "grad_norm": 0.7292506159159087, "learning_rate": 3.94617205082844e-07, "loss": 1.5485, "step": 376 }, { "epoch": 0.07824823578248236, "grad_norm": 1.4592810508360627, "learning_rate": 3.9458867963987856e-07, "loss": 1.5113, "step": 377 }, { "epoch": 0.0784557907845579, "grad_norm": 0.8599217550041893, "learning_rate": 3.945600799664416e-07, "loss": 1.5348, "step": 378 }, { "epoch": 0.07866334578663346, "grad_norm": 0.6925878570206188, "learning_rate": 3.94531406074693e-07, "loss": 1.4851, "step": 379 }, { "epoch": 0.07887090078870901, "grad_norm": 0.9552351209400144, "learning_rate": 3.9450265797682396e-07, "loss": 1.5127, "step": 380 }, { "epoch": 0.07907845579078455, "grad_norm": 0.7312953405290957, "learning_rate": 3.944738356850576e-07, "loss": 1.5772, "step": 381 }, { "epoch": 0.0792860107928601, "grad_norm": 0.686828088243894, "learning_rate": 3.944449392116483e-07, "loss": 1.4996, "step": 382 }, { "epoch": 0.07949356579493566, "grad_norm": 1.002100881776077, "learning_rate": 3.944159685688821e-07, "loss": 1.4803, "step": 383 }, { "epoch": 0.07970112079701121, "grad_norm": 1.274816320045783, "learning_rate": 3.9438692376907657e-07, "loss": 1.5238, "step": 384 }, { "epoch": 0.07990867579908675, "grad_norm": 1.550214411197294, "learning_rate": 3.943578048245807e-07, "loss": 1.5407, "step": 385 }, { "epoch": 0.08011623080116231, "grad_norm": 0.8221018746296057, "learning_rate": 3.9432861174777525e-07, "loss": 1.4838, "step": 386 }, { "epoch": 0.08032378580323786, "grad_norm": 1.0610281147160094, "learning_rate": 3.942993445510722e-07, "loss": 1.5784, "step": 387 }, { "epoch": 0.0805313408053134, "grad_norm": 1.6677082247902724, "learning_rate": 3.9427000324691525e-07, "loss": 1.548, "step": 388 }, { "epoch": 0.08073889580738895, "grad_norm": 0.6911748828478762, "learning_rate": 3.942405878477795e-07, "loss": 1.538, "step": 389 }, { "epoch": 0.08094645080946451, "grad_norm": 0.6959081935320677, "learning_rate": 3.942110983661716e-07, "loss": 1.6123, "step": 390 }, { "epoch": 0.08115400581154006, "grad_norm": 4.6744001253512275, "learning_rate": 3.9418153481462976e-07, "loss": 1.5313, "step": 391 }, { "epoch": 0.0813615608136156, "grad_norm": 0.8394496305680753, "learning_rate": 3.941518972057235e-07, "loss": 1.6077, "step": 392 }, { "epoch": 0.08156911581569116, "grad_norm": 0.6411844023965299, "learning_rate": 3.941221855520541e-07, "loss": 1.5269, "step": 393 }, { "epoch": 0.08177667081776671, "grad_norm": 4.091319459987459, "learning_rate": 3.9409239986625405e-07, "loss": 1.5721, "step": 394 }, { "epoch": 0.08198422581984226, "grad_norm": 0.7884330335134523, "learning_rate": 3.940625401609875e-07, "loss": 1.5475, "step": 395 }, { "epoch": 0.0821917808219178, "grad_norm": 0.78314552030894, "learning_rate": 3.9403260644894993e-07, "loss": 1.6138, "step": 396 }, { "epoch": 0.08239933582399336, "grad_norm": 0.7622111269435049, "learning_rate": 3.9400259874286844e-07, "loss": 1.5111, "step": 397 }, { "epoch": 0.08260689082606891, "grad_norm": 0.819239164663395, "learning_rate": 3.9397251705550146e-07, "loss": 1.5822, "step": 398 }, { "epoch": 0.08281444582814446, "grad_norm": 0.7302769195935753, "learning_rate": 3.9394236139963886e-07, "loss": 1.5204, "step": 399 }, { "epoch": 0.08302200083022, "grad_norm": 0.9658003009694968, "learning_rate": 3.9391213178810223e-07, "loss": 1.5423, "step": 400 }, { "epoch": 0.08322955583229556, "grad_norm": 0.8609973872812837, "learning_rate": 3.938818282337442e-07, "loss": 1.6461, "step": 401 }, { "epoch": 0.08343711083437111, "grad_norm": 0.9783977239516148, "learning_rate": 3.938514507494491e-07, "loss": 1.5774, "step": 402 }, { "epoch": 0.08364466583644665, "grad_norm": 1.056699080196957, "learning_rate": 3.9382099934813265e-07, "loss": 1.5597, "step": 403 }, { "epoch": 0.08385222083852222, "grad_norm": 0.7220006838934228, "learning_rate": 3.937904740427419e-07, "loss": 1.4923, "step": 404 }, { "epoch": 0.08405977584059776, "grad_norm": 0.8430739116359129, "learning_rate": 3.9375987484625555e-07, "loss": 1.5426, "step": 405 }, { "epoch": 0.08426733084267331, "grad_norm": 3.613166766223418, "learning_rate": 3.937292017716834e-07, "loss": 1.544, "step": 406 }, { "epoch": 0.08447488584474885, "grad_norm": 0.9376576443914243, "learning_rate": 3.93698454832067e-07, "loss": 1.5918, "step": 407 }, { "epoch": 0.08468244084682441, "grad_norm": 0.9094639893253319, "learning_rate": 3.9366763404047896e-07, "loss": 1.5672, "step": 408 }, { "epoch": 0.08488999584889996, "grad_norm": 0.9663058776634983, "learning_rate": 3.9363673941002366e-07, "loss": 1.5924, "step": 409 }, { "epoch": 0.0850975508509755, "grad_norm": 0.8002328143803354, "learning_rate": 3.9360577095383644e-07, "loss": 1.5445, "step": 410 }, { "epoch": 0.08530510585305105, "grad_norm": 0.8623858127380615, "learning_rate": 3.935747286850843e-07, "loss": 1.5918, "step": 411 }, { "epoch": 0.08551266085512661, "grad_norm": 0.7089643849188245, "learning_rate": 3.935436126169658e-07, "loss": 1.61, "step": 412 }, { "epoch": 0.08572021585720216, "grad_norm": 0.7746493584397219, "learning_rate": 3.935124227627105e-07, "loss": 1.5202, "step": 413 }, { "epoch": 0.0859277708592777, "grad_norm": 1.2816563699568104, "learning_rate": 3.934811591355796e-07, "loss": 1.5418, "step": 414 }, { "epoch": 0.08613532586135327, "grad_norm": 0.9240654353762444, "learning_rate": 3.934498217488654e-07, "loss": 1.611, "step": 415 }, { "epoch": 0.08634288086342881, "grad_norm": 2.1635633770292415, "learning_rate": 3.934184106158919e-07, "loss": 1.6338, "step": 416 }, { "epoch": 0.08655043586550436, "grad_norm": 0.6485865496749437, "learning_rate": 3.933869257500142e-07, "loss": 1.5616, "step": 417 }, { "epoch": 0.0867579908675799, "grad_norm": 0.6615085025280788, "learning_rate": 3.933553671646188e-07, "loss": 1.5267, "step": 418 }, { "epoch": 0.08696554586965546, "grad_norm": 1.0369129853017083, "learning_rate": 3.933237348731236e-07, "loss": 1.5436, "step": 419 }, { "epoch": 0.08717310087173101, "grad_norm": 1.1918789084504378, "learning_rate": 3.932920288889778e-07, "loss": 1.5543, "step": 420 }, { "epoch": 0.08738065587380656, "grad_norm": 0.7387501874317923, "learning_rate": 3.93260249225662e-07, "loss": 1.5508, "step": 421 }, { "epoch": 0.0875882108758821, "grad_norm": 0.8402063796071437, "learning_rate": 3.93228395896688e-07, "loss": 1.5808, "step": 422 }, { "epoch": 0.08779576587795766, "grad_norm": 1.3213676966046553, "learning_rate": 3.93196468915599e-07, "loss": 1.6091, "step": 423 }, { "epoch": 0.08800332088003321, "grad_norm": 0.8763352766282972, "learning_rate": 3.931644682959696e-07, "loss": 1.5402, "step": 424 }, { "epoch": 0.08821087588210876, "grad_norm": 1.3891025379335946, "learning_rate": 3.9313239405140545e-07, "loss": 1.5404, "step": 425 }, { "epoch": 0.08841843088418432, "grad_norm": 0.7251642927038664, "learning_rate": 3.931002461955438e-07, "loss": 1.4728, "step": 426 }, { "epoch": 0.08862598588625986, "grad_norm": 0.8837030874737397, "learning_rate": 3.9306802474205305e-07, "loss": 1.5991, "step": 427 }, { "epoch": 0.08883354088833541, "grad_norm": 0.6507271181113026, "learning_rate": 3.9303572970463283e-07, "loss": 1.5299, "step": 428 }, { "epoch": 0.08904109589041095, "grad_norm": 0.6593824530272688, "learning_rate": 3.930033610970141e-07, "loss": 1.5542, "step": 429 }, { "epoch": 0.08924865089248651, "grad_norm": 0.7472039099118118, "learning_rate": 3.929709189329593e-07, "loss": 1.5415, "step": 430 }, { "epoch": 0.08945620589456206, "grad_norm": 1.7347531358597275, "learning_rate": 3.929384032262619e-07, "loss": 1.5233, "step": 431 }, { "epoch": 0.0896637608966376, "grad_norm": 0.9845210129314712, "learning_rate": 3.929058139907467e-07, "loss": 1.5762, "step": 432 }, { "epoch": 0.08987131589871315, "grad_norm": 0.820894568693371, "learning_rate": 3.9287315124026973e-07, "loss": 1.5065, "step": 433 }, { "epoch": 0.09007887090078871, "grad_norm": 0.8277047313148947, "learning_rate": 3.9284041498871835e-07, "loss": 1.5968, "step": 434 }, { "epoch": 0.09028642590286426, "grad_norm": 0.8556145328957938, "learning_rate": 3.9280760525001123e-07, "loss": 1.5912, "step": 435 }, { "epoch": 0.0904939809049398, "grad_norm": 0.6697043902153069, "learning_rate": 3.9277472203809813e-07, "loss": 1.627, "step": 436 }, { "epoch": 0.09070153590701537, "grad_norm": 0.9233328122184589, "learning_rate": 3.927417653669601e-07, "loss": 1.584, "step": 437 }, { "epoch": 0.09090909090909091, "grad_norm": 0.9039799341906642, "learning_rate": 3.9270873525060956e-07, "loss": 1.6087, "step": 438 }, { "epoch": 0.09111664591116646, "grad_norm": 0.7901578512951782, "learning_rate": 3.9267563170308984e-07, "loss": 1.4938, "step": 439 }, { "epoch": 0.091324200913242, "grad_norm": 0.7466405510809797, "learning_rate": 3.9264245473847584e-07, "loss": 1.5476, "step": 440 }, { "epoch": 0.09153175591531756, "grad_norm": 0.7505865861833354, "learning_rate": 3.9260920437087347e-07, "loss": 1.5833, "step": 441 }, { "epoch": 0.09173931091739311, "grad_norm": 0.7746214690829071, "learning_rate": 3.9257588061441993e-07, "loss": 1.5374, "step": 442 }, { "epoch": 0.09194686591946866, "grad_norm": 0.634379651499812, "learning_rate": 3.925424834832835e-07, "loss": 1.6653, "step": 443 }, { "epoch": 0.0921544209215442, "grad_norm": 0.8289695484975603, "learning_rate": 3.92509012991664e-07, "loss": 1.523, "step": 444 }, { "epoch": 0.09236197592361976, "grad_norm": 0.8986529863797156, "learning_rate": 3.9247546915379186e-07, "loss": 1.5213, "step": 445 }, { "epoch": 0.09256953092569531, "grad_norm": 0.6947621755736146, "learning_rate": 3.9244185198392933e-07, "loss": 1.5785, "step": 446 }, { "epoch": 0.09277708592777086, "grad_norm": 2.047831976460464, "learning_rate": 3.9240816149636936e-07, "loss": 1.5782, "step": 447 }, { "epoch": 0.09298464092984642, "grad_norm": 0.6892096460532096, "learning_rate": 3.923743977054363e-07, "loss": 1.566, "step": 448 }, { "epoch": 0.09319219593192196, "grad_norm": 3.6873811616487595, "learning_rate": 3.923405606254856e-07, "loss": 1.5002, "step": 449 }, { "epoch": 0.09339975093399751, "grad_norm": 0.630593927946454, "learning_rate": 3.92306650270904e-07, "loss": 1.5924, "step": 450 }, { "epoch": 0.09360730593607305, "grad_norm": 0.7655534901138857, "learning_rate": 3.92272666656109e-07, "loss": 1.5677, "step": 451 }, { "epoch": 0.09381486093814861, "grad_norm": 1.9253770300963382, "learning_rate": 3.9223860979554987e-07, "loss": 1.5153, "step": 452 }, { "epoch": 0.09402241594022416, "grad_norm": 1.444757510916554, "learning_rate": 3.922044797037064e-07, "loss": 1.56, "step": 453 }, { "epoch": 0.0942299709422997, "grad_norm": 0.8291854720564956, "learning_rate": 3.9217027639509004e-07, "loss": 1.5342, "step": 454 }, { "epoch": 0.09443752594437525, "grad_norm": 0.896395271499777, "learning_rate": 3.92135999884243e-07, "loss": 1.5403, "step": 455 }, { "epoch": 0.09464508094645081, "grad_norm": 0.7046816160087109, "learning_rate": 3.9210165018573874e-07, "loss": 1.5961, "step": 456 }, { "epoch": 0.09485263594852636, "grad_norm": 1.4111738556351614, "learning_rate": 3.9206722731418187e-07, "loss": 1.5996, "step": 457 }, { "epoch": 0.0950601909506019, "grad_norm": 0.707850415479939, "learning_rate": 3.9203273128420804e-07, "loss": 1.6089, "step": 458 }, { "epoch": 0.09526774595267747, "grad_norm": 0.737125279192193, "learning_rate": 3.919981621104841e-07, "loss": 1.6001, "step": 459 }, { "epoch": 0.09547530095475301, "grad_norm": 1.0734406504459013, "learning_rate": 3.9196351980770794e-07, "loss": 1.5131, "step": 460 }, { "epoch": 0.09568285595682856, "grad_norm": 1.2396986372388994, "learning_rate": 3.9192880439060855e-07, "loss": 1.5442, "step": 461 }, { "epoch": 0.0958904109589041, "grad_norm": 0.769630394005304, "learning_rate": 3.91894015873946e-07, "loss": 1.5468, "step": 462 }, { "epoch": 0.09609796596097966, "grad_norm": 0.6788045988237913, "learning_rate": 3.9185915427251127e-07, "loss": 1.5474, "step": 463 }, { "epoch": 0.09630552096305521, "grad_norm": 0.7232298207770542, "learning_rate": 3.9182421960112687e-07, "loss": 1.5568, "step": 464 }, { "epoch": 0.09651307596513076, "grad_norm": 1.0194928330350024, "learning_rate": 3.91789211874646e-07, "loss": 1.5612, "step": 465 }, { "epoch": 0.0967206309672063, "grad_norm": 1.886389088624282, "learning_rate": 3.917541311079529e-07, "loss": 1.533, "step": 466 }, { "epoch": 0.09692818596928186, "grad_norm": 0.72220049838547, "learning_rate": 3.917189773159631e-07, "loss": 1.5304, "step": 467 }, { "epoch": 0.09713574097135741, "grad_norm": 0.8774491128392785, "learning_rate": 3.91683750513623e-07, "loss": 1.5579, "step": 468 }, { "epoch": 0.09734329597343296, "grad_norm": 1.1621072478103602, "learning_rate": 3.916484507159101e-07, "loss": 1.6187, "step": 469 }, { "epoch": 0.09755085097550852, "grad_norm": 1.26745053442655, "learning_rate": 3.9161307793783307e-07, "loss": 1.556, "step": 470 }, { "epoch": 0.09775840597758406, "grad_norm": 0.8241008796622631, "learning_rate": 3.9157763219443133e-07, "loss": 1.535, "step": 471 }, { "epoch": 0.09796596097965961, "grad_norm": 1.057297813454511, "learning_rate": 3.9154211350077547e-07, "loss": 1.5801, "step": 472 }, { "epoch": 0.09817351598173515, "grad_norm": 1.1276494790143259, "learning_rate": 3.915065218719672e-07, "loss": 1.613, "step": 473 }, { "epoch": 0.09838107098381071, "grad_norm": 1.0677648323961761, "learning_rate": 3.9147085732313903e-07, "loss": 1.538, "step": 474 }, { "epoch": 0.09858862598588626, "grad_norm": 0.7344424296296667, "learning_rate": 3.914351198694546e-07, "loss": 1.6277, "step": 475 }, { "epoch": 0.0987961809879618, "grad_norm": 0.7417892882452545, "learning_rate": 3.9139930952610853e-07, "loss": 1.5946, "step": 476 }, { "epoch": 0.09900373599003735, "grad_norm": 0.7502122706201256, "learning_rate": 3.9136342630832647e-07, "loss": 1.5773, "step": 477 }, { "epoch": 0.09921129099211291, "grad_norm": 0.8505035409534525, "learning_rate": 3.9132747023136496e-07, "loss": 1.5699, "step": 478 }, { "epoch": 0.09941884599418846, "grad_norm": 0.7733069680646283, "learning_rate": 3.9129144131051163e-07, "loss": 1.5438, "step": 479 }, { "epoch": 0.099626400996264, "grad_norm": 0.8433486036854327, "learning_rate": 3.9125533956108495e-07, "loss": 1.5143, "step": 480 }, { "epoch": 0.09983395599833957, "grad_norm": 0.7807036090808576, "learning_rate": 3.9121916499843454e-07, "loss": 1.5665, "step": 481 }, { "epoch": 0.10004151100041511, "grad_norm": 0.8926143161957559, "learning_rate": 3.9118291763794067e-07, "loss": 1.5545, "step": 482 }, { "epoch": 0.10024906600249066, "grad_norm": 1.0697963944564333, "learning_rate": 3.9114659749501494e-07, "loss": 1.6027, "step": 483 }, { "epoch": 0.1004566210045662, "grad_norm": 1.2286496973445413, "learning_rate": 3.911102045850996e-07, "loss": 1.5047, "step": 484 }, { "epoch": 0.10066417600664176, "grad_norm": 0.7703375560644409, "learning_rate": 3.91073738923668e-07, "loss": 1.6369, "step": 485 }, { "epoch": 0.10087173100871731, "grad_norm": 0.9181149106643345, "learning_rate": 3.910372005262244e-07, "loss": 1.6047, "step": 486 }, { "epoch": 0.10107928601079286, "grad_norm": 0.8469189591961899, "learning_rate": 3.910005894083039e-07, "loss": 1.533, "step": 487 }, { "epoch": 0.1012868410128684, "grad_norm": 0.7074828317641544, "learning_rate": 3.9096390558547254e-07, "loss": 1.5259, "step": 488 }, { "epoch": 0.10149439601494396, "grad_norm": 0.6772308842654708, "learning_rate": 3.9092714907332743e-07, "loss": 1.5527, "step": 489 }, { "epoch": 0.10170195101701951, "grad_norm": 0.9981354518878973, "learning_rate": 3.9089031988749637e-07, "loss": 1.6285, "step": 490 }, { "epoch": 0.10190950601909506, "grad_norm": 1.4375112189702568, "learning_rate": 3.908534180436381e-07, "loss": 1.4481, "step": 491 }, { "epoch": 0.10211706102117062, "grad_norm": 0.8634973329581775, "learning_rate": 3.9081644355744246e-07, "loss": 1.5023, "step": 492 }, { "epoch": 0.10232461602324616, "grad_norm": 9.850171705636878, "learning_rate": 3.907793964446299e-07, "loss": 1.6363, "step": 493 }, { "epoch": 0.10253217102532171, "grad_norm": 0.70073257959331, "learning_rate": 3.9074227672095195e-07, "loss": 1.4827, "step": 494 }, { "epoch": 0.10273972602739725, "grad_norm": 1.2798585806140574, "learning_rate": 3.907050844021909e-07, "loss": 1.5361, "step": 495 }, { "epoch": 0.10294728102947281, "grad_norm": 2.034903110718286, "learning_rate": 3.9066781950415985e-07, "loss": 1.556, "step": 496 }, { "epoch": 0.10315483603154836, "grad_norm": 0.9206364005779825, "learning_rate": 3.906304820427029e-07, "loss": 1.5563, "step": 497 }, { "epoch": 0.10336239103362391, "grad_norm": 0.7203478132170901, "learning_rate": 3.905930720336951e-07, "loss": 1.5716, "step": 498 }, { "epoch": 0.10356994603569947, "grad_norm": 0.6886531134581791, "learning_rate": 3.9055558949304196e-07, "loss": 1.5734, "step": 499 }, { "epoch": 0.10377750103777501, "grad_norm": 0.7754719826551352, "learning_rate": 3.905180344366802e-07, "loss": 1.5378, "step": 500 }, { "epoch": 0.10398505603985056, "grad_norm": 0.7858072375779781, "learning_rate": 3.904804068805772e-07, "loss": 1.5048, "step": 501 }, { "epoch": 0.1041926110419261, "grad_norm": 0.6312276157134135, "learning_rate": 3.904427068407311e-07, "loss": 1.5893, "step": 502 }, { "epoch": 0.10440016604400167, "grad_norm": 0.8300735260334831, "learning_rate": 3.9040493433317115e-07, "loss": 1.5449, "step": 503 }, { "epoch": 0.10460772104607721, "grad_norm": 1.6859111070979174, "learning_rate": 3.9036708937395705e-07, "loss": 1.5615, "step": 504 }, { "epoch": 0.10481527604815276, "grad_norm": 0.8529431035924254, "learning_rate": 3.903291719791796e-07, "loss": 1.3919, "step": 505 }, { "epoch": 0.1050228310502283, "grad_norm": 0.721944519553621, "learning_rate": 3.902911821649602e-07, "loss": 1.5649, "step": 506 }, { "epoch": 0.10523038605230386, "grad_norm": 0.9336295365454339, "learning_rate": 3.9025311994745106e-07, "loss": 1.5755, "step": 507 }, { "epoch": 0.10543794105437941, "grad_norm": 0.6434889668535757, "learning_rate": 3.9021498534283534e-07, "loss": 1.5289, "step": 508 }, { "epoch": 0.10564549605645496, "grad_norm": 1.2063611498097493, "learning_rate": 3.901767783673267e-07, "loss": 1.5352, "step": 509 }, { "epoch": 0.10585305105853052, "grad_norm": 0.9968073691449556, "learning_rate": 3.9013849903716996e-07, "loss": 1.5469, "step": 510 }, { "epoch": 0.10606060606060606, "grad_norm": 0.6601769239968291, "learning_rate": 3.9010014736864026e-07, "loss": 1.5644, "step": 511 }, { "epoch": 0.10626816106268161, "grad_norm": 0.7924401093548435, "learning_rate": 3.9006172337804387e-07, "loss": 1.5263, "step": 512 }, { "epoch": 0.10647571606475716, "grad_norm": 0.7636002241439529, "learning_rate": 3.900232270817176e-07, "loss": 1.5028, "step": 513 }, { "epoch": 0.10668327106683272, "grad_norm": 1.1518429581330218, "learning_rate": 3.89984658496029e-07, "loss": 1.5726, "step": 514 }, { "epoch": 0.10689082606890826, "grad_norm": 0.9644898224754817, "learning_rate": 3.8994601763737644e-07, "loss": 1.5764, "step": 515 }, { "epoch": 0.10709838107098381, "grad_norm": 0.9217493040293064, "learning_rate": 3.8990730452218897e-07, "loss": 1.5671, "step": 516 }, { "epoch": 0.10730593607305935, "grad_norm": 0.840426624234865, "learning_rate": 3.898685191669264e-07, "loss": 1.5513, "step": 517 }, { "epoch": 0.10751349107513491, "grad_norm": 0.9775317833774693, "learning_rate": 3.8982966158807923e-07, "loss": 1.5325, "step": 518 }, { "epoch": 0.10772104607721046, "grad_norm": 0.8040808044825589, "learning_rate": 3.897907318021687e-07, "loss": 1.5659, "step": 519 }, { "epoch": 0.10792860107928601, "grad_norm": 5.82499117295419, "learning_rate": 3.897517298257467e-07, "loss": 1.5402, "step": 520 }, { "epoch": 0.10813615608136157, "grad_norm": 0.8871012219416111, "learning_rate": 3.897126556753958e-07, "loss": 1.5841, "step": 521 }, { "epoch": 0.10834371108343711, "grad_norm": 0.9564522521750942, "learning_rate": 3.8967350936772934e-07, "loss": 1.6025, "step": 522 }, { "epoch": 0.10855126608551266, "grad_norm": 1.0770848251096499, "learning_rate": 3.8963429091939124e-07, "loss": 1.6, "step": 523 }, { "epoch": 0.1087588210875882, "grad_norm": 0.7609324386765703, "learning_rate": 3.8959500034705625e-07, "loss": 1.4665, "step": 524 }, { "epoch": 0.10896637608966377, "grad_norm": 1.4349158794392791, "learning_rate": 3.8955563766742957e-07, "loss": 1.5503, "step": 525 }, { "epoch": 0.10917393109173931, "grad_norm": 1.402721486633433, "learning_rate": 3.895162028972472e-07, "loss": 1.5648, "step": 526 }, { "epoch": 0.10938148609381486, "grad_norm": 0.7625075276425013, "learning_rate": 3.894766960532757e-07, "loss": 1.5724, "step": 527 }, { "epoch": 0.1095890410958904, "grad_norm": 0.7975975533982912, "learning_rate": 3.8943711715231245e-07, "loss": 1.5366, "step": 528 }, { "epoch": 0.10979659609796596, "grad_norm": 0.7923064234091599, "learning_rate": 3.8939746621118527e-07, "loss": 1.5028, "step": 529 }, { "epoch": 0.11000415110004151, "grad_norm": 0.6529154776680075, "learning_rate": 3.893577432467527e-07, "loss": 1.5727, "step": 530 }, { "epoch": 0.11021170610211706, "grad_norm": 0.9128103636720795, "learning_rate": 3.893179482759039e-07, "loss": 1.5505, "step": 531 }, { "epoch": 0.11041926110419262, "grad_norm": 0.6191678444772217, "learning_rate": 3.892780813155586e-07, "loss": 1.5546, "step": 532 }, { "epoch": 0.11062681610626816, "grad_norm": 0.9440919177992679, "learning_rate": 3.8923814238266724e-07, "loss": 1.5667, "step": 533 }, { "epoch": 0.11083437110834371, "grad_norm": 0.8064140974658476, "learning_rate": 3.8919813149421076e-07, "loss": 1.4646, "step": 534 }, { "epoch": 0.11104192611041926, "grad_norm": 0.7003708073176622, "learning_rate": 3.8915804866720074e-07, "loss": 1.5063, "step": 535 }, { "epoch": 0.11124948111249482, "grad_norm": 0.7781231267946289, "learning_rate": 3.891178939186793e-07, "loss": 1.4644, "step": 536 }, { "epoch": 0.11145703611457036, "grad_norm": 0.7510775999916478, "learning_rate": 3.8907766726571915e-07, "loss": 1.5103, "step": 537 }, { "epoch": 0.11166459111664591, "grad_norm": 0.7672721518525151, "learning_rate": 3.8903736872542366e-07, "loss": 1.508, "step": 538 }, { "epoch": 0.11187214611872145, "grad_norm": 0.8585082254464806, "learning_rate": 3.8899699831492676e-07, "loss": 1.5373, "step": 539 }, { "epoch": 0.11207970112079702, "grad_norm": 0.7975243202660994, "learning_rate": 3.889565560513927e-07, "loss": 1.5246, "step": 540 }, { "epoch": 0.11228725612287256, "grad_norm": 0.7568660447624274, "learning_rate": 3.8891604195201654e-07, "loss": 1.5514, "step": 541 }, { "epoch": 0.11249481112494811, "grad_norm": 1.0936425859018952, "learning_rate": 3.888754560340238e-07, "loss": 1.5369, "step": 542 }, { "epoch": 0.11270236612702367, "grad_norm": 0.8112449139981895, "learning_rate": 3.888347983146706e-07, "loss": 1.5613, "step": 543 }, { "epoch": 0.11290992112909921, "grad_norm": 0.802355469239535, "learning_rate": 3.887940688112434e-07, "loss": 1.5359, "step": 544 }, { "epoch": 0.11311747613117476, "grad_norm": 0.6830065056849106, "learning_rate": 3.8875326754105937e-07, "loss": 1.5481, "step": 545 }, { "epoch": 0.1133250311332503, "grad_norm": 1.4454059629737248, "learning_rate": 3.887123945214662e-07, "loss": 1.5787, "step": 546 }, { "epoch": 0.11353258613532587, "grad_norm": 0.7262830908336716, "learning_rate": 3.886714497698419e-07, "loss": 1.5275, "step": 547 }, { "epoch": 0.11374014113740141, "grad_norm": 0.9513931205545216, "learning_rate": 3.886304333035951e-07, "loss": 1.5453, "step": 548 }, { "epoch": 0.11394769613947696, "grad_norm": 0.7397803180818738, "learning_rate": 3.8858934514016497e-07, "loss": 1.5529, "step": 549 }, { "epoch": 0.1141552511415525, "grad_norm": 0.7839787107858897, "learning_rate": 3.885481852970211e-07, "loss": 1.6007, "step": 550 }, { "epoch": 0.11436280614362807, "grad_norm": 2.311068445407042, "learning_rate": 3.8850695379166356e-07, "loss": 1.5355, "step": 551 }, { "epoch": 0.11457036114570361, "grad_norm": 0.7580724649485342, "learning_rate": 3.884656506416228e-07, "loss": 1.5161, "step": 552 }, { "epoch": 0.11477791614777916, "grad_norm": 1.0810735916216159, "learning_rate": 3.8842427586445994e-07, "loss": 1.5971, "step": 553 }, { "epoch": 0.11498547114985472, "grad_norm": 0.7095799681671573, "learning_rate": 3.883828294777664e-07, "loss": 1.5519, "step": 554 }, { "epoch": 0.11519302615193026, "grad_norm": 0.7776217604791965, "learning_rate": 3.8834131149916407e-07, "loss": 1.513, "step": 555 }, { "epoch": 0.11540058115400581, "grad_norm": 1.5652313619207674, "learning_rate": 3.882997219463053e-07, "loss": 1.5832, "step": 556 }, { "epoch": 0.11560813615608136, "grad_norm": 0.6584277949747235, "learning_rate": 3.8825806083687285e-07, "loss": 1.5643, "step": 557 }, { "epoch": 0.11581569115815692, "grad_norm": 0.782165356640896, "learning_rate": 3.882163281885799e-07, "loss": 1.5433, "step": 558 }, { "epoch": 0.11602324616023246, "grad_norm": 0.7278153007422951, "learning_rate": 3.8817452401917017e-07, "loss": 1.6027, "step": 559 }, { "epoch": 0.11623080116230801, "grad_norm": 0.6554798336177031, "learning_rate": 3.881326483464175e-07, "loss": 1.5715, "step": 560 }, { "epoch": 0.11643835616438356, "grad_norm": 0.6512248296103849, "learning_rate": 3.8809070118812647e-07, "loss": 1.5346, "step": 561 }, { "epoch": 0.11664591116645912, "grad_norm": 0.7426102454894176, "learning_rate": 3.880486825621319e-07, "loss": 1.5214, "step": 562 }, { "epoch": 0.11685346616853466, "grad_norm": 0.7986407710365486, "learning_rate": 3.880065924862989e-07, "loss": 1.574, "step": 563 }, { "epoch": 0.11706102117061021, "grad_norm": 0.8621164944700053, "learning_rate": 3.8796443097852313e-07, "loss": 1.531, "step": 564 }, { "epoch": 0.11726857617268577, "grad_norm": 0.9289635560898413, "learning_rate": 3.8792219805673043e-07, "loss": 1.5677, "step": 565 }, { "epoch": 0.11747613117476131, "grad_norm": 0.7164626952110217, "learning_rate": 3.878798937388773e-07, "loss": 1.5604, "step": 566 }, { "epoch": 0.11768368617683686, "grad_norm": 0.690439289874298, "learning_rate": 3.8783751804295024e-07, "loss": 1.5507, "step": 567 }, { "epoch": 0.1178912411789124, "grad_norm": 1.4033573937712738, "learning_rate": 3.8779507098696637e-07, "loss": 1.5704, "step": 568 }, { "epoch": 0.11809879618098797, "grad_norm": 0.9927587413657801, "learning_rate": 3.8775255258897304e-07, "loss": 1.5556, "step": 569 }, { "epoch": 0.11830635118306351, "grad_norm": 0.7819344606449811, "learning_rate": 3.877099628670479e-07, "loss": 1.5152, "step": 570 }, { "epoch": 0.11851390618513906, "grad_norm": 1.1938556215032117, "learning_rate": 3.8766730183929893e-07, "loss": 1.5554, "step": 571 }, { "epoch": 0.1187214611872146, "grad_norm": 1.0039061048079752, "learning_rate": 3.8762456952386466e-07, "loss": 1.5625, "step": 572 }, { "epoch": 0.11892901618929017, "grad_norm": 0.8867948281147221, "learning_rate": 3.875817659389135e-07, "loss": 1.5517, "step": 573 }, { "epoch": 0.11913657119136571, "grad_norm": 0.6159689623344549, "learning_rate": 3.8753889110264455e-07, "loss": 1.5001, "step": 574 }, { "epoch": 0.11934412619344126, "grad_norm": 0.7145769013620279, "learning_rate": 3.87495945033287e-07, "loss": 1.4948, "step": 575 }, { "epoch": 0.11955168119551682, "grad_norm": 0.6740816373877853, "learning_rate": 3.874529277491004e-07, "loss": 1.6317, "step": 576 }, { "epoch": 0.11975923619759236, "grad_norm": 0.7061900464119396, "learning_rate": 3.8740983926837455e-07, "loss": 1.5431, "step": 577 }, { "epoch": 0.11996679119966791, "grad_norm": 0.8517078816663815, "learning_rate": 3.873666796094295e-07, "loss": 1.5775, "step": 578 }, { "epoch": 0.12017434620174346, "grad_norm": 0.7228339470868413, "learning_rate": 3.8732344879061565e-07, "loss": 1.5779, "step": 579 }, { "epoch": 0.12038190120381902, "grad_norm": 1.302469628343104, "learning_rate": 3.8728014683031353e-07, "loss": 1.6022, "step": 580 }, { "epoch": 0.12058945620589456, "grad_norm": 0.8567454068966431, "learning_rate": 3.87236773746934e-07, "loss": 1.598, "step": 581 }, { "epoch": 0.12079701120797011, "grad_norm": 0.8655054910951147, "learning_rate": 3.8719332955891815e-07, "loss": 1.5958, "step": 582 }, { "epoch": 0.12100456621004566, "grad_norm": 0.9165375458943107, "learning_rate": 3.8714981428473736e-07, "loss": 1.6328, "step": 583 }, { "epoch": 0.12121212121212122, "grad_norm": 0.7174289586456537, "learning_rate": 3.8710622794289304e-07, "loss": 1.4962, "step": 584 }, { "epoch": 0.12141967621419676, "grad_norm": 0.8361028688471015, "learning_rate": 3.8706257055191706e-07, "loss": 1.5587, "step": 585 }, { "epoch": 0.12162723121627231, "grad_norm": 0.816584592324343, "learning_rate": 3.870188421303713e-07, "loss": 1.49, "step": 586 }, { "epoch": 0.12183478621834787, "grad_norm": 0.9940894117635181, "learning_rate": 3.86975042696848e-07, "loss": 1.597, "step": 587 }, { "epoch": 0.12204234122042341, "grad_norm": 0.7345683751755205, "learning_rate": 3.869311722699695e-07, "loss": 1.5783, "step": 588 }, { "epoch": 0.12224989622249896, "grad_norm": 1.7678038059645846, "learning_rate": 3.868872308683883e-07, "loss": 1.6367, "step": 589 }, { "epoch": 0.1224574512245745, "grad_norm": 0.7465387206711173, "learning_rate": 3.8684321851078714e-07, "loss": 1.5989, "step": 590 }, { "epoch": 0.12266500622665007, "grad_norm": 0.6844292182568991, "learning_rate": 3.8679913521587893e-07, "loss": 1.5181, "step": 591 }, { "epoch": 0.12287256122872561, "grad_norm": 0.7188466303561866, "learning_rate": 3.8675498100240664e-07, "loss": 1.5175, "step": 592 }, { "epoch": 0.12308011623080116, "grad_norm": 0.8255480499699501, "learning_rate": 3.8671075588914355e-07, "loss": 1.5483, "step": 593 }, { "epoch": 0.1232876712328767, "grad_norm": 0.7309100774824837, "learning_rate": 3.8666645989489293e-07, "loss": 1.5319, "step": 594 }, { "epoch": 0.12349522623495227, "grad_norm": 0.8771897348607793, "learning_rate": 3.866220930384884e-07, "loss": 1.5643, "step": 595 }, { "epoch": 0.12370278123702781, "grad_norm": 0.7195101915185197, "learning_rate": 3.865776553387934e-07, "loss": 1.555, "step": 596 }, { "epoch": 0.12391033623910336, "grad_norm": 0.896838102764189, "learning_rate": 3.865331468147018e-07, "loss": 1.5872, "step": 597 }, { "epoch": 0.12411789124117892, "grad_norm": 0.8948986325445162, "learning_rate": 3.864885674851372e-07, "loss": 1.5039, "step": 598 }, { "epoch": 0.12432544624325446, "grad_norm": 0.989534865109656, "learning_rate": 3.8644391736905393e-07, "loss": 1.5894, "step": 599 }, { "epoch": 0.12453300124533001, "grad_norm": 1.1838407541445375, "learning_rate": 3.8639919648543576e-07, "loss": 1.565, "step": 600 }, { "epoch": 0.12474055624740556, "grad_norm": 0.8347719714864444, "learning_rate": 3.8635440485329686e-07, "loss": 1.5311, "step": 601 }, { "epoch": 0.12494811124948112, "grad_norm": 1.2274611309321541, "learning_rate": 3.8630954249168156e-07, "loss": 1.5233, "step": 602 }, { "epoch": 0.12515566625155666, "grad_norm": 1.1781426891102558, "learning_rate": 3.8626460941966397e-07, "loss": 1.5471, "step": 603 }, { "epoch": 0.12536322125363222, "grad_norm": 0.8713674847966427, "learning_rate": 3.8621960565634854e-07, "loss": 1.4641, "step": 604 }, { "epoch": 0.12557077625570776, "grad_norm": 0.9842933850239068, "learning_rate": 3.861745312208697e-07, "loss": 1.5688, "step": 605 }, { "epoch": 0.12577833125778332, "grad_norm": 0.6892514619832997, "learning_rate": 3.861293861323919e-07, "loss": 1.4371, "step": 606 }, { "epoch": 0.12598588625985888, "grad_norm": 0.7223007839764674, "learning_rate": 3.8608417041010954e-07, "loss": 1.5333, "step": 607 }, { "epoch": 0.1261934412619344, "grad_norm": 1.0990379125141843, "learning_rate": 3.8603888407324724e-07, "loss": 1.48, "step": 608 }, { "epoch": 0.12640099626400997, "grad_norm": 1.4726701349794962, "learning_rate": 3.859935271410595e-07, "loss": 1.5996, "step": 609 }, { "epoch": 0.1266085512660855, "grad_norm": 0.6901756012999231, "learning_rate": 3.8594809963283083e-07, "loss": 1.5191, "step": 610 }, { "epoch": 0.12681610626816106, "grad_norm": 0.7790444189146841, "learning_rate": 3.85902601567876e-07, "loss": 1.5627, "step": 611 }, { "epoch": 0.12702366127023662, "grad_norm": 1.300846328301186, "learning_rate": 3.8585703296553934e-07, "loss": 1.5636, "step": 612 }, { "epoch": 0.12723121627231215, "grad_norm": 0.6181564106990775, "learning_rate": 3.858113938451956e-07, "loss": 1.5084, "step": 613 }, { "epoch": 0.1274387712743877, "grad_norm": 0.7520014176919081, "learning_rate": 3.857656842262492e-07, "loss": 1.4436, "step": 614 }, { "epoch": 0.12764632627646327, "grad_norm": 0.7423989341624727, "learning_rate": 3.857199041281346e-07, "loss": 1.4885, "step": 615 }, { "epoch": 0.1278538812785388, "grad_norm": 0.8581992205715749, "learning_rate": 3.8567405357031647e-07, "loss": 1.4732, "step": 616 }, { "epoch": 0.12806143628061437, "grad_norm": 0.79655873764379, "learning_rate": 3.856281325722892e-07, "loss": 1.5012, "step": 617 }, { "epoch": 0.12826899128268993, "grad_norm": 0.8829311096154718, "learning_rate": 3.8558214115357705e-07, "loss": 1.4866, "step": 618 }, { "epoch": 0.12847654628476546, "grad_norm": 0.9121143481722199, "learning_rate": 3.855360793337345e-07, "loss": 1.654, "step": 619 }, { "epoch": 0.12868410128684102, "grad_norm": 0.644108671989746, "learning_rate": 3.854899471323457e-07, "loss": 1.5721, "step": 620 }, { "epoch": 0.12889165628891655, "grad_norm": 0.8867091132403353, "learning_rate": 3.85443744569025e-07, "loss": 1.5161, "step": 621 }, { "epoch": 0.1290992112909921, "grad_norm": 0.7994182237521943, "learning_rate": 3.8539747166341625e-07, "loss": 1.5188, "step": 622 }, { "epoch": 0.12930676629306767, "grad_norm": 1.0066442837051297, "learning_rate": 3.8535112843519373e-07, "loss": 1.5315, "step": 623 }, { "epoch": 0.1295143212951432, "grad_norm": 1.0004063717443135, "learning_rate": 3.8530471490406107e-07, "loss": 1.5701, "step": 624 }, { "epoch": 0.12972187629721876, "grad_norm": 0.973800307539712, "learning_rate": 3.8525823108975234e-07, "loss": 1.6129, "step": 625 }, { "epoch": 0.12992943129929432, "grad_norm": 0.8411830780758435, "learning_rate": 3.8521167701203103e-07, "loss": 1.5453, "step": 626 }, { "epoch": 0.13013698630136986, "grad_norm": 0.9558907308226784, "learning_rate": 3.8516505269069083e-07, "loss": 1.5749, "step": 627 }, { "epoch": 0.13034454130344542, "grad_norm": 1.0057699880050142, "learning_rate": 3.851183581455551e-07, "loss": 1.5444, "step": 628 }, { "epoch": 0.13055209630552098, "grad_norm": 0.7043392810991221, "learning_rate": 3.850715933964771e-07, "loss": 1.4307, "step": 629 }, { "epoch": 0.1307596513075965, "grad_norm": 1.0214604581379991, "learning_rate": 3.850247584633401e-07, "loss": 1.5608, "step": 630 }, { "epoch": 0.13096720630967207, "grad_norm": 0.9167031619971703, "learning_rate": 3.849778533660568e-07, "loss": 1.4976, "step": 631 }, { "epoch": 0.1311747613117476, "grad_norm": 1.1057247208033947, "learning_rate": 3.849308781245703e-07, "loss": 1.5619, "step": 632 }, { "epoch": 0.13138231631382316, "grad_norm": 0.6765005367552381, "learning_rate": 3.8488383275885297e-07, "loss": 1.6289, "step": 633 }, { "epoch": 0.13158987131589872, "grad_norm": 0.6786993235088649, "learning_rate": 3.848367172889075e-07, "loss": 1.6148, "step": 634 }, { "epoch": 0.13179742631797425, "grad_norm": 0.7672669496048602, "learning_rate": 3.847895317347659e-07, "loss": 1.5458, "step": 635 }, { "epoch": 0.1320049813200498, "grad_norm": 0.9748677926790578, "learning_rate": 3.847422761164903e-07, "loss": 1.4943, "step": 636 }, { "epoch": 0.13221253632212537, "grad_norm": 0.7427900249427577, "learning_rate": 3.8469495045417266e-07, "loss": 1.5393, "step": 637 }, { "epoch": 0.1324200913242009, "grad_norm": 1.1474297045666646, "learning_rate": 3.8464755476793443e-07, "loss": 1.538, "step": 638 }, { "epoch": 0.13262764632627647, "grad_norm": 0.7909303657018172, "learning_rate": 3.84600089077927e-07, "loss": 1.5623, "step": 639 }, { "epoch": 0.13283520132835203, "grad_norm": 0.9958495430750541, "learning_rate": 3.8455255340433164e-07, "loss": 1.5238, "step": 640 }, { "epoch": 0.13304275633042756, "grad_norm": 0.9154236743521144, "learning_rate": 3.845049477673592e-07, "loss": 1.519, "step": 641 }, { "epoch": 0.13325031133250312, "grad_norm": 0.7310468539749906, "learning_rate": 3.8445727218725034e-07, "loss": 1.582, "step": 642 }, { "epoch": 0.13345786633457865, "grad_norm": 0.943954823112964, "learning_rate": 3.8440952668427537e-07, "loss": 1.5671, "step": 643 }, { "epoch": 0.1336654213366542, "grad_norm": 0.6532790701942882, "learning_rate": 3.8436171127873456e-07, "loss": 1.5546, "step": 644 }, { "epoch": 0.13387297633872977, "grad_norm": 1.5859068736896227, "learning_rate": 3.8431382599095765e-07, "loss": 1.5196, "step": 645 }, { "epoch": 0.1340805313408053, "grad_norm": 0.9087888700789262, "learning_rate": 3.842658708413042e-07, "loss": 1.5413, "step": 646 }, { "epoch": 0.13428808634288086, "grad_norm": 0.7069536964471304, "learning_rate": 3.842178458501634e-07, "loss": 1.5058, "step": 647 }, { "epoch": 0.13449564134495642, "grad_norm": 0.6928209414743897, "learning_rate": 3.841697510379544e-07, "loss": 1.5758, "step": 648 }, { "epoch": 0.13470319634703196, "grad_norm": 0.8900614144594915, "learning_rate": 3.841215864251257e-07, "loss": 1.5876, "step": 649 }, { "epoch": 0.13491075134910752, "grad_norm": 0.8163751453549446, "learning_rate": 3.8407335203215555e-07, "loss": 1.5545, "step": 650 }, { "epoch": 0.13511830635118308, "grad_norm": 1.571663378176316, "learning_rate": 3.8402504787955214e-07, "loss": 1.5665, "step": 651 }, { "epoch": 0.1353258613532586, "grad_norm": 0.8743853191616004, "learning_rate": 3.839766739878529e-07, "loss": 1.5691, "step": 652 }, { "epoch": 0.13553341635533417, "grad_norm": 0.6463130888916893, "learning_rate": 3.8392823037762524e-07, "loss": 1.4404, "step": 653 }, { "epoch": 0.1357409713574097, "grad_norm": 0.7358669078667202, "learning_rate": 3.8387971706946607e-07, "loss": 1.4908, "step": 654 }, { "epoch": 0.13594852635948526, "grad_norm": 4.926003852656765, "learning_rate": 3.8383113408400195e-07, "loss": 1.5607, "step": 655 }, { "epoch": 0.13615608136156082, "grad_norm": 0.7534878643020217, "learning_rate": 3.8378248144188905e-07, "loss": 1.5844, "step": 656 }, { "epoch": 0.13636363636363635, "grad_norm": 1.452755487773869, "learning_rate": 3.837337591638133e-07, "loss": 1.5123, "step": 657 }, { "epoch": 0.1365711913657119, "grad_norm": 1.206691011529855, "learning_rate": 3.8368496727049e-07, "loss": 1.5295, "step": 658 }, { "epoch": 0.13677874636778747, "grad_norm": 0.8921177737430342, "learning_rate": 3.8363610578266423e-07, "loss": 1.602, "step": 659 }, { "epoch": 0.136986301369863, "grad_norm": 0.8811356956302794, "learning_rate": 3.835871747211105e-07, "loss": 1.5455, "step": 660 }, { "epoch": 0.13719385637193857, "grad_norm": 0.7778385237965392, "learning_rate": 3.8353817410663314e-07, "loss": 1.467, "step": 661 }, { "epoch": 0.13740141137401413, "grad_norm": 1.138715814867982, "learning_rate": 3.834891039600658e-07, "loss": 1.4708, "step": 662 }, { "epoch": 0.13760896637608966, "grad_norm": 0.6583494532720251, "learning_rate": 3.834399643022719e-07, "loss": 1.514, "step": 663 }, { "epoch": 0.13781652137816522, "grad_norm": 0.7021534458239477, "learning_rate": 3.833907551541442e-07, "loss": 1.4935, "step": 664 }, { "epoch": 0.13802407638024075, "grad_norm": 15.807717176804518, "learning_rate": 3.833414765366052e-07, "loss": 1.6038, "step": 665 }, { "epoch": 0.1382316313823163, "grad_norm": 0.7486182420724226, "learning_rate": 3.832921284706069e-07, "loss": 1.4913, "step": 666 }, { "epoch": 0.13843918638439187, "grad_norm": 0.6721947178541794, "learning_rate": 3.8324271097713066e-07, "loss": 1.5022, "step": 667 }, { "epoch": 0.1386467413864674, "grad_norm": 0.8283243237841391, "learning_rate": 3.831932240771876e-07, "loss": 1.5541, "step": 668 }, { "epoch": 0.13885429638854296, "grad_norm": 0.708608180990852, "learning_rate": 3.831436677918182e-07, "loss": 1.6247, "step": 669 }, { "epoch": 0.13906185139061852, "grad_norm": 0.7473719286750843, "learning_rate": 3.8309404214209245e-07, "loss": 1.4948, "step": 670 }, { "epoch": 0.13926940639269406, "grad_norm": 0.7672536569018873, "learning_rate": 3.830443471491099e-07, "loss": 1.5006, "step": 671 }, { "epoch": 0.13947696139476962, "grad_norm": 0.7381461212903716, "learning_rate": 3.8299458283399956e-07, "loss": 1.5185, "step": 672 }, { "epoch": 0.13968451639684518, "grad_norm": 1.164312657342, "learning_rate": 3.8294474921791975e-07, "loss": 1.5114, "step": 673 }, { "epoch": 0.1398920713989207, "grad_norm": 0.8278958094453844, "learning_rate": 3.8289484632205856e-07, "loss": 1.6098, "step": 674 }, { "epoch": 0.14009962640099627, "grad_norm": 0.6128504397269805, "learning_rate": 3.828448741676334e-07, "loss": 1.4965, "step": 675 }, { "epoch": 0.1403071814030718, "grad_norm": 0.9878558357310917, "learning_rate": 3.827948327758909e-07, "loss": 1.5582, "step": 676 }, { "epoch": 0.14051473640514736, "grad_norm": 0.81411650946353, "learning_rate": 3.827447221681076e-07, "loss": 1.5509, "step": 677 }, { "epoch": 0.14072229140722292, "grad_norm": 1.0184776693565, "learning_rate": 3.82694542365589e-07, "loss": 1.531, "step": 678 }, { "epoch": 0.14092984640929845, "grad_norm": 1.0808084582591457, "learning_rate": 3.8264429338967025e-07, "loss": 1.5492, "step": 679 }, { "epoch": 0.141137401411374, "grad_norm": 0.9865246289865527, "learning_rate": 3.8259397526171593e-07, "loss": 1.5187, "step": 680 }, { "epoch": 0.14134495641344957, "grad_norm": 0.7618113957239873, "learning_rate": 3.8254358800311997e-07, "loss": 1.4566, "step": 681 }, { "epoch": 0.1415525114155251, "grad_norm": 0.8558260283975787, "learning_rate": 3.824931316353057e-07, "loss": 1.5643, "step": 682 }, { "epoch": 0.14176006641760067, "grad_norm": 0.8080672825169757, "learning_rate": 3.824426061797258e-07, "loss": 1.6707, "step": 683 }, { "epoch": 0.14196762141967623, "grad_norm": 0.8515026243443249, "learning_rate": 3.823920116578623e-07, "loss": 1.5116, "step": 684 }, { "epoch": 0.14217517642175176, "grad_norm": 0.835529042057423, "learning_rate": 3.823413480912267e-07, "loss": 1.6197, "step": 685 }, { "epoch": 0.14238273142382732, "grad_norm": 0.7493096217585754, "learning_rate": 3.8229061550135994e-07, "loss": 1.5813, "step": 686 }, { "epoch": 0.14259028642590285, "grad_norm": 2.785738130234306, "learning_rate": 3.822398139098319e-07, "loss": 1.4898, "step": 687 }, { "epoch": 0.1427978414279784, "grad_norm": 1.0164311430707942, "learning_rate": 3.821889433382422e-07, "loss": 1.5319, "step": 688 }, { "epoch": 0.14300539643005397, "grad_norm": 0.6812387551679115, "learning_rate": 3.8213800380821974e-07, "loss": 1.5434, "step": 689 }, { "epoch": 0.1432129514321295, "grad_norm": 0.6798379050196808, "learning_rate": 3.820869953414226e-07, "loss": 1.5579, "step": 690 }, { "epoch": 0.14342050643420506, "grad_norm": 0.8432181785766621, "learning_rate": 3.8203591795953815e-07, "loss": 1.4414, "step": 691 }, { "epoch": 0.14362806143628062, "grad_norm": 0.9751639850631328, "learning_rate": 3.819847716842832e-07, "loss": 1.5897, "step": 692 }, { "epoch": 0.14383561643835616, "grad_norm": 0.7946200883797693, "learning_rate": 3.819335565374038e-07, "loss": 1.5444, "step": 693 }, { "epoch": 0.14404317144043172, "grad_norm": 0.9780082218452316, "learning_rate": 3.818822725406752e-07, "loss": 1.6139, "step": 694 }, { "epoch": 0.14425072644250728, "grad_norm": 0.6312627232542098, "learning_rate": 3.818309197159021e-07, "loss": 1.5396, "step": 695 }, { "epoch": 0.1444582814445828, "grad_norm": 0.8541201372466952, "learning_rate": 3.8177949808491834e-07, "loss": 1.5175, "step": 696 }, { "epoch": 0.14466583644665837, "grad_norm": 1.9111219014234557, "learning_rate": 3.8172800766958694e-07, "loss": 1.5128, "step": 697 }, { "epoch": 0.1448733914487339, "grad_norm": 0.8248647559250584, "learning_rate": 3.816764484918003e-07, "loss": 1.591, "step": 698 }, { "epoch": 0.14508094645080946, "grad_norm": 2.759536118128399, "learning_rate": 3.8162482057348007e-07, "loss": 1.5544, "step": 699 }, { "epoch": 0.14528850145288502, "grad_norm": 1.552080997371948, "learning_rate": 3.81573123936577e-07, "loss": 1.5411, "step": 700 }, { "epoch": 0.14549605645496055, "grad_norm": 0.7236085099946248, "learning_rate": 3.815213586030711e-07, "loss": 1.5529, "step": 701 }, { "epoch": 0.14570361145703611, "grad_norm": 1.0174563013448228, "learning_rate": 3.814695245949718e-07, "loss": 1.5399, "step": 702 }, { "epoch": 0.14591116645911167, "grad_norm": 0.654142014727374, "learning_rate": 3.814176219343173e-07, "loss": 1.5909, "step": 703 }, { "epoch": 0.1461187214611872, "grad_norm": 1.1732915640822246, "learning_rate": 3.813656506431754e-07, "loss": 1.5265, "step": 704 }, { "epoch": 0.14632627646326277, "grad_norm": 0.9579251669320937, "learning_rate": 3.8131361074364287e-07, "loss": 1.5227, "step": 705 }, { "epoch": 0.14653383146533833, "grad_norm": 1.3655500747977294, "learning_rate": 3.8126150225784563e-07, "loss": 1.5429, "step": 706 }, { "epoch": 0.14674138646741386, "grad_norm": 0.7642083193775192, "learning_rate": 3.812093252079389e-07, "loss": 1.5179, "step": 707 }, { "epoch": 0.14694894146948942, "grad_norm": 0.6267765803780283, "learning_rate": 3.81157079616107e-07, "loss": 1.5097, "step": 708 }, { "epoch": 0.14715649647156495, "grad_norm": 1.1608112381567908, "learning_rate": 3.8110476550456325e-07, "loss": 1.4683, "step": 709 }, { "epoch": 0.1473640514736405, "grad_norm": 0.6530399334044678, "learning_rate": 3.810523828955504e-07, "loss": 1.5067, "step": 710 }, { "epoch": 0.14757160647571607, "grad_norm": 2.295421073228988, "learning_rate": 3.8099993181134e-07, "loss": 1.5557, "step": 711 }, { "epoch": 0.1477791614777916, "grad_norm": 1.3289193607487875, "learning_rate": 3.8094741227423286e-07, "loss": 1.5812, "step": 712 }, { "epoch": 0.14798671647986716, "grad_norm": 0.721445697819485, "learning_rate": 3.8089482430655895e-07, "loss": 1.4894, "step": 713 }, { "epoch": 0.14819427148194272, "grad_norm": 1.0615642218969679, "learning_rate": 3.808421679306772e-07, "loss": 1.5256, "step": 714 }, { "epoch": 0.14840182648401826, "grad_norm": 0.701827676972807, "learning_rate": 3.807894431689759e-07, "loss": 1.5193, "step": 715 }, { "epoch": 0.14860938148609382, "grad_norm": 1.0217882565219119, "learning_rate": 3.8073665004387194e-07, "loss": 1.535, "step": 716 }, { "epoch": 0.14881693648816938, "grad_norm": 0.8035415744500214, "learning_rate": 3.806837885778118e-07, "loss": 1.5679, "step": 717 }, { "epoch": 0.1490244914902449, "grad_norm": 1.652905192817568, "learning_rate": 3.806308587932706e-07, "loss": 1.5671, "step": 718 }, { "epoch": 0.14923204649232047, "grad_norm": 0.7475609740898325, "learning_rate": 3.805778607127528e-07, "loss": 1.5687, "step": 719 }, { "epoch": 0.149439601494396, "grad_norm": 0.6266407338147819, "learning_rate": 3.805247943587917e-07, "loss": 1.6176, "step": 720 }, { "epoch": 0.14964715649647156, "grad_norm": 11.176807186817417, "learning_rate": 3.8047165975394974e-07, "loss": 1.6668, "step": 721 }, { "epoch": 0.14985471149854712, "grad_norm": 0.8493468480389307, "learning_rate": 3.8041845692081833e-07, "loss": 1.5556, "step": 722 }, { "epoch": 0.15006226650062265, "grad_norm": 0.6460753231064308, "learning_rate": 3.803651858820179e-07, "loss": 1.5545, "step": 723 }, { "epoch": 0.15026982150269821, "grad_norm": 0.7042804767654001, "learning_rate": 3.803118466601979e-07, "loss": 1.5868, "step": 724 }, { "epoch": 0.15047737650477377, "grad_norm": 0.7717898594920679, "learning_rate": 3.802584392780367e-07, "loss": 1.5569, "step": 725 }, { "epoch": 0.1506849315068493, "grad_norm": 0.8045647664531013, "learning_rate": 3.802049637582418e-07, "loss": 1.6684, "step": 726 }, { "epoch": 0.15089248650892487, "grad_norm": 0.7333857345999618, "learning_rate": 3.801514201235495e-07, "loss": 1.5355, "step": 727 }, { "epoch": 0.15110004151100043, "grad_norm": 0.7489112675243124, "learning_rate": 3.8009780839672504e-07, "loss": 1.5582, "step": 728 }, { "epoch": 0.15130759651307596, "grad_norm": 0.7184462961272369, "learning_rate": 3.8004412860056293e-07, "loss": 1.5919, "step": 729 }, { "epoch": 0.15151515151515152, "grad_norm": 0.8550277459910752, "learning_rate": 3.799903807578862e-07, "loss": 1.5117, "step": 730 }, { "epoch": 0.15172270651722705, "grad_norm": 1.672129718069143, "learning_rate": 3.799365648915471e-07, "loss": 1.5472, "step": 731 }, { "epoch": 0.1519302615193026, "grad_norm": 1.095696233055675, "learning_rate": 3.798826810244267e-07, "loss": 1.5499, "step": 732 }, { "epoch": 0.15213781652137817, "grad_norm": 0.878717413960594, "learning_rate": 3.798287291794349e-07, "loss": 1.5424, "step": 733 }, { "epoch": 0.1523453715234537, "grad_norm": 0.7309357288980483, "learning_rate": 3.7977470937951073e-07, "loss": 1.5546, "step": 734 }, { "epoch": 0.15255292652552926, "grad_norm": 0.7631169956712764, "learning_rate": 3.797206216476219e-07, "loss": 1.5668, "step": 735 }, { "epoch": 0.15276048152760482, "grad_norm": 0.7792678761367231, "learning_rate": 3.7966646600676515e-07, "loss": 1.5557, "step": 736 }, { "epoch": 0.15296803652968036, "grad_norm": 1.3692447146852718, "learning_rate": 3.7961224247996585e-07, "loss": 1.5175, "step": 737 }, { "epoch": 0.15317559153175592, "grad_norm": 1.3137847200593238, "learning_rate": 3.7955795109027854e-07, "loss": 1.4981, "step": 738 }, { "epoch": 0.15338314653383148, "grad_norm": 1.7798928876772429, "learning_rate": 3.7950359186078647e-07, "loss": 1.5234, "step": 739 }, { "epoch": 0.153590701535907, "grad_norm": 0.9522088672416029, "learning_rate": 3.794491648146017e-07, "loss": 1.5268, "step": 740 }, { "epoch": 0.15379825653798257, "grad_norm": 0.9478963550459683, "learning_rate": 3.7939466997486516e-07, "loss": 1.4965, "step": 741 }, { "epoch": 0.1540058115400581, "grad_norm": 1.221112506801334, "learning_rate": 3.793401073647467e-07, "loss": 1.5211, "step": 742 }, { "epoch": 0.15421336654213366, "grad_norm": 0.8030798001089229, "learning_rate": 3.792854770074448e-07, "loss": 1.5356, "step": 743 }, { "epoch": 0.15442092154420922, "grad_norm": 0.7140004349172026, "learning_rate": 3.7923077892618686e-07, "loss": 1.6048, "step": 744 }, { "epoch": 0.15462847654628475, "grad_norm": 1.0222560299712333, "learning_rate": 3.791760131442291e-07, "loss": 1.6742, "step": 745 }, { "epoch": 0.15483603154836031, "grad_norm": 0.8331242423356838, "learning_rate": 3.791211796848563e-07, "loss": 1.5261, "step": 746 }, { "epoch": 0.15504358655043587, "grad_norm": 0.7886865686912353, "learning_rate": 3.7906627857138245e-07, "loss": 1.4505, "step": 747 }, { "epoch": 0.1552511415525114, "grad_norm": 10.464886450965508, "learning_rate": 3.790113098271499e-07, "loss": 1.5242, "step": 748 }, { "epoch": 0.15545869655458697, "grad_norm": 0.6984608327745523, "learning_rate": 3.7895627347552994e-07, "loss": 1.4947, "step": 749 }, { "epoch": 0.15566625155666253, "grad_norm": 0.6997890084690677, "learning_rate": 3.7890116953992245e-07, "loss": 1.54, "step": 750 }, { "epoch": 0.15587380655873806, "grad_norm": 0.7052120132426379, "learning_rate": 3.7884599804375637e-07, "loss": 1.5285, "step": 751 }, { "epoch": 0.15608136156081362, "grad_norm": 0.761894932884719, "learning_rate": 3.78790759010489e-07, "loss": 1.4949, "step": 752 }, { "epoch": 0.15628891656288915, "grad_norm": 0.8524777648347153, "learning_rate": 3.787354524636066e-07, "loss": 1.5505, "step": 753 }, { "epoch": 0.1564964715649647, "grad_norm": 0.7626434368528677, "learning_rate": 3.7868007842662394e-07, "loss": 1.5718, "step": 754 }, { "epoch": 0.15670402656704027, "grad_norm": 0.9902730510266881, "learning_rate": 3.786246369230846e-07, "loss": 1.5988, "step": 755 }, { "epoch": 0.1569115815691158, "grad_norm": 0.7973870159346816, "learning_rate": 3.78569127976561e-07, "loss": 1.6035, "step": 756 }, { "epoch": 0.15711913657119136, "grad_norm": 0.890835409053048, "learning_rate": 3.785135516106539e-07, "loss": 1.4627, "step": 757 }, { "epoch": 0.15732669157326692, "grad_norm": 0.8050582158496478, "learning_rate": 3.784579078489929e-07, "loss": 1.567, "step": 758 }, { "epoch": 0.15753424657534246, "grad_norm": 0.7023748694599736, "learning_rate": 3.784021967152364e-07, "loss": 1.5677, "step": 759 }, { "epoch": 0.15774180157741802, "grad_norm": 0.961776143990859, "learning_rate": 3.7834641823307115e-07, "loss": 1.5946, "step": 760 }, { "epoch": 0.15794935657949358, "grad_norm": 0.7591178517703842, "learning_rate": 3.782905724262127e-07, "loss": 1.5049, "step": 761 }, { "epoch": 0.1581569115815691, "grad_norm": 0.7277432229358922, "learning_rate": 3.782346593184053e-07, "loss": 1.5254, "step": 762 }, { "epoch": 0.15836446658364467, "grad_norm": 0.6975379805271739, "learning_rate": 3.781786789334216e-07, "loss": 1.6107, "step": 763 }, { "epoch": 0.1585720215857202, "grad_norm": 0.7293344042173459, "learning_rate": 3.78122631295063e-07, "loss": 1.5843, "step": 764 }, { "epoch": 0.15877957658779576, "grad_norm": 1.0190360085064099, "learning_rate": 3.780665164271595e-07, "loss": 1.5502, "step": 765 }, { "epoch": 0.15898713158987132, "grad_norm": 1.1213296450340349, "learning_rate": 3.780103343535697e-07, "loss": 1.4563, "step": 766 }, { "epoch": 0.15919468659194685, "grad_norm": 2.250107999233427, "learning_rate": 3.779540850981806e-07, "loss": 1.6008, "step": 767 }, { "epoch": 0.15940224159402241, "grad_norm": 0.6866332849690373, "learning_rate": 3.77897768684908e-07, "loss": 1.4932, "step": 768 }, { "epoch": 0.15960979659609797, "grad_norm": 0.882567229268511, "learning_rate": 3.778413851376961e-07, "loss": 1.5265, "step": 769 }, { "epoch": 0.1598173515981735, "grad_norm": 0.8981771172535955, "learning_rate": 3.777849344805177e-07, "loss": 1.4927, "step": 770 }, { "epoch": 0.16002490660024907, "grad_norm": 1.8562640629195606, "learning_rate": 3.7772841673737406e-07, "loss": 1.5417, "step": 771 }, { "epoch": 0.16023246160232463, "grad_norm": 0.7691663161504118, "learning_rate": 3.776718319322951e-07, "loss": 1.557, "step": 772 }, { "epoch": 0.16044001660440016, "grad_norm": 0.7789547731555111, "learning_rate": 3.776151800893392e-07, "loss": 1.5567, "step": 773 }, { "epoch": 0.16064757160647572, "grad_norm": 1.0760219210740214, "learning_rate": 3.7755846123259316e-07, "loss": 1.5842, "step": 774 }, { "epoch": 0.16085512660855128, "grad_norm": 1.4460571422597417, "learning_rate": 3.7750167538617225e-07, "loss": 1.552, "step": 775 }, { "epoch": 0.1610626816106268, "grad_norm": 0.7845528737582512, "learning_rate": 3.7744482257422046e-07, "loss": 1.4935, "step": 776 }, { "epoch": 0.16127023661270237, "grad_norm": 1.2498517734264016, "learning_rate": 3.7738790282091e-07, "loss": 1.4891, "step": 777 }, { "epoch": 0.1614777916147779, "grad_norm": 1.0218972126350483, "learning_rate": 3.773309161504417e-07, "loss": 1.5006, "step": 778 }, { "epoch": 0.16168534661685346, "grad_norm": 0.819571163501824, "learning_rate": 3.7727386258704484e-07, "loss": 1.5268, "step": 779 }, { "epoch": 0.16189290161892902, "grad_norm": 0.8005862565577075, "learning_rate": 3.772167421549769e-07, "loss": 1.5299, "step": 780 }, { "epoch": 0.16210045662100456, "grad_norm": 0.9139233624251284, "learning_rate": 3.7715955487852404e-07, "loss": 1.4307, "step": 781 }, { "epoch": 0.16230801162308012, "grad_norm": 0.7901949312198839, "learning_rate": 3.7710230078200087e-07, "loss": 1.5438, "step": 782 }, { "epoch": 0.16251556662515568, "grad_norm": 0.7869595762301359, "learning_rate": 3.770449798897502e-07, "loss": 1.4998, "step": 783 }, { "epoch": 0.1627231216272312, "grad_norm": 1.1239926204726076, "learning_rate": 3.7698759222614333e-07, "loss": 1.5444, "step": 784 }, { "epoch": 0.16293067662930677, "grad_norm": 0.9839945942049779, "learning_rate": 3.7693013781558007e-07, "loss": 1.5096, "step": 785 }, { "epoch": 0.16313823163138233, "grad_norm": 0.8446556706083901, "learning_rate": 3.7687261668248846e-07, "loss": 1.6214, "step": 786 }, { "epoch": 0.16334578663345786, "grad_norm": 0.8215434477984572, "learning_rate": 3.7681502885132505e-07, "loss": 1.5724, "step": 787 }, { "epoch": 0.16355334163553342, "grad_norm": 1.3136394967632907, "learning_rate": 3.7675737434657443e-07, "loss": 1.5573, "step": 788 }, { "epoch": 0.16376089663760895, "grad_norm": 0.9960939864046063, "learning_rate": 3.7669965319275007e-07, "loss": 1.5317, "step": 789 }, { "epoch": 0.16396845163968451, "grad_norm": 2.882541005839287, "learning_rate": 3.766418654143932e-07, "loss": 1.5517, "step": 790 }, { "epoch": 0.16417600664176007, "grad_norm": 1.9054982923296206, "learning_rate": 3.765840110360738e-07, "loss": 1.6166, "step": 791 }, { "epoch": 0.1643835616438356, "grad_norm": 0.7741643971008524, "learning_rate": 3.7652609008238994e-07, "loss": 1.613, "step": 792 }, { "epoch": 0.16459111664591117, "grad_norm": 0.7858165882736603, "learning_rate": 3.7646810257796815e-07, "loss": 1.5427, "step": 793 }, { "epoch": 0.16479867164798673, "grad_norm": 0.7609302281547298, "learning_rate": 3.7641004854746316e-07, "loss": 1.4877, "step": 794 }, { "epoch": 0.16500622665006226, "grad_norm": 1.0767136850229462, "learning_rate": 3.763519280155579e-07, "loss": 1.4492, "step": 795 }, { "epoch": 0.16521378165213782, "grad_norm": 1.0139039715169584, "learning_rate": 3.762937410069638e-07, "loss": 1.6387, "step": 796 }, { "epoch": 0.16542133665421338, "grad_norm": 1.0029450432854055, "learning_rate": 3.762354875464204e-07, "loss": 1.5181, "step": 797 }, { "epoch": 0.1656288916562889, "grad_norm": 0.6617654130628386, "learning_rate": 3.761771676586955e-07, "loss": 1.6173, "step": 798 }, { "epoch": 0.16583644665836447, "grad_norm": 0.7694723985765943, "learning_rate": 3.7611878136858515e-07, "loss": 1.5429, "step": 799 }, { "epoch": 0.16604400166044, "grad_norm": 1.4364158600848693, "learning_rate": 3.7606032870091375e-07, "loss": 1.5577, "step": 800 }, { "epoch": 0.16625155666251556, "grad_norm": 0.9573899326543218, "learning_rate": 3.7600180968053367e-07, "loss": 1.5363, "step": 801 }, { "epoch": 0.16645911166459113, "grad_norm": 0.7776387411836702, "learning_rate": 3.7594322433232577e-07, "loss": 1.4798, "step": 802 }, { "epoch": 0.16666666666666666, "grad_norm": 0.7716791738728939, "learning_rate": 3.7588457268119895e-07, "loss": 1.4645, "step": 803 }, { "epoch": 0.16687422166874222, "grad_norm": 0.739806369506702, "learning_rate": 3.7582585475209034e-07, "loss": 1.5515, "step": 804 }, { "epoch": 0.16708177667081778, "grad_norm": 0.7183959212972962, "learning_rate": 3.7576707056996513e-07, "loss": 1.5006, "step": 805 }, { "epoch": 0.1672893316728933, "grad_norm": 0.6887813563888008, "learning_rate": 3.75708220159817e-07, "loss": 1.5263, "step": 806 }, { "epoch": 0.16749688667496887, "grad_norm": 1.299764998227441, "learning_rate": 3.7564930354666746e-07, "loss": 1.5331, "step": 807 }, { "epoch": 0.16770444167704443, "grad_norm": 1.285255017078133, "learning_rate": 3.755903207555663e-07, "loss": 1.6044, "step": 808 }, { "epoch": 0.16791199667911996, "grad_norm": 0.9214982513478615, "learning_rate": 3.7553127181159143e-07, "loss": 1.5554, "step": 809 }, { "epoch": 0.16811955168119552, "grad_norm": 0.951877560860422, "learning_rate": 3.7547215673984887e-07, "loss": 1.4915, "step": 810 }, { "epoch": 0.16832710668327105, "grad_norm": 0.7164806531172606, "learning_rate": 3.754129755654729e-07, "loss": 1.4705, "step": 811 }, { "epoch": 0.16853466168534662, "grad_norm": 0.7600708642780232, "learning_rate": 3.753537283136256e-07, "loss": 1.5385, "step": 812 }, { "epoch": 0.16874221668742218, "grad_norm": 1.3021846223681952, "learning_rate": 3.7529441500949746e-07, "loss": 1.5315, "step": 813 }, { "epoch": 0.1689497716894977, "grad_norm": 1.0454533350964896, "learning_rate": 3.7523503567830695e-07, "loss": 1.5139, "step": 814 }, { "epoch": 0.16915732669157327, "grad_norm": 1.063871912964034, "learning_rate": 3.751755903453005e-07, "loss": 1.4556, "step": 815 }, { "epoch": 0.16936488169364883, "grad_norm": 0.7236547821306161, "learning_rate": 3.751160790357527e-07, "loss": 1.4297, "step": 816 }, { "epoch": 0.16957243669572436, "grad_norm": 0.8154056622027921, "learning_rate": 3.750565017749662e-07, "loss": 1.5575, "step": 817 }, { "epoch": 0.16977999169779992, "grad_norm": 0.7501780708777914, "learning_rate": 3.7499685858827163e-07, "loss": 1.5315, "step": 818 }, { "epoch": 0.16998754669987548, "grad_norm": 1.5401868414692843, "learning_rate": 3.7493714950102775e-07, "loss": 1.4922, "step": 819 }, { "epoch": 0.170195101701951, "grad_norm": 0.8836452541027164, "learning_rate": 3.748773745386212e-07, "loss": 1.6307, "step": 820 }, { "epoch": 0.17040265670402657, "grad_norm": 0.9940818448233938, "learning_rate": 3.748175337264669e-07, "loss": 1.5827, "step": 821 }, { "epoch": 0.1706102117061021, "grad_norm": 1.2346967925024543, "learning_rate": 3.747576270900073e-07, "loss": 1.5735, "step": 822 }, { "epoch": 0.17081776670817767, "grad_norm": 0.7040502153082228, "learning_rate": 3.746976546547132e-07, "loss": 1.5422, "step": 823 }, { "epoch": 0.17102532171025323, "grad_norm": 2.1395500261759843, "learning_rate": 3.7463761644608345e-07, "loss": 1.5326, "step": 824 }, { "epoch": 0.17123287671232876, "grad_norm": 0.8031348156184384, "learning_rate": 3.7457751248964453e-07, "loss": 1.5139, "step": 825 }, { "epoch": 0.17144043171440432, "grad_norm": 0.9867951561957972, "learning_rate": 3.7451734281095113e-07, "loss": 1.5467, "step": 826 }, { "epoch": 0.17164798671647988, "grad_norm": 0.7330872726008544, "learning_rate": 3.744571074355857e-07, "loss": 1.4889, "step": 827 }, { "epoch": 0.1718555417185554, "grad_norm": 0.6702570202411895, "learning_rate": 3.7439680638915883e-07, "loss": 1.5805, "step": 828 }, { "epoch": 0.17206309672063097, "grad_norm": 1.7414225240176324, "learning_rate": 3.743364396973089e-07, "loss": 1.5067, "step": 829 }, { "epoch": 0.17227065172270653, "grad_norm": 1.3788620966291183, "learning_rate": 3.7427600738570223e-07, "loss": 1.5045, "step": 830 }, { "epoch": 0.17247820672478206, "grad_norm": 1.0441706124192782, "learning_rate": 3.7421550948003293e-07, "loss": 1.5275, "step": 831 }, { "epoch": 0.17268576172685762, "grad_norm": 0.7625393754633494, "learning_rate": 3.741549460060233e-07, "loss": 1.4881, "step": 832 }, { "epoch": 0.17289331672893316, "grad_norm": 1.0895229101877355, "learning_rate": 3.740943169894232e-07, "loss": 1.5459, "step": 833 }, { "epoch": 0.17310087173100872, "grad_norm": 0.8040144948497726, "learning_rate": 3.740336224560104e-07, "loss": 1.4751, "step": 834 }, { "epoch": 0.17330842673308428, "grad_norm": 0.7585509525417161, "learning_rate": 3.739728624315907e-07, "loss": 1.6045, "step": 835 }, { "epoch": 0.1735159817351598, "grad_norm": 0.7499658473582391, "learning_rate": 3.739120369419977e-07, "loss": 1.5168, "step": 836 }, { "epoch": 0.17372353673723537, "grad_norm": 0.6851865494435526, "learning_rate": 3.738511460130927e-07, "loss": 1.5575, "step": 837 }, { "epoch": 0.17393109173931093, "grad_norm": 0.7406286892034353, "learning_rate": 3.737901896707649e-07, "loss": 1.5889, "step": 838 }, { "epoch": 0.17413864674138646, "grad_norm": 1.5302447360162559, "learning_rate": 3.737291679409314e-07, "loss": 1.5361, "step": 839 }, { "epoch": 0.17434620174346202, "grad_norm": 0.7674357037181953, "learning_rate": 3.7366808084953694e-07, "loss": 1.517, "step": 840 }, { "epoch": 0.17455375674553758, "grad_norm": 0.8205955230958445, "learning_rate": 3.736069284225542e-07, "loss": 1.5043, "step": 841 }, { "epoch": 0.1747613117476131, "grad_norm": 1.0117168377981043, "learning_rate": 3.7354571068598346e-07, "loss": 1.5449, "step": 842 }, { "epoch": 0.17496886674968867, "grad_norm": 1.246631413984115, "learning_rate": 3.7348442766585297e-07, "loss": 1.4927, "step": 843 }, { "epoch": 0.1751764217517642, "grad_norm": 0.9163661457828539, "learning_rate": 3.734230793882186e-07, "loss": 1.5516, "step": 844 }, { "epoch": 0.17538397675383977, "grad_norm": 1.9286964648397114, "learning_rate": 3.733616658791641e-07, "loss": 1.5384, "step": 845 }, { "epoch": 0.17559153175591533, "grad_norm": 0.7276743750278539, "learning_rate": 3.733001871648007e-07, "loss": 1.5774, "step": 846 }, { "epoch": 0.17579908675799086, "grad_norm": 0.69341900550815, "learning_rate": 3.732386432712677e-07, "loss": 1.5085, "step": 847 }, { "epoch": 0.17600664176006642, "grad_norm": 1.2247329914173868, "learning_rate": 3.7317703422473176e-07, "loss": 1.6026, "step": 848 }, { "epoch": 0.17621419676214198, "grad_norm": 0.7201691933069883, "learning_rate": 3.731153600513874e-07, "loss": 1.53, "step": 849 }, { "epoch": 0.1764217517642175, "grad_norm": 0.7935021363305134, "learning_rate": 3.730536207774571e-07, "loss": 1.5283, "step": 850 }, { "epoch": 0.17662930676629307, "grad_norm": 0.8553407464966226, "learning_rate": 3.729918164291905e-07, "loss": 1.5535, "step": 851 }, { "epoch": 0.17683686176836863, "grad_norm": 0.8521875048893446, "learning_rate": 3.729299470328653e-07, "loss": 1.5066, "step": 852 }, { "epoch": 0.17704441677044416, "grad_norm": 0.6300596698085998, "learning_rate": 3.728680126147867e-07, "loss": 1.4873, "step": 853 }, { "epoch": 0.17725197177251972, "grad_norm": 0.9716499542134922, "learning_rate": 3.728060132012875e-07, "loss": 1.4733, "step": 854 }, { "epoch": 0.17745952677459526, "grad_norm": 0.9470577841313167, "learning_rate": 3.7274394881872825e-07, "loss": 1.5461, "step": 855 }, { "epoch": 0.17766708177667082, "grad_norm": 0.6711753702436024, "learning_rate": 3.7268181949349707e-07, "loss": 1.4783, "step": 856 }, { "epoch": 0.17787463677874638, "grad_norm": 0.7533868330444985, "learning_rate": 3.7261962525200975e-07, "loss": 1.5497, "step": 857 }, { "epoch": 0.1780821917808219, "grad_norm": 0.771491046373101, "learning_rate": 3.725573661207096e-07, "loss": 1.6642, "step": 858 }, { "epoch": 0.17828974678289747, "grad_norm": 2.0122488463077763, "learning_rate": 3.724950421260675e-07, "loss": 1.6609, "step": 859 }, { "epoch": 0.17849730178497303, "grad_norm": 1.0075132812128422, "learning_rate": 3.7243265329458207e-07, "loss": 1.4208, "step": 860 }, { "epoch": 0.17870485678704856, "grad_norm": 0.6737551666121391, "learning_rate": 3.7237019965277925e-07, "loss": 1.5304, "step": 861 }, { "epoch": 0.17891241178912412, "grad_norm": 0.7536570759048754, "learning_rate": 3.7230768122721276e-07, "loss": 1.4994, "step": 862 }, { "epoch": 0.17911996679119968, "grad_norm": 0.8170442508203793, "learning_rate": 3.7224509804446374e-07, "loss": 1.5288, "step": 863 }, { "epoch": 0.1793275217932752, "grad_norm": 1.0398047490069944, "learning_rate": 3.7218245013114096e-07, "loss": 1.5256, "step": 864 }, { "epoch": 0.17953507679535077, "grad_norm": 1.060674553509126, "learning_rate": 3.721197375138805e-07, "loss": 1.6188, "step": 865 }, { "epoch": 0.1797426317974263, "grad_norm": 0.705775955581245, "learning_rate": 3.720569602193463e-07, "loss": 1.5185, "step": 866 }, { "epoch": 0.17995018679950187, "grad_norm": 0.8728614881558762, "learning_rate": 3.7199411827422945e-07, "loss": 1.5611, "step": 867 }, { "epoch": 0.18015774180157743, "grad_norm": 0.6688368020576857, "learning_rate": 3.719312117052487e-07, "loss": 1.5199, "step": 868 }, { "epoch": 0.18036529680365296, "grad_norm": 0.7063100522250276, "learning_rate": 3.7186824053915037e-07, "loss": 1.5521, "step": 869 }, { "epoch": 0.18057285180572852, "grad_norm": 0.8757094097801504, "learning_rate": 3.7180520480270794e-07, "loss": 1.5097, "step": 870 }, { "epoch": 0.18078040680780408, "grad_norm": 0.6748031450837038, "learning_rate": 3.7174210452272264e-07, "loss": 1.6066, "step": 871 }, { "epoch": 0.1809879618098796, "grad_norm": 0.8329110099647076, "learning_rate": 3.716789397260231e-07, "loss": 1.4707, "step": 872 }, { "epoch": 0.18119551681195517, "grad_norm": 2.6263362634731564, "learning_rate": 3.7161571043946514e-07, "loss": 1.4995, "step": 873 }, { "epoch": 0.18140307181403073, "grad_norm": 0.804298429773292, "learning_rate": 3.715524166899323e-07, "loss": 1.5935, "step": 874 }, { "epoch": 0.18161062681610626, "grad_norm": 0.9471952862287576, "learning_rate": 3.714890585043354e-07, "loss": 1.4568, "step": 875 }, { "epoch": 0.18181818181818182, "grad_norm": 0.6480174575770449, "learning_rate": 3.7142563590961257e-07, "loss": 1.5473, "step": 876 }, { "epoch": 0.18202573682025736, "grad_norm": 0.6933639173981995, "learning_rate": 3.7136214893272957e-07, "loss": 1.4835, "step": 877 }, { "epoch": 0.18223329182233292, "grad_norm": 0.877362937258417, "learning_rate": 3.712985976006792e-07, "loss": 1.4228, "step": 878 }, { "epoch": 0.18244084682440848, "grad_norm": 0.9080778927031753, "learning_rate": 3.712349819404819e-07, "loss": 1.573, "step": 879 }, { "epoch": 0.182648401826484, "grad_norm": 0.7452963501835226, "learning_rate": 3.7117130197918535e-07, "loss": 1.5271, "step": 880 }, { "epoch": 0.18285595682855957, "grad_norm": 0.7400615040014613, "learning_rate": 3.711075577438645e-07, "loss": 1.6004, "step": 881 }, { "epoch": 0.18306351183063513, "grad_norm": 0.6838797669559068, "learning_rate": 3.7104374926162186e-07, "loss": 1.4635, "step": 882 }, { "epoch": 0.18327106683271066, "grad_norm": 0.7255774359287342, "learning_rate": 3.70979876559587e-07, "loss": 1.493, "step": 883 }, { "epoch": 0.18347862183478622, "grad_norm": 0.6900781540426623, "learning_rate": 3.709159396649169e-07, "loss": 1.5631, "step": 884 }, { "epoch": 0.18368617683686178, "grad_norm": 0.8364041827031164, "learning_rate": 3.708519386047959e-07, "loss": 1.5954, "step": 885 }, { "epoch": 0.1838937318389373, "grad_norm": 0.8083933588543124, "learning_rate": 3.707878734064354e-07, "loss": 1.5223, "step": 886 }, { "epoch": 0.18410128684101287, "grad_norm": 0.7386738334350924, "learning_rate": 3.7072374409707437e-07, "loss": 1.537, "step": 887 }, { "epoch": 0.1843088418430884, "grad_norm": 1.380905427986444, "learning_rate": 3.7065955070397884e-07, "loss": 1.5444, "step": 888 }, { "epoch": 0.18451639684516397, "grad_norm": 1.6275491039959697, "learning_rate": 3.705952932544421e-07, "loss": 1.4941, "step": 889 }, { "epoch": 0.18472395184723953, "grad_norm": 1.5905168044510367, "learning_rate": 3.7053097177578477e-07, "loss": 1.5098, "step": 890 }, { "epoch": 0.18493150684931506, "grad_norm": 0.8233384585158533, "learning_rate": 3.7046658629535463e-07, "loss": 1.5493, "step": 891 }, { "epoch": 0.18513906185139062, "grad_norm": 0.6738388983752422, "learning_rate": 3.704021368405266e-07, "loss": 1.5084, "step": 892 }, { "epoch": 0.18534661685346618, "grad_norm": 1.0263124288396517, "learning_rate": 3.70337623438703e-07, "loss": 1.5326, "step": 893 }, { "epoch": 0.1855541718555417, "grad_norm": 0.653321010932783, "learning_rate": 3.7027304611731314e-07, "loss": 1.4953, "step": 894 }, { "epoch": 0.18576172685761727, "grad_norm": 1.6973143792482301, "learning_rate": 3.702084049038136e-07, "loss": 1.5212, "step": 895 }, { "epoch": 0.18596928185969283, "grad_norm": 0.6893987881742105, "learning_rate": 3.7014369982568806e-07, "loss": 1.5523, "step": 896 }, { "epoch": 0.18617683686176836, "grad_norm": 0.8584832425803451, "learning_rate": 3.700789309104475e-07, "loss": 1.4919, "step": 897 }, { "epoch": 0.18638439186384392, "grad_norm": 0.7513471717647343, "learning_rate": 3.700140981856298e-07, "loss": 1.5182, "step": 898 }, { "epoch": 0.18659194686591946, "grad_norm": 0.7136741726048977, "learning_rate": 3.699492016788003e-07, "loss": 1.5409, "step": 899 }, { "epoch": 0.18679950186799502, "grad_norm": 0.7957898079494814, "learning_rate": 3.6988424141755104e-07, "loss": 1.5401, "step": 900 }, { "epoch": 0.18700705687007058, "grad_norm": 0.9799306871328185, "learning_rate": 3.6981921742950164e-07, "loss": 1.603, "step": 901 }, { "epoch": 0.1872146118721461, "grad_norm": 1.5115008253198206, "learning_rate": 3.6975412974229847e-07, "loss": 1.5116, "step": 902 }, { "epoch": 0.18742216687422167, "grad_norm": 0.6879554751128457, "learning_rate": 3.6968897838361505e-07, "loss": 1.5114, "step": 903 }, { "epoch": 0.18762972187629723, "grad_norm": 0.9754190823941983, "learning_rate": 3.69623763381152e-07, "loss": 1.4938, "step": 904 }, { "epoch": 0.18783727687837276, "grad_norm": 1.4617529588786387, "learning_rate": 3.6955848476263706e-07, "loss": 1.5417, "step": 905 }, { "epoch": 0.18804483188044832, "grad_norm": 0.7202516177903175, "learning_rate": 3.69493142555825e-07, "loss": 1.5363, "step": 906 }, { "epoch": 0.18825238688252388, "grad_norm": 0.8183756175881098, "learning_rate": 3.694277367884975e-07, "loss": 1.5479, "step": 907 }, { "epoch": 0.1884599418845994, "grad_norm": 0.7059895066638223, "learning_rate": 3.693622674884634e-07, "loss": 1.5138, "step": 908 }, { "epoch": 0.18866749688667497, "grad_norm": 0.810042677809564, "learning_rate": 3.6929673468355846e-07, "loss": 1.513, "step": 909 }, { "epoch": 0.1888750518887505, "grad_norm": 0.6904054094558383, "learning_rate": 3.692311384016457e-07, "loss": 1.5107, "step": 910 }, { "epoch": 0.18908260689082607, "grad_norm": 0.711361133621851, "learning_rate": 3.691654786706146e-07, "loss": 1.561, "step": 911 }, { "epoch": 0.18929016189290163, "grad_norm": 0.9074535190825969, "learning_rate": 3.6909975551838215e-07, "loss": 1.5117, "step": 912 }, { "epoch": 0.18949771689497716, "grad_norm": 1.0371280666571567, "learning_rate": 3.69033968972892e-07, "loss": 1.5839, "step": 913 }, { "epoch": 0.18970527189705272, "grad_norm": 0.7474696174083512, "learning_rate": 3.689681190621149e-07, "loss": 1.484, "step": 914 }, { "epoch": 0.18991282689912828, "grad_norm": 0.8819371001062164, "learning_rate": 3.689022058140484e-07, "loss": 1.4513, "step": 915 }, { "epoch": 0.1901203819012038, "grad_norm": 0.7444399301468259, "learning_rate": 3.6883622925671715e-07, "loss": 1.6168, "step": 916 }, { "epoch": 0.19032793690327937, "grad_norm": 0.9405246697918023, "learning_rate": 3.687701894181726e-07, "loss": 1.5099, "step": 917 }, { "epoch": 0.19053549190535493, "grad_norm": 0.8131743488768268, "learning_rate": 3.6870408632649315e-07, "loss": 1.5917, "step": 918 }, { "epoch": 0.19074304690743046, "grad_norm": 0.7051740423254481, "learning_rate": 3.68637920009784e-07, "loss": 1.6081, "step": 919 }, { "epoch": 0.19095060190950602, "grad_norm": 0.66563976531375, "learning_rate": 3.6857169049617746e-07, "loss": 1.4995, "step": 920 }, { "epoch": 0.19115815691158156, "grad_norm": 1.0106331116387341, "learning_rate": 3.6850539781383237e-07, "loss": 1.5841, "step": 921 }, { "epoch": 0.19136571191365712, "grad_norm": 1.6137885675277175, "learning_rate": 3.684390419909348e-07, "loss": 1.4778, "step": 922 }, { "epoch": 0.19157326691573268, "grad_norm": 0.9493816808872487, "learning_rate": 3.6837262305569744e-07, "loss": 1.5693, "step": 923 }, { "epoch": 0.1917808219178082, "grad_norm": 1.2255426237804923, "learning_rate": 3.6830614103635976e-07, "loss": 1.46, "step": 924 }, { "epoch": 0.19198837691988377, "grad_norm": 0.7348959172677256, "learning_rate": 3.6823959596118825e-07, "loss": 1.5317, "step": 925 }, { "epoch": 0.19219593192195933, "grad_norm": 0.683884654379837, "learning_rate": 3.68172987858476e-07, "loss": 1.5956, "step": 926 }, { "epoch": 0.19240348692403486, "grad_norm": 1.1680965814363948, "learning_rate": 3.6810631675654316e-07, "loss": 1.5995, "step": 927 }, { "epoch": 0.19261104192611042, "grad_norm": 1.0226787982557772, "learning_rate": 3.680395826837364e-07, "loss": 1.5572, "step": 928 }, { "epoch": 0.19281859692818598, "grad_norm": 0.7532513466103506, "learning_rate": 3.6797278566842935e-07, "loss": 1.5199, "step": 929 }, { "epoch": 0.1930261519302615, "grad_norm": 1.281973832103749, "learning_rate": 3.679059257390223e-07, "loss": 1.4541, "step": 930 }, { "epoch": 0.19323370693233707, "grad_norm": 0.7431152726788054, "learning_rate": 3.678390029239422e-07, "loss": 1.4473, "step": 931 }, { "epoch": 0.1934412619344126, "grad_norm": 0.693932407814135, "learning_rate": 3.6777201725164303e-07, "loss": 1.5397, "step": 932 }, { "epoch": 0.19364881693648817, "grad_norm": 0.7926036579526063, "learning_rate": 3.6770496875060525e-07, "loss": 1.5205, "step": 933 }, { "epoch": 0.19385637193856373, "grad_norm": 0.77602026900396, "learning_rate": 3.6763785744933614e-07, "loss": 1.5172, "step": 934 }, { "epoch": 0.19406392694063926, "grad_norm": 0.7133631793500962, "learning_rate": 3.6757068337636955e-07, "loss": 1.5946, "step": 935 }, { "epoch": 0.19427148194271482, "grad_norm": 0.8588582311253812, "learning_rate": 3.6750344656026617e-07, "loss": 1.56, "step": 936 }, { "epoch": 0.19447903694479038, "grad_norm": 1.525273574937671, "learning_rate": 3.6743614702961334e-07, "loss": 1.4686, "step": 937 }, { "epoch": 0.1946865919468659, "grad_norm": 0.7366300680010816, "learning_rate": 3.673687848130249e-07, "loss": 1.5038, "step": 938 }, { "epoch": 0.19489414694894147, "grad_norm": 2.1051036264314247, "learning_rate": 3.673013599391417e-07, "loss": 1.5599, "step": 939 }, { "epoch": 0.19510170195101703, "grad_norm": 0.6794846997394501, "learning_rate": 3.672338724366308e-07, "loss": 1.4862, "step": 940 }, { "epoch": 0.19530925695309256, "grad_norm": 0.8128017325234509, "learning_rate": 3.6716632233418623e-07, "loss": 1.5856, "step": 941 }, { "epoch": 0.19551681195516812, "grad_norm": 0.7310885949394232, "learning_rate": 3.6709870966052844e-07, "loss": 1.4743, "step": 942 }, { "epoch": 0.19572436695724366, "grad_norm": 0.8202315102887064, "learning_rate": 3.6703103444440453e-07, "loss": 1.4866, "step": 943 }, { "epoch": 0.19593192195931922, "grad_norm": 0.8378121328849348, "learning_rate": 3.6696329671458827e-07, "loss": 1.5544, "step": 944 }, { "epoch": 0.19613947696139478, "grad_norm": 2.4985488718347386, "learning_rate": 3.6689549649987983e-07, "loss": 1.5403, "step": 945 }, { "epoch": 0.1963470319634703, "grad_norm": 0.751146122439667, "learning_rate": 3.668276338291062e-07, "loss": 1.5288, "step": 946 }, { "epoch": 0.19655458696554587, "grad_norm": 0.7436847988145037, "learning_rate": 3.6675970873112065e-07, "loss": 1.5191, "step": 947 }, { "epoch": 0.19676214196762143, "grad_norm": 0.7199080583641725, "learning_rate": 3.6669172123480326e-07, "loss": 1.5131, "step": 948 }, { "epoch": 0.19696969696969696, "grad_norm": 0.7160545360517595, "learning_rate": 3.666236713690604e-07, "loss": 1.5916, "step": 949 }, { "epoch": 0.19717725197177252, "grad_norm": 0.7501750418287525, "learning_rate": 3.6655555916282515e-07, "loss": 1.5683, "step": 950 }, { "epoch": 0.19738480697384808, "grad_norm": 2.8370077003246053, "learning_rate": 3.6648738464505697e-07, "loss": 1.4981, "step": 951 }, { "epoch": 0.1975923619759236, "grad_norm": 0.6644047637169577, "learning_rate": 3.664191478447418e-07, "loss": 1.4997, "step": 952 }, { "epoch": 0.19779991697799917, "grad_norm": 0.7954094260949119, "learning_rate": 3.6635084879089224e-07, "loss": 1.5484, "step": 953 }, { "epoch": 0.1980074719800747, "grad_norm": 0.7248770408981893, "learning_rate": 3.662824875125471e-07, "loss": 1.5204, "step": 954 }, { "epoch": 0.19821502698215027, "grad_norm": 0.7250852204620137, "learning_rate": 3.662140640387719e-07, "loss": 1.4125, "step": 955 }, { "epoch": 0.19842258198422583, "grad_norm": 3.1496163510577446, "learning_rate": 3.661455783986584e-07, "loss": 1.5167, "step": 956 }, { "epoch": 0.19863013698630136, "grad_norm": 1.2661657661674592, "learning_rate": 3.6607703062132496e-07, "loss": 1.5511, "step": 957 }, { "epoch": 0.19883769198837692, "grad_norm": 0.6500819347470116, "learning_rate": 3.660084207359162e-07, "loss": 1.5676, "step": 958 }, { "epoch": 0.19904524699045248, "grad_norm": 1.3592590933844422, "learning_rate": 3.659397487716032e-07, "loss": 1.5252, "step": 959 }, { "epoch": 0.199252801992528, "grad_norm": 2.669712555389213, "learning_rate": 3.658710147575836e-07, "loss": 1.5638, "step": 960 }, { "epoch": 0.19946035699460357, "grad_norm": 1.3276804343738915, "learning_rate": 3.6580221872308117e-07, "loss": 1.4969, "step": 961 }, { "epoch": 0.19966791199667913, "grad_norm": 1.332614242418297, "learning_rate": 3.6573336069734607e-07, "loss": 1.5383, "step": 962 }, { "epoch": 0.19987546699875466, "grad_norm": 0.6914687241593803, "learning_rate": 3.656644407096551e-07, "loss": 1.6373, "step": 963 }, { "epoch": 0.20008302200083022, "grad_norm": 0.7055726189531973, "learning_rate": 3.65595458789311e-07, "loss": 1.49, "step": 964 }, { "epoch": 0.20029057700290576, "grad_norm": 1.0737501275797365, "learning_rate": 3.655264149656432e-07, "loss": 1.4932, "step": 965 }, { "epoch": 0.20049813200498132, "grad_norm": 0.8143517884994579, "learning_rate": 3.6545730926800734e-07, "loss": 1.5204, "step": 966 }, { "epoch": 0.20070568700705688, "grad_norm": 0.7068450228989227, "learning_rate": 3.653881417257852e-07, "loss": 1.4741, "step": 967 }, { "epoch": 0.2009132420091324, "grad_norm": 0.8868365348832494, "learning_rate": 3.65318912368385e-07, "loss": 1.5691, "step": 968 }, { "epoch": 0.20112079701120797, "grad_norm": 1.1549741126915505, "learning_rate": 3.6524962122524133e-07, "loss": 1.5953, "step": 969 }, { "epoch": 0.20132835201328353, "grad_norm": 0.7458480493247286, "learning_rate": 3.6518026832581483e-07, "loss": 1.533, "step": 970 }, { "epoch": 0.20153590701535906, "grad_norm": 0.7608734275214517, "learning_rate": 3.6511085369959256e-07, "loss": 1.4789, "step": 971 }, { "epoch": 0.20174346201743462, "grad_norm": 0.7324264738249613, "learning_rate": 3.650413773760878e-07, "loss": 1.5215, "step": 972 }, { "epoch": 0.20195101701951018, "grad_norm": 0.6115945082172204, "learning_rate": 3.6497183938484e-07, "loss": 1.4563, "step": 973 }, { "epoch": 0.20215857202158571, "grad_norm": 0.9220845304511379, "learning_rate": 3.6490223975541486e-07, "loss": 1.5536, "step": 974 }, { "epoch": 0.20236612702366127, "grad_norm": 0.6752012126198906, "learning_rate": 3.648325785174043e-07, "loss": 1.5684, "step": 975 }, { "epoch": 0.2025736820257368, "grad_norm": 0.7300003639798002, "learning_rate": 3.647628557004265e-07, "loss": 1.4659, "step": 976 }, { "epoch": 0.20278123702781237, "grad_norm": 1.1357719371195425, "learning_rate": 3.6469307133412563e-07, "loss": 1.4672, "step": 977 }, { "epoch": 0.20298879202988793, "grad_norm": 0.7413194904020192, "learning_rate": 3.646232254481722e-07, "loss": 1.5692, "step": 978 }, { "epoch": 0.20319634703196346, "grad_norm": 0.7805517462648436, "learning_rate": 3.645533180722629e-07, "loss": 1.6109, "step": 979 }, { "epoch": 0.20340390203403902, "grad_norm": 0.7060715170778643, "learning_rate": 3.644833492361204e-07, "loss": 1.5008, "step": 980 }, { "epoch": 0.20361145703611458, "grad_norm": 0.8046242189341722, "learning_rate": 3.6441331896949357e-07, "loss": 1.5687, "step": 981 }, { "epoch": 0.2038190120381901, "grad_norm": 0.7023078197237743, "learning_rate": 3.643432273021575e-07, "loss": 1.6225, "step": 982 }, { "epoch": 0.20402656704026567, "grad_norm": 0.9867549783489455, "learning_rate": 3.6427307426391334e-07, "loss": 1.5635, "step": 983 }, { "epoch": 0.20423412204234123, "grad_norm": 1.1974338432887788, "learning_rate": 3.642028598845882e-07, "loss": 1.5652, "step": 984 }, { "epoch": 0.20444167704441676, "grad_norm": 0.9192244124163903, "learning_rate": 3.6413258419403536e-07, "loss": 1.5241, "step": 985 }, { "epoch": 0.20464923204649232, "grad_norm": 0.7081790678194302, "learning_rate": 3.640622472221342e-07, "loss": 1.5447, "step": 986 }, { "epoch": 0.20485678704856788, "grad_norm": 1.1077921292682111, "learning_rate": 3.6399184899879023e-07, "loss": 1.5354, "step": 987 }, { "epoch": 0.20506434205064342, "grad_norm": 1.4672286013569629, "learning_rate": 3.639213895539349e-07, "loss": 1.4619, "step": 988 }, { "epoch": 0.20527189705271898, "grad_norm": 0.804495510022731, "learning_rate": 3.6385086891752546e-07, "loss": 1.498, "step": 989 }, { "epoch": 0.2054794520547945, "grad_norm": 0.7088277940209514, "learning_rate": 3.6378028711954565e-07, "loss": 1.538, "step": 990 }, { "epoch": 0.20568700705687007, "grad_norm": 0.9804258263528919, "learning_rate": 3.637096441900049e-07, "loss": 1.5722, "step": 991 }, { "epoch": 0.20589456205894563, "grad_norm": 0.915570671843891, "learning_rate": 3.6363894015893876e-07, "loss": 1.507, "step": 992 }, { "epoch": 0.20610211706102116, "grad_norm": 0.7401175250292767, "learning_rate": 3.6356817505640865e-07, "loss": 1.5251, "step": 993 }, { "epoch": 0.20630967206309672, "grad_norm": 0.7839081841918215, "learning_rate": 3.63497348912502e-07, "loss": 1.5706, "step": 994 }, { "epoch": 0.20651722706517228, "grad_norm": 0.923237808745583, "learning_rate": 3.6342646175733226e-07, "loss": 1.4727, "step": 995 }, { "epoch": 0.20672478206724781, "grad_norm": 0.7647205333375906, "learning_rate": 3.633555136210387e-07, "loss": 1.5317, "step": 996 }, { "epoch": 0.20693233706932337, "grad_norm": 0.7055991007802039, "learning_rate": 3.632845045337866e-07, "loss": 1.5603, "step": 997 }, { "epoch": 0.20713989207139893, "grad_norm": 0.6390969879755061, "learning_rate": 3.6321343452576716e-07, "loss": 1.5589, "step": 998 }, { "epoch": 0.20734744707347447, "grad_norm": 0.7282093784272053, "learning_rate": 3.631423036271975e-07, "loss": 1.6313, "step": 999 }, { "epoch": 0.20755500207555003, "grad_norm": 1.475766775717144, "learning_rate": 3.6307111186832057e-07, "loss": 1.5253, "step": 1000 }, { "epoch": 0.20776255707762556, "grad_norm": 0.9252510632177006, "learning_rate": 3.6299985927940517e-07, "loss": 1.4318, "step": 1001 }, { "epoch": 0.20797011207970112, "grad_norm": 0.7737218048296447, "learning_rate": 3.6292854589074604e-07, "loss": 1.6498, "step": 1002 }, { "epoch": 0.20817766708177668, "grad_norm": 1.0312417872879511, "learning_rate": 3.6285717173266377e-07, "loss": 1.5135, "step": 1003 }, { "epoch": 0.2083852220838522, "grad_norm": 0.9303580669090593, "learning_rate": 3.6278573683550464e-07, "loss": 1.461, "step": 1004 }, { "epoch": 0.20859277708592777, "grad_norm": 0.8987816836485442, "learning_rate": 3.62714241229641e-07, "loss": 1.5267, "step": 1005 }, { "epoch": 0.20880033208800333, "grad_norm": 1.0573161719993172, "learning_rate": 3.626426849454708e-07, "loss": 1.4593, "step": 1006 }, { "epoch": 0.20900788709007886, "grad_norm": 0.882982330954709, "learning_rate": 3.6257106801341796e-07, "loss": 1.5643, "step": 1007 }, { "epoch": 0.20921544209215442, "grad_norm": 0.6274525117755215, "learning_rate": 3.62499390463932e-07, "loss": 1.6032, "step": 1008 }, { "epoch": 0.20942299709422998, "grad_norm": 1.0591429128905, "learning_rate": 3.6242765232748835e-07, "loss": 1.5049, "step": 1009 }, { "epoch": 0.20963055209630552, "grad_norm": 0.72923807888404, "learning_rate": 3.6235585363458826e-07, "loss": 1.4759, "step": 1010 }, { "epoch": 0.20983810709838108, "grad_norm": 0.786796182824585, "learning_rate": 3.6228399441575847e-07, "loss": 1.4956, "step": 1011 }, { "epoch": 0.2100456621004566, "grad_norm": 0.726427879589452, "learning_rate": 3.622120747015517e-07, "loss": 1.5475, "step": 1012 }, { "epoch": 0.21025321710253217, "grad_norm": 0.692097382892561, "learning_rate": 3.621400945225463e-07, "loss": 1.5879, "step": 1013 }, { "epoch": 0.21046077210460773, "grad_norm": 0.8368173235338542, "learning_rate": 3.620680539093463e-07, "loss": 1.5438, "step": 1014 }, { "epoch": 0.21066832710668326, "grad_norm": 0.7655157283855119, "learning_rate": 3.619959528925814e-07, "loss": 1.5525, "step": 1015 }, { "epoch": 0.21087588210875882, "grad_norm": 0.9930040953786426, "learning_rate": 3.619237915029072e-07, "loss": 1.5809, "step": 1016 }, { "epoch": 0.21108343711083438, "grad_norm": 1.2270945411752896, "learning_rate": 3.6185156977100465e-07, "loss": 1.6398, "step": 1017 }, { "epoch": 0.21129099211290991, "grad_norm": 0.7786878778901891, "learning_rate": 3.617792877275806e-07, "loss": 1.5092, "step": 1018 }, { "epoch": 0.21149854711498547, "grad_norm": 0.760819941879524, "learning_rate": 3.617069454033675e-07, "loss": 1.5585, "step": 1019 }, { "epoch": 0.21170610211706103, "grad_norm": 0.9630157097008851, "learning_rate": 3.616345428291232e-07, "loss": 1.6263, "step": 1020 }, { "epoch": 0.21191365711913657, "grad_norm": 1.7057018584263368, "learning_rate": 3.6156208003563154e-07, "loss": 1.4996, "step": 1021 }, { "epoch": 0.21212121212121213, "grad_norm": 0.7251056277062933, "learning_rate": 3.614895570537017e-07, "loss": 1.5644, "step": 1022 }, { "epoch": 0.21232876712328766, "grad_norm": 1.1121933879502364, "learning_rate": 3.6141697391416856e-07, "loss": 1.4901, "step": 1023 }, { "epoch": 0.21253632212536322, "grad_norm": 0.9028882682012563, "learning_rate": 3.613443306478925e-07, "loss": 1.5683, "step": 1024 }, { "epoch": 0.21274387712743878, "grad_norm": 0.6827236143260521, "learning_rate": 3.6127162728575957e-07, "loss": 1.5061, "step": 1025 }, { "epoch": 0.2129514321295143, "grad_norm": 0.8541047091180142, "learning_rate": 3.6119886385868117e-07, "loss": 1.4914, "step": 1026 }, { "epoch": 0.21315898713158987, "grad_norm": 0.8808583449154139, "learning_rate": 3.6112604039759455e-07, "loss": 1.54, "step": 1027 }, { "epoch": 0.21336654213366543, "grad_norm": 0.673300656333197, "learning_rate": 3.610531569334622e-07, "loss": 1.4505, "step": 1028 }, { "epoch": 0.21357409713574096, "grad_norm": 0.7648703777499639, "learning_rate": 3.6098021349727226e-07, "loss": 1.5625, "step": 1029 }, { "epoch": 0.21378165213781652, "grad_norm": 0.9545381270153108, "learning_rate": 3.609072101200384e-07, "loss": 1.5664, "step": 1030 }, { "epoch": 0.21398920713989208, "grad_norm": 0.9043053771299132, "learning_rate": 3.6083414683279966e-07, "loss": 1.472, "step": 1031 }, { "epoch": 0.21419676214196762, "grad_norm": 0.7919651328353591, "learning_rate": 3.6076102366662056e-07, "loss": 1.585, "step": 1032 }, { "epoch": 0.21440431714404318, "grad_norm": 0.6758118295057567, "learning_rate": 3.606878406525913e-07, "loss": 1.4898, "step": 1033 }, { "epoch": 0.2146118721461187, "grad_norm": 0.8554468899246301, "learning_rate": 3.6061459782182715e-07, "loss": 1.5818, "step": 1034 }, { "epoch": 0.21481942714819427, "grad_norm": 1.1528963523575075, "learning_rate": 3.6054129520546913e-07, "loss": 1.6178, "step": 1035 }, { "epoch": 0.21502698215026983, "grad_norm": 0.7750790035936163, "learning_rate": 3.604679328346836e-07, "loss": 1.5743, "step": 1036 }, { "epoch": 0.21523453715234536, "grad_norm": 0.7969765912929939, "learning_rate": 3.6039451074066214e-07, "loss": 1.5569, "step": 1037 }, { "epoch": 0.21544209215442092, "grad_norm": 2.2800106728768097, "learning_rate": 3.60321028954622e-07, "loss": 1.4997, "step": 1038 }, { "epoch": 0.21564964715649648, "grad_norm": 0.6253355536777663, "learning_rate": 3.602474875078058e-07, "loss": 1.5601, "step": 1039 }, { "epoch": 0.21585720215857201, "grad_norm": 0.7061438907606232, "learning_rate": 3.6017388643148117e-07, "loss": 1.5718, "step": 1040 }, { "epoch": 0.21606475716064757, "grad_norm": 1.3897658753454325, "learning_rate": 3.601002257569414e-07, "loss": 1.5356, "step": 1041 }, { "epoch": 0.21627231216272313, "grad_norm": 0.7199812896850963, "learning_rate": 3.6002650551550515e-07, "loss": 1.5841, "step": 1042 }, { "epoch": 0.21647986716479867, "grad_norm": 0.6314436721875729, "learning_rate": 3.599527257385162e-07, "loss": 1.5291, "step": 1043 }, { "epoch": 0.21668742216687423, "grad_norm": 0.8129927224059432, "learning_rate": 3.5987888645734385e-07, "loss": 1.6218, "step": 1044 }, { "epoch": 0.21689497716894976, "grad_norm": 2.537115508337458, "learning_rate": 3.598049877033825e-07, "loss": 1.5163, "step": 1045 }, { "epoch": 0.21710253217102532, "grad_norm": 1.1715340746389704, "learning_rate": 3.59731029508052e-07, "loss": 1.5736, "step": 1046 }, { "epoch": 0.21731008717310088, "grad_norm": 0.8132969706930291, "learning_rate": 3.596570119027974e-07, "loss": 1.526, "step": 1047 }, { "epoch": 0.2175176421751764, "grad_norm": 0.8127787669924381, "learning_rate": 3.59582934919089e-07, "loss": 1.6048, "step": 1048 }, { "epoch": 0.21772519717725197, "grad_norm": 0.8264637428695621, "learning_rate": 3.5950879858842246e-07, "loss": 1.5814, "step": 1049 }, { "epoch": 0.21793275217932753, "grad_norm": 0.7115903203561558, "learning_rate": 3.594346029423184e-07, "loss": 1.5222, "step": 1050 }, { "epoch": 0.21814030718140306, "grad_norm": 0.8055282482632105, "learning_rate": 3.5936034801232306e-07, "loss": 1.4447, "step": 1051 }, { "epoch": 0.21834786218347862, "grad_norm": 0.762652301141905, "learning_rate": 3.592860338300075e-07, "loss": 1.5596, "step": 1052 }, { "epoch": 0.21855541718555418, "grad_norm": 0.8503416758302333, "learning_rate": 3.592116604269682e-07, "loss": 1.4316, "step": 1053 }, { "epoch": 0.21876297218762972, "grad_norm": 0.9139415303597738, "learning_rate": 3.5913722783482675e-07, "loss": 1.5539, "step": 1054 }, { "epoch": 0.21897052718970528, "grad_norm": 1.0700224764690445, "learning_rate": 3.5906273608522984e-07, "loss": 1.5497, "step": 1055 }, { "epoch": 0.2191780821917808, "grad_norm": 0.8810454182499513, "learning_rate": 3.5898818520984955e-07, "loss": 1.5458, "step": 1056 }, { "epoch": 0.21938563719385637, "grad_norm": 0.6512086563955461, "learning_rate": 3.589135752403828e-07, "loss": 1.4351, "step": 1057 }, { "epoch": 0.21959319219593193, "grad_norm": 0.8475784004616901, "learning_rate": 3.588389062085518e-07, "loss": 1.6219, "step": 1058 }, { "epoch": 0.21980074719800746, "grad_norm": 0.6729858419411026, "learning_rate": 3.5876417814610385e-07, "loss": 1.5021, "step": 1059 }, { "epoch": 0.22000830220008302, "grad_norm": 1.0081611937448638, "learning_rate": 3.5868939108481135e-07, "loss": 1.5535, "step": 1060 }, { "epoch": 0.22021585720215858, "grad_norm": 0.8878982715761973, "learning_rate": 3.586145450564717e-07, "loss": 1.5483, "step": 1061 }, { "epoch": 0.22042341220423411, "grad_norm": 0.6796615513726271, "learning_rate": 3.5853964009290755e-07, "loss": 1.508, "step": 1062 }, { "epoch": 0.22063096720630967, "grad_norm": 0.7574182326379953, "learning_rate": 3.584646762259664e-07, "loss": 1.6801, "step": 1063 }, { "epoch": 0.22083852220838524, "grad_norm": 0.9577551078700357, "learning_rate": 3.58389653487521e-07, "loss": 1.5759, "step": 1064 }, { "epoch": 0.22104607721046077, "grad_norm": 0.6856193490323519, "learning_rate": 3.5831457190946896e-07, "loss": 1.5297, "step": 1065 }, { "epoch": 0.22125363221253633, "grad_norm": 0.9724950740515332, "learning_rate": 3.582394315237329e-07, "loss": 1.6117, "step": 1066 }, { "epoch": 0.22146118721461186, "grad_norm": 1.0497306707331313, "learning_rate": 3.581642323622607e-07, "loss": 1.5273, "step": 1067 }, { "epoch": 0.22166874221668742, "grad_norm": 0.6898577089872816, "learning_rate": 3.58088974457025e-07, "loss": 1.52, "step": 1068 }, { "epoch": 0.22187629721876298, "grad_norm": 0.7527731255815604, "learning_rate": 3.580136578400233e-07, "loss": 1.5127, "step": 1069 }, { "epoch": 0.2220838522208385, "grad_norm": 0.6751514202057007, "learning_rate": 3.579382825432784e-07, "loss": 1.5542, "step": 1070 }, { "epoch": 0.22229140722291407, "grad_norm": 1.5318541751033339, "learning_rate": 3.578628485988378e-07, "loss": 1.6134, "step": 1071 }, { "epoch": 0.22249896222498963, "grad_norm": 1.023045302434103, "learning_rate": 3.57787356038774e-07, "loss": 1.5041, "step": 1072 }, { "epoch": 0.22270651722706516, "grad_norm": 1.082469823241705, "learning_rate": 3.5771180489518457e-07, "loss": 1.5004, "step": 1073 }, { "epoch": 0.22291407222914073, "grad_norm": 0.8228670000560644, "learning_rate": 3.576361952001916e-07, "loss": 1.5062, "step": 1074 }, { "epoch": 0.22312162723121629, "grad_norm": 1.0069016348680366, "learning_rate": 3.575605269859425e-07, "loss": 1.531, "step": 1075 }, { "epoch": 0.22332918223329182, "grad_norm": 0.9620336005511578, "learning_rate": 3.574848002846094e-07, "loss": 1.5252, "step": 1076 }, { "epoch": 0.22353673723536738, "grad_norm": 0.8815693906658952, "learning_rate": 3.574090151283892e-07, "loss": 1.4453, "step": 1077 }, { "epoch": 0.2237442922374429, "grad_norm": 2.2965043054358794, "learning_rate": 3.5733317154950384e-07, "loss": 1.4871, "step": 1078 }, { "epoch": 0.22395184723951847, "grad_norm": 0.7449502536445629, "learning_rate": 3.5725726958019984e-07, "loss": 1.606, "step": 1079 }, { "epoch": 0.22415940224159403, "grad_norm": 1.0794989441493348, "learning_rate": 3.5718130925274876e-07, "loss": 1.5439, "step": 1080 }, { "epoch": 0.22436695724366956, "grad_norm": 0.7358056517263604, "learning_rate": 3.5710529059944703e-07, "loss": 1.5326, "step": 1081 }, { "epoch": 0.22457451224574512, "grad_norm": 3.387124782423694, "learning_rate": 3.570292136526156e-07, "loss": 1.4424, "step": 1082 }, { "epoch": 0.22478206724782068, "grad_norm": 0.981958200213979, "learning_rate": 3.5695307844460047e-07, "loss": 1.5692, "step": 1083 }, { "epoch": 0.22498962224989622, "grad_norm": 0.7052055577090767, "learning_rate": 3.568768850077723e-07, "loss": 1.5457, "step": 1084 }, { "epoch": 0.22519717725197178, "grad_norm": 0.832611158573069, "learning_rate": 3.5680063337452656e-07, "loss": 1.5511, "step": 1085 }, { "epoch": 0.22540473225404734, "grad_norm": 0.7052347028988847, "learning_rate": 3.5672432357728323e-07, "loss": 1.5436, "step": 1086 }, { "epoch": 0.22561228725612287, "grad_norm": 0.6933559865988063, "learning_rate": 3.5664795564848747e-07, "loss": 1.5242, "step": 1087 }, { "epoch": 0.22581984225819843, "grad_norm": 1.2194582774699534, "learning_rate": 3.565715296206086e-07, "loss": 1.4851, "step": 1088 }, { "epoch": 0.22602739726027396, "grad_norm": 0.6719228601750747, "learning_rate": 3.5649504552614126e-07, "loss": 1.5395, "step": 1089 }, { "epoch": 0.22623495226234952, "grad_norm": 0.7326097834920743, "learning_rate": 3.5641850339760423e-07, "loss": 1.5297, "step": 1090 }, { "epoch": 0.22644250726442508, "grad_norm": 0.7912133563872042, "learning_rate": 3.563419032675413e-07, "loss": 1.4655, "step": 1091 }, { "epoch": 0.2266500622665006, "grad_norm": 1.0059830551169904, "learning_rate": 3.562652451685207e-07, "loss": 1.5194, "step": 1092 }, { "epoch": 0.22685761726857617, "grad_norm": 2.3626578532189755, "learning_rate": 3.5618852913313555e-07, "loss": 1.5309, "step": 1093 }, { "epoch": 0.22706517227065173, "grad_norm": 0.7136902498245019, "learning_rate": 3.5611175519400336e-07, "loss": 1.5376, "step": 1094 }, { "epoch": 0.22727272727272727, "grad_norm": 1.0857788038310587, "learning_rate": 3.560349233837665e-07, "loss": 1.5435, "step": 1095 }, { "epoch": 0.22748028227480283, "grad_norm": 1.0249268271304817, "learning_rate": 3.559580337350917e-07, "loss": 1.5516, "step": 1096 }, { "epoch": 0.22768783727687839, "grad_norm": 0.74967194236441, "learning_rate": 3.558810862806704e-07, "loss": 1.5891, "step": 1097 }, { "epoch": 0.22789539227895392, "grad_norm": 0.6840225869912029, "learning_rate": 3.558040810532187e-07, "loss": 1.5944, "step": 1098 }, { "epoch": 0.22810294728102948, "grad_norm": 1.186527241920316, "learning_rate": 3.557270180854771e-07, "loss": 1.4723, "step": 1099 }, { "epoch": 0.228310502283105, "grad_norm": 3.081113774950376, "learning_rate": 3.556498974102108e-07, "loss": 1.5258, "step": 1100 }, { "epoch": 0.22851805728518057, "grad_norm": 0.6837190802544153, "learning_rate": 3.555727190602094e-07, "loss": 1.5978, "step": 1101 }, { "epoch": 0.22872561228725613, "grad_norm": 0.7006446109094233, "learning_rate": 3.5549548306828705e-07, "loss": 1.5107, "step": 1102 }, { "epoch": 0.22893316728933166, "grad_norm": 1.142427625081276, "learning_rate": 3.5541818946728254e-07, "loss": 1.4936, "step": 1103 }, { "epoch": 0.22914072229140722, "grad_norm": 0.9869752524805545, "learning_rate": 3.553408382900591e-07, "loss": 1.4953, "step": 1104 }, { "epoch": 0.22934827729348278, "grad_norm": 0.8679464248422917, "learning_rate": 3.552634295695042e-07, "loss": 1.5196, "step": 1105 }, { "epoch": 0.22955583229555832, "grad_norm": 0.8451704782978928, "learning_rate": 3.551859633385301e-07, "loss": 1.4815, "step": 1106 }, { "epoch": 0.22976338729763388, "grad_norm": 0.819531145071641, "learning_rate": 3.551084396300734e-07, "loss": 1.4952, "step": 1107 }, { "epoch": 0.22997094229970944, "grad_norm": 0.6780431844481092, "learning_rate": 3.5503085847709513e-07, "loss": 1.5472, "step": 1108 }, { "epoch": 0.23017849730178497, "grad_norm": 0.8607147218439937, "learning_rate": 3.5495321991258066e-07, "loss": 1.6053, "step": 1109 }, { "epoch": 0.23038605230386053, "grad_norm": 0.920641234673832, "learning_rate": 3.548755239695399e-07, "loss": 1.5012, "step": 1110 }, { "epoch": 0.23059360730593606, "grad_norm": 1.1636507847521098, "learning_rate": 3.547977706810071e-07, "loss": 1.535, "step": 1111 }, { "epoch": 0.23080116230801162, "grad_norm": 0.7499729548501789, "learning_rate": 3.5471996008004086e-07, "loss": 1.5131, "step": 1112 }, { "epoch": 0.23100871731008718, "grad_norm": 0.8270941212869587, "learning_rate": 3.5464209219972425e-07, "loss": 1.5212, "step": 1113 }, { "epoch": 0.2312162723121627, "grad_norm": 0.7444398963804137, "learning_rate": 3.545641670731645e-07, "loss": 1.5278, "step": 1114 }, { "epoch": 0.23142382731423827, "grad_norm": 0.7729624006333331, "learning_rate": 3.5448618473349344e-07, "loss": 1.5791, "step": 1115 }, { "epoch": 0.23163138231631383, "grad_norm": 0.7844818799700299, "learning_rate": 3.5440814521386703e-07, "loss": 1.5555, "step": 1116 }, { "epoch": 0.23183893731838937, "grad_norm": 0.889674195599544, "learning_rate": 3.543300485474656e-07, "loss": 1.6032, "step": 1117 }, { "epoch": 0.23204649232046493, "grad_norm": 0.9318115382968022, "learning_rate": 3.542518947674938e-07, "loss": 1.4618, "step": 1118 }, { "epoch": 0.23225404732254049, "grad_norm": 0.788639798112483, "learning_rate": 3.541736839071805e-07, "loss": 1.5614, "step": 1119 }, { "epoch": 0.23246160232461602, "grad_norm": 1.047236924867649, "learning_rate": 3.5409541599977895e-07, "loss": 1.5627, "step": 1120 }, { "epoch": 0.23266915732669158, "grad_norm": 0.7247750234226287, "learning_rate": 3.540170910785665e-07, "loss": 1.5887, "step": 1121 }, { "epoch": 0.2328767123287671, "grad_norm": 0.7439214209403497, "learning_rate": 3.539387091768449e-07, "loss": 1.4641, "step": 1122 }, { "epoch": 0.23308426733084267, "grad_norm": 5.678155893947435, "learning_rate": 3.538602703279401e-07, "loss": 1.5128, "step": 1123 }, { "epoch": 0.23329182233291823, "grad_norm": 1.1120363709533163, "learning_rate": 3.53781774565202e-07, "loss": 1.5181, "step": 1124 }, { "epoch": 0.23349937733499376, "grad_norm": 0.7045229084175086, "learning_rate": 3.5370322192200514e-07, "loss": 1.5184, "step": 1125 }, { "epoch": 0.23370693233706932, "grad_norm": 0.7956027699305732, "learning_rate": 3.5362461243174795e-07, "loss": 1.5407, "step": 1126 }, { "epoch": 0.23391448733914488, "grad_norm": 0.7025158285313601, "learning_rate": 3.5354594612785306e-07, "loss": 1.4801, "step": 1127 }, { "epoch": 0.23412204234122042, "grad_norm": 0.7680847312865177, "learning_rate": 3.5346722304376734e-07, "loss": 1.5415, "step": 1128 }, { "epoch": 0.23432959734329598, "grad_norm": 0.9389118848809208, "learning_rate": 3.5338844321296174e-07, "loss": 1.5022, "step": 1129 }, { "epoch": 0.23453715234537154, "grad_norm": 1.7567025870509037, "learning_rate": 3.533096066689313e-07, "loss": 1.5667, "step": 1130 }, { "epoch": 0.23474470734744707, "grad_norm": 0.8307813913763255, "learning_rate": 3.5323071344519526e-07, "loss": 1.5235, "step": 1131 }, { "epoch": 0.23495226234952263, "grad_norm": 0.6933498374453867, "learning_rate": 3.5315176357529705e-07, "loss": 1.6025, "step": 1132 }, { "epoch": 0.23515981735159816, "grad_norm": 0.8281707935692568, "learning_rate": 3.5307275709280386e-07, "loss": 1.5123, "step": 1133 }, { "epoch": 0.23536737235367372, "grad_norm": 0.7713002853216047, "learning_rate": 3.529936940313073e-07, "loss": 1.4931, "step": 1134 }, { "epoch": 0.23557492735574928, "grad_norm": 0.8616488564890379, "learning_rate": 3.529145744244227e-07, "loss": 1.5537, "step": 1135 }, { "epoch": 0.2357824823578248, "grad_norm": 2.080763493759813, "learning_rate": 3.5283539830578986e-07, "loss": 1.4982, "step": 1136 }, { "epoch": 0.23599003735990037, "grad_norm": 1.9578650706350422, "learning_rate": 3.527561657090722e-07, "loss": 1.5173, "step": 1137 }, { "epoch": 0.23619759236197593, "grad_norm": 1.834435107650694, "learning_rate": 3.526768766679573e-07, "loss": 1.5602, "step": 1138 }, { "epoch": 0.23640514736405147, "grad_norm": 0.934201369962471, "learning_rate": 3.525975312161569e-07, "loss": 1.4968, "step": 1139 }, { "epoch": 0.23661270236612703, "grad_norm": 1.1060332806062492, "learning_rate": 3.525181293874064e-07, "loss": 1.5265, "step": 1140 }, { "epoch": 0.23682025736820259, "grad_norm": 0.7449954987756889, "learning_rate": 3.5243867121546554e-07, "loss": 1.5451, "step": 1141 }, { "epoch": 0.23702781237027812, "grad_norm": 0.6175282902791747, "learning_rate": 3.5235915673411765e-07, "loss": 1.5583, "step": 1142 }, { "epoch": 0.23723536737235368, "grad_norm": 0.7755460477275391, "learning_rate": 3.522795859771703e-07, "loss": 1.5564, "step": 1143 }, { "epoch": 0.2374429223744292, "grad_norm": 0.6660308330359466, "learning_rate": 3.521999589784548e-07, "loss": 1.534, "step": 1144 }, { "epoch": 0.23765047737650477, "grad_norm": 0.8255607264198397, "learning_rate": 3.521202757718264e-07, "loss": 1.4659, "step": 1145 }, { "epoch": 0.23785803237858033, "grad_norm": 0.8266755481388101, "learning_rate": 3.520405363911644e-07, "loss": 1.4976, "step": 1146 }, { "epoch": 0.23806558738065586, "grad_norm": 0.735387448014218, "learning_rate": 3.5196074087037185e-07, "loss": 1.4569, "step": 1147 }, { "epoch": 0.23827314238273142, "grad_norm": 0.8202692357684406, "learning_rate": 3.5188088924337554e-07, "loss": 1.5411, "step": 1148 }, { "epoch": 0.23848069738480698, "grad_norm": 1.1035662026438207, "learning_rate": 3.518009815441264e-07, "loss": 1.5346, "step": 1149 }, { "epoch": 0.23868825238688252, "grad_norm": 0.6996633125514471, "learning_rate": 3.51721017806599e-07, "loss": 1.536, "step": 1150 }, { "epoch": 0.23889580738895808, "grad_norm": 0.8105114216585455, "learning_rate": 3.516409980647919e-07, "loss": 1.5087, "step": 1151 }, { "epoch": 0.23910336239103364, "grad_norm": 0.9357776618975657, "learning_rate": 3.515609223527272e-07, "loss": 1.512, "step": 1152 }, { "epoch": 0.23931091739310917, "grad_norm": 0.7231486849647978, "learning_rate": 3.514807907044511e-07, "loss": 1.496, "step": 1153 }, { "epoch": 0.23951847239518473, "grad_norm": 0.9511119162522497, "learning_rate": 3.514006031540334e-07, "loss": 1.5639, "step": 1154 }, { "epoch": 0.23972602739726026, "grad_norm": 1.1355007292956965, "learning_rate": 3.5132035973556773e-07, "loss": 1.5735, "step": 1155 }, { "epoch": 0.23993358239933582, "grad_norm": 0.7332457034201849, "learning_rate": 3.512400604831715e-07, "loss": 1.4312, "step": 1156 }, { "epoch": 0.24014113740141138, "grad_norm": 0.7122281484165404, "learning_rate": 3.511597054309857e-07, "loss": 1.5995, "step": 1157 }, { "epoch": 0.2403486924034869, "grad_norm": 0.8250980426862565, "learning_rate": 3.510792946131753e-07, "loss": 1.5275, "step": 1158 }, { "epoch": 0.24055624740556247, "grad_norm": 0.9133774491873744, "learning_rate": 3.5099882806392874e-07, "loss": 1.5615, "step": 1159 }, { "epoch": 0.24076380240763803, "grad_norm": 1.0882884865806335, "learning_rate": 3.5091830581745833e-07, "loss": 1.5537, "step": 1160 }, { "epoch": 0.24097135740971357, "grad_norm": 0.8910267983356208, "learning_rate": 3.50837727908e-07, "loss": 1.482, "step": 1161 }, { "epoch": 0.24117891241178913, "grad_norm": 0.8907082108587128, "learning_rate": 3.5075709436981325e-07, "loss": 1.5999, "step": 1162 }, { "epoch": 0.24138646741386469, "grad_norm": 0.6677218222628633, "learning_rate": 3.5067640523718145e-07, "loss": 1.53, "step": 1163 }, { "epoch": 0.24159402241594022, "grad_norm": 1.792291287577258, "learning_rate": 3.5059566054441143e-07, "loss": 1.5525, "step": 1164 }, { "epoch": 0.24180157741801578, "grad_norm": 1.8506946777712572, "learning_rate": 3.5051486032583354e-07, "loss": 1.5414, "step": 1165 }, { "epoch": 0.2420091324200913, "grad_norm": 0.9293223406808399, "learning_rate": 3.504340046158021e-07, "loss": 1.5237, "step": 1166 }, { "epoch": 0.24221668742216687, "grad_norm": 0.7879397529892875, "learning_rate": 3.503530934486947e-07, "loss": 1.493, "step": 1167 }, { "epoch": 0.24242424242424243, "grad_norm": 0.8808044646451905, "learning_rate": 3.502721268589126e-07, "loss": 1.5988, "step": 1168 }, { "epoch": 0.24263179742631796, "grad_norm": 0.8643682076342636, "learning_rate": 3.501911048808808e-07, "loss": 1.5408, "step": 1169 }, { "epoch": 0.24283935242839352, "grad_norm": 0.7270991475676218, "learning_rate": 3.5011002754904744e-07, "loss": 1.5179, "step": 1170 }, { "epoch": 0.24304690743046908, "grad_norm": 3.0668902138950065, "learning_rate": 3.5002889489788465e-07, "loss": 1.554, "step": 1171 }, { "epoch": 0.24325446243254462, "grad_norm": 0.7705644809816546, "learning_rate": 3.499477069618879e-07, "loss": 1.5145, "step": 1172 }, { "epoch": 0.24346201743462018, "grad_norm": 0.7605964639763112, "learning_rate": 3.4986646377557595e-07, "loss": 1.514, "step": 1173 }, { "epoch": 0.24366957243669574, "grad_norm": 0.9229541121161166, "learning_rate": 3.4978516537349144e-07, "loss": 1.4982, "step": 1174 }, { "epoch": 0.24387712743877127, "grad_norm": 0.7433458249266687, "learning_rate": 3.497038117902002e-07, "loss": 1.3846, "step": 1175 }, { "epoch": 0.24408468244084683, "grad_norm": 2.502884423063433, "learning_rate": 3.4962240306029155e-07, "loss": 1.4937, "step": 1176 }, { "epoch": 0.24429223744292236, "grad_norm": 0.650115175962014, "learning_rate": 3.4954093921837844e-07, "loss": 1.5192, "step": 1177 }, { "epoch": 0.24449979244499792, "grad_norm": 0.8937346709079073, "learning_rate": 3.494594202990971e-07, "loss": 1.5943, "step": 1178 }, { "epoch": 0.24470734744707348, "grad_norm": 0.7765759918713224, "learning_rate": 3.493778463371073e-07, "loss": 1.5569, "step": 1179 }, { "epoch": 0.244914902449149, "grad_norm": 0.7078389298835821, "learning_rate": 3.492962173670919e-07, "loss": 1.531, "step": 1180 }, { "epoch": 0.24512245745122457, "grad_norm": 0.9731015621045409, "learning_rate": 3.4921453342375753e-07, "loss": 1.5447, "step": 1181 }, { "epoch": 0.24533001245330013, "grad_norm": 0.9026889670118812, "learning_rate": 3.4913279454183393e-07, "loss": 1.5436, "step": 1182 }, { "epoch": 0.24553756745537567, "grad_norm": 2.118697915346356, "learning_rate": 3.490510007560745e-07, "loss": 1.5048, "step": 1183 }, { "epoch": 0.24574512245745123, "grad_norm": 0.7595503866912212, "learning_rate": 3.4896915210125556e-07, "loss": 1.5406, "step": 1184 }, { "epoch": 0.2459526774595268, "grad_norm": 0.7273656000166633, "learning_rate": 3.4888724861217715e-07, "loss": 1.5175, "step": 1185 }, { "epoch": 0.24616023246160232, "grad_norm": 1.087952926147746, "learning_rate": 3.488052903236624e-07, "loss": 1.5207, "step": 1186 }, { "epoch": 0.24636778746367788, "grad_norm": 0.7850053086727021, "learning_rate": 3.4872327727055767e-07, "loss": 1.4612, "step": 1187 }, { "epoch": 0.2465753424657534, "grad_norm": 0.672850110925625, "learning_rate": 3.48641209487733e-07, "loss": 1.5236, "step": 1188 }, { "epoch": 0.24678289746782897, "grad_norm": 0.946695807598679, "learning_rate": 3.485590870100812e-07, "loss": 1.51, "step": 1189 }, { "epoch": 0.24699045246990453, "grad_norm": 0.8087733984327321, "learning_rate": 3.484769098725186e-07, "loss": 1.4873, "step": 1190 }, { "epoch": 0.24719800747198006, "grad_norm": 0.6202165210617465, "learning_rate": 3.483946781099849e-07, "loss": 1.6002, "step": 1191 }, { "epoch": 0.24740556247405562, "grad_norm": 1.0469577936140428, "learning_rate": 3.483123917574427e-07, "loss": 1.506, "step": 1192 }, { "epoch": 0.24761311747613118, "grad_norm": 0.6722479637831869, "learning_rate": 3.482300508498781e-07, "loss": 1.4863, "step": 1193 }, { "epoch": 0.24782067247820672, "grad_norm": 0.6866664848811579, "learning_rate": 3.4814765542230016e-07, "loss": 1.5029, "step": 1194 }, { "epoch": 0.24802822748028228, "grad_norm": 0.6951110690195187, "learning_rate": 3.480652055097412e-07, "loss": 1.4922, "step": 1195 }, { "epoch": 0.24823578248235784, "grad_norm": 0.6210389677477952, "learning_rate": 3.479827011472568e-07, "loss": 1.5763, "step": 1196 }, { "epoch": 0.24844333748443337, "grad_norm": 0.9507026327261713, "learning_rate": 3.479001423699257e-07, "loss": 1.5166, "step": 1197 }, { "epoch": 0.24865089248650893, "grad_norm": 0.736680360946047, "learning_rate": 3.4781752921284957e-07, "loss": 1.4805, "step": 1198 }, { "epoch": 0.24885844748858446, "grad_norm": 0.8055130492879062, "learning_rate": 3.4773486171115336e-07, "loss": 1.5249, "step": 1199 }, { "epoch": 0.24906600249066002, "grad_norm": 0.7479198464303374, "learning_rate": 3.476521398999851e-07, "loss": 1.5196, "step": 1200 }, { "epoch": 0.24927355749273558, "grad_norm": 0.864718739179395, "learning_rate": 3.4756936381451604e-07, "loss": 1.5517, "step": 1201 }, { "epoch": 0.2494811124948111, "grad_norm": 2.2051962954573923, "learning_rate": 3.4748653348994013e-07, "loss": 1.4927, "step": 1202 }, { "epoch": 0.24968866749688667, "grad_norm": 0.7097734943702743, "learning_rate": 3.474036489614748e-07, "loss": 1.5037, "step": 1203 }, { "epoch": 0.24989622249896223, "grad_norm": 0.8835450203673174, "learning_rate": 3.473207102643603e-07, "loss": 1.5148, "step": 1204 }, { "epoch": 0.2501037775010378, "grad_norm": 0.663424009458325, "learning_rate": 3.4723771743386e-07, "loss": 1.5525, "step": 1205 }, { "epoch": 0.2503113325031133, "grad_norm": 0.8740416597945138, "learning_rate": 3.471546705052602e-07, "loss": 1.54, "step": 1206 }, { "epoch": 0.25051888750518886, "grad_norm": 0.9838597095129007, "learning_rate": 3.470715695138703e-07, "loss": 1.4978, "step": 1207 }, { "epoch": 0.25072644250726445, "grad_norm": 1.8699868039163179, "learning_rate": 3.4698841449502255e-07, "loss": 1.4988, "step": 1208 }, { "epoch": 0.25093399750934, "grad_norm": 1.0253374426664066, "learning_rate": 3.4690520548407234e-07, "loss": 1.598, "step": 1209 }, { "epoch": 0.2511415525114155, "grad_norm": 0.7399656560368909, "learning_rate": 3.4682194251639785e-07, "loss": 1.5229, "step": 1210 }, { "epoch": 0.2513491075134911, "grad_norm": 1.0208535629189506, "learning_rate": 3.467386256274004e-07, "loss": 1.5697, "step": 1211 }, { "epoch": 0.25155666251556663, "grad_norm": 1.4083379372509373, "learning_rate": 3.46655254852504e-07, "loss": 1.5417, "step": 1212 }, { "epoch": 0.25176421751764216, "grad_norm": 0.7794650017371283, "learning_rate": 3.465718302271558e-07, "loss": 1.4597, "step": 1213 }, { "epoch": 0.25197177251971775, "grad_norm": 0.9491926350594242, "learning_rate": 3.464883517868256e-07, "loss": 1.4628, "step": 1214 }, { "epoch": 0.2521793275217933, "grad_norm": 0.8708290005181476, "learning_rate": 3.4640481956700633e-07, "loss": 1.4727, "step": 1215 }, { "epoch": 0.2523868825238688, "grad_norm": 1.2105880211786428, "learning_rate": 3.463212336032137e-07, "loss": 1.5313, "step": 1216 }, { "epoch": 0.25259443752594435, "grad_norm": 1.629160809077407, "learning_rate": 3.462375939309861e-07, "loss": 1.5039, "step": 1217 }, { "epoch": 0.25280199252801994, "grad_norm": 0.9428744088974746, "learning_rate": 3.46153900585885e-07, "loss": 1.5279, "step": 1218 }, { "epoch": 0.25300954753009547, "grad_norm": 1.0797364122207653, "learning_rate": 3.4607015360349456e-07, "loss": 1.4873, "step": 1219 }, { "epoch": 0.253217102532171, "grad_norm": 0.6416712690429094, "learning_rate": 3.4598635301942177e-07, "loss": 1.5224, "step": 1220 }, { "epoch": 0.2534246575342466, "grad_norm": 0.747285449443041, "learning_rate": 3.4590249886929647e-07, "loss": 1.4692, "step": 1221 }, { "epoch": 0.2536322125363221, "grad_norm": 0.910949929056786, "learning_rate": 3.4581859118877117e-07, "loss": 1.5673, "step": 1222 }, { "epoch": 0.25383976753839765, "grad_norm": 0.9346303546125491, "learning_rate": 3.4573463001352116e-07, "loss": 1.5621, "step": 1223 }, { "epoch": 0.25404732254047324, "grad_norm": 0.6971197936963709, "learning_rate": 3.456506153792445e-07, "loss": 1.51, "step": 1224 }, { "epoch": 0.2542548775425488, "grad_norm": 2.8675278436766005, "learning_rate": 3.4556654732166204e-07, "loss": 1.526, "step": 1225 }, { "epoch": 0.2544624325446243, "grad_norm": 1.4822535966762262, "learning_rate": 3.454824258765173e-07, "loss": 1.5517, "step": 1226 }, { "epoch": 0.2546699875466999, "grad_norm": 1.1853114078047695, "learning_rate": 3.4539825107957643e-07, "loss": 1.5357, "step": 1227 }, { "epoch": 0.2548775425487754, "grad_norm": 1.5281746994092364, "learning_rate": 3.4531402296662827e-07, "loss": 1.5171, "step": 1228 }, { "epoch": 0.25508509755085096, "grad_norm": 0.6757285200932763, "learning_rate": 3.4522974157348455e-07, "loss": 1.4501, "step": 1229 }, { "epoch": 0.25529265255292655, "grad_norm": 1.689320557195999, "learning_rate": 3.4514540693597935e-07, "loss": 1.56, "step": 1230 }, { "epoch": 0.2555002075550021, "grad_norm": 1.2034464716258129, "learning_rate": 3.450610190899695e-07, "loss": 1.5487, "step": 1231 }, { "epoch": 0.2557077625570776, "grad_norm": 0.8955944160494794, "learning_rate": 3.449765780713345e-07, "loss": 1.6101, "step": 1232 }, { "epoch": 0.2559153175591532, "grad_norm": 1.116097814053556, "learning_rate": 3.4489208391597645e-07, "loss": 1.4694, "step": 1233 }, { "epoch": 0.25612287256122873, "grad_norm": 0.7482192292543534, "learning_rate": 3.4480753665982007e-07, "loss": 1.5272, "step": 1234 }, { "epoch": 0.25633042756330426, "grad_norm": 0.6543260224943683, "learning_rate": 3.4472293633881253e-07, "loss": 1.5059, "step": 1235 }, { "epoch": 0.25653798256537985, "grad_norm": 1.1024405593086009, "learning_rate": 3.4463828298892363e-07, "loss": 1.521, "step": 1236 }, { "epoch": 0.2567455375674554, "grad_norm": 0.9239975172305424, "learning_rate": 3.445535766461458e-07, "loss": 1.5815, "step": 1237 }, { "epoch": 0.2569530925695309, "grad_norm": 0.7747494025182335, "learning_rate": 3.4446881734649387e-07, "loss": 1.5642, "step": 1238 }, { "epoch": 0.25716064757160645, "grad_norm": 0.8008164601774657, "learning_rate": 3.443840051260053e-07, "loss": 1.5377, "step": 1239 }, { "epoch": 0.25736820257368204, "grad_norm": 0.7783037284441887, "learning_rate": 3.442991400207399e-07, "loss": 1.5056, "step": 1240 }, { "epoch": 0.25757575757575757, "grad_norm": 0.8229201099728607, "learning_rate": 3.442142220667802e-07, "loss": 1.5631, "step": 1241 }, { "epoch": 0.2577833125778331, "grad_norm": 0.9064519814581001, "learning_rate": 3.4412925130023086e-07, "loss": 1.5287, "step": 1242 }, { "epoch": 0.2579908675799087, "grad_norm": 0.8732323880198436, "learning_rate": 3.440442277572194e-07, "loss": 1.5394, "step": 1243 }, { "epoch": 0.2581984225819842, "grad_norm": 8.232261890447024, "learning_rate": 3.439591514738954e-07, "loss": 1.5204, "step": 1244 }, { "epoch": 0.25840597758405975, "grad_norm": 0.8898526844814135, "learning_rate": 3.438740224864312e-07, "loss": 1.5105, "step": 1245 }, { "epoch": 0.25861353258613534, "grad_norm": 0.9339172497931623, "learning_rate": 3.437888408310213e-07, "loss": 1.514, "step": 1246 }, { "epoch": 0.2588210875882109, "grad_norm": 0.9333694988769258, "learning_rate": 3.437036065438827e-07, "loss": 1.6093, "step": 1247 }, { "epoch": 0.2590286425902864, "grad_norm": 0.6794141893096668, "learning_rate": 3.4361831966125474e-07, "loss": 1.446, "step": 1248 }, { "epoch": 0.259236197592362, "grad_norm": 1.0455423324954585, "learning_rate": 3.4353298021939907e-07, "loss": 1.5688, "step": 1249 }, { "epoch": 0.2594437525944375, "grad_norm": 1.017536821245971, "learning_rate": 3.434475882545999e-07, "loss": 1.5796, "step": 1250 }, { "epoch": 0.25965130759651306, "grad_norm": 1.1193592725916437, "learning_rate": 3.433621438031635e-07, "loss": 1.5464, "step": 1251 }, { "epoch": 0.25985886259858865, "grad_norm": 1.766508056696705, "learning_rate": 3.4327664690141865e-07, "loss": 1.6188, "step": 1252 }, { "epoch": 0.2600664176006642, "grad_norm": 0.7096153328756882, "learning_rate": 3.4319109758571635e-07, "loss": 1.5531, "step": 1253 }, { "epoch": 0.2602739726027397, "grad_norm": 0.7521727115271983, "learning_rate": 3.431054958924299e-07, "loss": 1.5499, "step": 1254 }, { "epoch": 0.2604815276048153, "grad_norm": 1.0492497629841584, "learning_rate": 3.4301984185795487e-07, "loss": 1.4762, "step": 1255 }, { "epoch": 0.26068908260689083, "grad_norm": 0.8157158938997188, "learning_rate": 3.429341355187091e-07, "loss": 1.5297, "step": 1256 }, { "epoch": 0.26089663760896636, "grad_norm": 0.7112893149121823, "learning_rate": 3.4284837691113255e-07, "loss": 1.538, "step": 1257 }, { "epoch": 0.26110419261104195, "grad_norm": 1.456030997449654, "learning_rate": 3.427625660716876e-07, "loss": 1.5762, "step": 1258 }, { "epoch": 0.2613117476131175, "grad_norm": 0.7537865709228221, "learning_rate": 3.426767030368587e-07, "loss": 1.5248, "step": 1259 }, { "epoch": 0.261519302615193, "grad_norm": 0.9498334019222445, "learning_rate": 3.425907878431526e-07, "loss": 1.4351, "step": 1260 }, { "epoch": 0.26172685761726855, "grad_norm": 0.6692248470470078, "learning_rate": 3.425048205270981e-07, "loss": 1.4829, "step": 1261 }, { "epoch": 0.26193441261934414, "grad_norm": 1.0225832291961392, "learning_rate": 3.424188011252462e-07, "loss": 1.5155, "step": 1262 }, { "epoch": 0.26214196762141967, "grad_norm": 1.3243499833191412, "learning_rate": 3.4233272967417006e-07, "loss": 1.561, "step": 1263 }, { "epoch": 0.2623495226234952, "grad_norm": 0.6854015525731081, "learning_rate": 3.422466062104651e-07, "loss": 1.5237, "step": 1264 }, { "epoch": 0.2625570776255708, "grad_norm": 0.658758538643818, "learning_rate": 3.421604307707486e-07, "loss": 1.5389, "step": 1265 }, { "epoch": 0.2627646326276463, "grad_norm": 0.6743833453979914, "learning_rate": 3.420742033916601e-07, "loss": 1.5162, "step": 1266 }, { "epoch": 0.26297218762972185, "grad_norm": 0.94853333417178, "learning_rate": 3.419879241098612e-07, "loss": 1.5222, "step": 1267 }, { "epoch": 0.26317974263179744, "grad_norm": 0.7766625914771926, "learning_rate": 3.419015929620356e-07, "loss": 1.4911, "step": 1268 }, { "epoch": 0.263387297633873, "grad_norm": 0.6997670424334552, "learning_rate": 3.4181520998488895e-07, "loss": 1.56, "step": 1269 }, { "epoch": 0.2635948526359485, "grad_norm": 0.77325213552909, "learning_rate": 3.4172877521514905e-07, "loss": 1.5923, "step": 1270 }, { "epoch": 0.2638024076380241, "grad_norm": 1.539055402435133, "learning_rate": 3.4164228868956563e-07, "loss": 1.574, "step": 1271 }, { "epoch": 0.2640099626400996, "grad_norm": 1.2536444220547092, "learning_rate": 3.415557504449105e-07, "loss": 1.5091, "step": 1272 }, { "epoch": 0.26421751764217516, "grad_norm": 0.7575551879670133, "learning_rate": 3.4146916051797735e-07, "loss": 1.5458, "step": 1273 }, { "epoch": 0.26442507264425075, "grad_norm": 0.807457667790524, "learning_rate": 3.41382518945582e-07, "loss": 1.5439, "step": 1274 }, { "epoch": 0.2646326276463263, "grad_norm": 0.7248201442239209, "learning_rate": 3.4129582576456213e-07, "loss": 1.5607, "step": 1275 }, { "epoch": 0.2648401826484018, "grad_norm": 0.7703701356946464, "learning_rate": 3.412090810117774e-07, "loss": 1.5348, "step": 1276 }, { "epoch": 0.2650477376504774, "grad_norm": 0.6790812074627258, "learning_rate": 3.411222847241092e-07, "loss": 1.605, "step": 1277 }, { "epoch": 0.26525529265255293, "grad_norm": 0.8150481788979888, "learning_rate": 3.4103543693846126e-07, "loss": 1.5238, "step": 1278 }, { "epoch": 0.26546284765462846, "grad_norm": 0.9352100500133862, "learning_rate": 3.409485376917589e-07, "loss": 1.5305, "step": 1279 }, { "epoch": 0.26567040265670405, "grad_norm": 0.7780215728437782, "learning_rate": 3.408615870209492e-07, "loss": 1.5196, "step": 1280 }, { "epoch": 0.2658779576587796, "grad_norm": 0.6253958895837718, "learning_rate": 3.4077458496300145e-07, "loss": 1.5393, "step": 1281 }, { "epoch": 0.2660855126608551, "grad_norm": 0.9639590714174837, "learning_rate": 3.406875315549066e-07, "loss": 1.5241, "step": 1282 }, { "epoch": 0.26629306766293065, "grad_norm": 0.6804335985210204, "learning_rate": 3.406004268336773e-07, "loss": 1.5337, "step": 1283 }, { "epoch": 0.26650062266500624, "grad_norm": 0.9608046822867599, "learning_rate": 3.405132708363483e-07, "loss": 1.6063, "step": 1284 }, { "epoch": 0.26670817766708177, "grad_norm": 0.8008725822506221, "learning_rate": 3.40426063599976e-07, "loss": 1.5602, "step": 1285 }, { "epoch": 0.2669157326691573, "grad_norm": 0.9153818179597167, "learning_rate": 3.4033880516163855e-07, "loss": 1.5005, "step": 1286 }, { "epoch": 0.2671232876712329, "grad_norm": 0.7747693875887444, "learning_rate": 3.40251495558436e-07, "loss": 1.5147, "step": 1287 }, { "epoch": 0.2673308426733084, "grad_norm": 0.8738491672385178, "learning_rate": 3.4016413482749006e-07, "loss": 1.6194, "step": 1288 }, { "epoch": 0.26753839767538395, "grad_norm": 7.597456785044188, "learning_rate": 3.4007672300594414e-07, "loss": 1.4556, "step": 1289 }, { "epoch": 0.26774595267745954, "grad_norm": 0.6610778339741703, "learning_rate": 3.3998926013096345e-07, "loss": 1.5336, "step": 1290 }, { "epoch": 0.2679535076795351, "grad_norm": 0.7975392858296627, "learning_rate": 3.399017462397349e-07, "loss": 1.6103, "step": 1291 }, { "epoch": 0.2681610626816106, "grad_norm": 0.870348270223364, "learning_rate": 3.39814181369467e-07, "loss": 1.6068, "step": 1292 }, { "epoch": 0.2683686176836862, "grad_norm": 0.9142153106628116, "learning_rate": 3.397265655573901e-07, "loss": 1.5689, "step": 1293 }, { "epoch": 0.2685761726857617, "grad_norm": 0.831001094053461, "learning_rate": 3.3963889884075613e-07, "loss": 1.5699, "step": 1294 }, { "epoch": 0.26878372768783726, "grad_norm": 1.0630847627657554, "learning_rate": 3.395511812568386e-07, "loss": 1.4947, "step": 1295 }, { "epoch": 0.26899128268991285, "grad_norm": 1.0434866633138011, "learning_rate": 3.394634128429326e-07, "loss": 1.5765, "step": 1296 }, { "epoch": 0.2691988376919884, "grad_norm": 1.4759916928688286, "learning_rate": 3.3937559363635517e-07, "loss": 1.5064, "step": 1297 }, { "epoch": 0.2694063926940639, "grad_norm": 0.8175486075063759, "learning_rate": 3.392877236744445e-07, "loss": 1.551, "step": 1298 }, { "epoch": 0.2696139476961395, "grad_norm": 1.2467956338180735, "learning_rate": 3.391998029945606e-07, "loss": 1.5079, "step": 1299 }, { "epoch": 0.26982150269821503, "grad_norm": 0.7962141657570511, "learning_rate": 3.391118316340851e-07, "loss": 1.5688, "step": 1300 }, { "epoch": 0.27002905770029056, "grad_norm": 0.8663931529115972, "learning_rate": 3.3902380963042103e-07, "loss": 1.4879, "step": 1301 }, { "epoch": 0.27023661270236615, "grad_norm": 1.5227446569908352, "learning_rate": 3.3893573702099295e-07, "loss": 1.5449, "step": 1302 }, { "epoch": 0.2704441677044417, "grad_norm": 1.5952769930727484, "learning_rate": 3.388476138432471e-07, "loss": 1.5795, "step": 1303 }, { "epoch": 0.2706517227065172, "grad_norm": 0.9920665270345874, "learning_rate": 3.3875944013465106e-07, "loss": 1.6059, "step": 1304 }, { "epoch": 0.27085927770859275, "grad_norm": 0.7767794174497293, "learning_rate": 3.38671215932694e-07, "loss": 1.4674, "step": 1305 }, { "epoch": 0.27106683271066834, "grad_norm": 0.7223885972104012, "learning_rate": 3.3858294127488636e-07, "loss": 1.5108, "step": 1306 }, { "epoch": 0.27127438771274387, "grad_norm": 0.7703347315478367, "learning_rate": 3.3849461619876033e-07, "loss": 1.5391, "step": 1307 }, { "epoch": 0.2714819427148194, "grad_norm": 1.083558951353024, "learning_rate": 3.3840624074186946e-07, "loss": 1.5693, "step": 1308 }, { "epoch": 0.271689497716895, "grad_norm": 0.7960948613082768, "learning_rate": 3.383178149417884e-07, "loss": 1.6374, "step": 1309 }, { "epoch": 0.2718970527189705, "grad_norm": 0.8181028809341938, "learning_rate": 3.382293388361136e-07, "loss": 1.5521, "step": 1310 }, { "epoch": 0.27210460772104605, "grad_norm": 0.822942233450186, "learning_rate": 3.381408124624628e-07, "loss": 1.5792, "step": 1311 }, { "epoch": 0.27231216272312164, "grad_norm": 1.0283885868319704, "learning_rate": 3.3805223585847493e-07, "loss": 1.5356, "step": 1312 }, { "epoch": 0.2725197177251972, "grad_norm": 0.9953628936821117, "learning_rate": 3.3796360906181054e-07, "loss": 1.525, "step": 1313 }, { "epoch": 0.2727272727272727, "grad_norm": 1.6313072242785849, "learning_rate": 3.3787493211015133e-07, "loss": 1.5858, "step": 1314 }, { "epoch": 0.2729348277293483, "grad_norm": 0.7614107450589407, "learning_rate": 3.377862050412003e-07, "loss": 1.4772, "step": 1315 }, { "epoch": 0.2731423827314238, "grad_norm": 0.7517826941209897, "learning_rate": 3.376974278926821e-07, "loss": 1.5588, "step": 1316 }, { "epoch": 0.27334993773349936, "grad_norm": 0.9205421541542501, "learning_rate": 3.376086007023421e-07, "loss": 1.5645, "step": 1317 }, { "epoch": 0.27355749273557495, "grad_norm": 0.644662343214532, "learning_rate": 3.375197235079474e-07, "loss": 1.5083, "step": 1318 }, { "epoch": 0.2737650477376505, "grad_norm": 0.6802868003081509, "learning_rate": 3.3743079634728637e-07, "loss": 1.6246, "step": 1319 }, { "epoch": 0.273972602739726, "grad_norm": 0.768816148544853, "learning_rate": 3.3734181925816824e-07, "loss": 1.5766, "step": 1320 }, { "epoch": 0.2741801577418016, "grad_norm": 0.7713671834891636, "learning_rate": 3.3725279227842385e-07, "loss": 1.5415, "step": 1321 }, { "epoch": 0.27438771274387713, "grad_norm": 0.6992736466420109, "learning_rate": 3.371637154459051e-07, "loss": 1.5487, "step": 1322 }, { "epoch": 0.27459526774595266, "grad_norm": 1.1645797501442725, "learning_rate": 3.37074588798485e-07, "loss": 1.505, "step": 1323 }, { "epoch": 0.27480282274802825, "grad_norm": 0.8082129119597378, "learning_rate": 3.369854123740579e-07, "loss": 1.4956, "step": 1324 }, { "epoch": 0.2750103777501038, "grad_norm": 1.04479833148827, "learning_rate": 3.3689618621053924e-07, "loss": 1.5379, "step": 1325 }, { "epoch": 0.2752179327521793, "grad_norm": 0.7179713369018662, "learning_rate": 3.3680691034586565e-07, "loss": 1.4695, "step": 1326 }, { "epoch": 0.2754254877542549, "grad_norm": 0.7686131203548663, "learning_rate": 3.3671758481799484e-07, "loss": 1.4641, "step": 1327 }, { "epoch": 0.27563304275633044, "grad_norm": 4.48297428836442, "learning_rate": 3.366282096649056e-07, "loss": 1.5001, "step": 1328 }, { "epoch": 0.27584059775840597, "grad_norm": 0.6928145750710054, "learning_rate": 3.3653878492459793e-07, "loss": 1.5642, "step": 1329 }, { "epoch": 0.2760481527604815, "grad_norm": 0.9673980706438595, "learning_rate": 3.3644931063509287e-07, "loss": 1.6038, "step": 1330 }, { "epoch": 0.2762557077625571, "grad_norm": 0.7952910298893379, "learning_rate": 3.363597868344324e-07, "loss": 1.5937, "step": 1331 }, { "epoch": 0.2764632627646326, "grad_norm": 1.4477721379840571, "learning_rate": 3.3627021356067987e-07, "loss": 1.5336, "step": 1332 }, { "epoch": 0.27667081776670815, "grad_norm": 0.62242184799139, "learning_rate": 3.361805908519192e-07, "loss": 1.516, "step": 1333 }, { "epoch": 0.27687837276878374, "grad_norm": 0.733347199404752, "learning_rate": 3.360909187462558e-07, "loss": 1.466, "step": 1334 }, { "epoch": 0.2770859277708593, "grad_norm": 0.7924465185545144, "learning_rate": 3.360011972818158e-07, "loss": 1.6244, "step": 1335 }, { "epoch": 0.2772934827729348, "grad_norm": 1.1516186087648017, "learning_rate": 3.359114264967463e-07, "loss": 1.5155, "step": 1336 }, { "epoch": 0.2775010377750104, "grad_norm": 0.8312352958790239, "learning_rate": 3.358216064292156e-07, "loss": 1.5276, "step": 1337 }, { "epoch": 0.2777085927770859, "grad_norm": 0.7314857669887004, "learning_rate": 3.357317371174127e-07, "loss": 1.5777, "step": 1338 }, { "epoch": 0.27791614777916146, "grad_norm": 1.3839777919660108, "learning_rate": 3.356418185995477e-07, "loss": 1.571, "step": 1339 }, { "epoch": 0.27812370278123705, "grad_norm": 0.8427359483829014, "learning_rate": 3.355518509138515e-07, "loss": 1.5695, "step": 1340 }, { "epoch": 0.2783312577833126, "grad_norm": 0.8102609613745911, "learning_rate": 3.3546183409857605e-07, "loss": 1.5913, "step": 1341 }, { "epoch": 0.2785388127853881, "grad_norm": 0.6309960468971255, "learning_rate": 3.3537176819199407e-07, "loss": 1.5117, "step": 1342 }, { "epoch": 0.2787463677874637, "grad_norm": 0.655136803538487, "learning_rate": 3.352816532323992e-07, "loss": 1.5687, "step": 1343 }, { "epoch": 0.27895392278953923, "grad_norm": 1.0736280622254268, "learning_rate": 3.35191489258106e-07, "loss": 1.526, "step": 1344 }, { "epoch": 0.27916147779161476, "grad_norm": 2.5400433135755356, "learning_rate": 3.351012763074496e-07, "loss": 1.5219, "step": 1345 }, { "epoch": 0.27936903279369035, "grad_norm": 0.8499342752672578, "learning_rate": 3.350110144187864e-07, "loss": 1.6216, "step": 1346 }, { "epoch": 0.2795765877957659, "grad_norm": 0.9035787915782699, "learning_rate": 3.3492070363049315e-07, "loss": 1.6016, "step": 1347 }, { "epoch": 0.2797841427978414, "grad_norm": 1.0699886207147347, "learning_rate": 3.3483034398096777e-07, "loss": 1.4731, "step": 1348 }, { "epoch": 0.279991697799917, "grad_norm": 2.166681950803039, "learning_rate": 3.347399355086286e-07, "loss": 1.504, "step": 1349 }, { "epoch": 0.28019925280199254, "grad_norm": 0.7351985886540563, "learning_rate": 3.3464947825191507e-07, "loss": 1.5447, "step": 1350 }, { "epoch": 0.28040680780406807, "grad_norm": 0.8037359505487619, "learning_rate": 3.3455897224928717e-07, "loss": 1.5657, "step": 1351 }, { "epoch": 0.2806143628061436, "grad_norm": 0.6751663426318365, "learning_rate": 3.3446841753922565e-07, "loss": 1.4916, "step": 1352 }, { "epoch": 0.2808219178082192, "grad_norm": 1.0977158443448278, "learning_rate": 3.343778141602319e-07, "loss": 1.486, "step": 1353 }, { "epoch": 0.2810294728102947, "grad_norm": 0.8736167191500952, "learning_rate": 3.3428716215082823e-07, "loss": 1.5328, "step": 1354 }, { "epoch": 0.28123702781237025, "grad_norm": 0.8801826959208915, "learning_rate": 3.341964615495573e-07, "loss": 1.5398, "step": 1355 }, { "epoch": 0.28144458281444584, "grad_norm": 0.8796282094919217, "learning_rate": 3.3410571239498266e-07, "loss": 1.5499, "step": 1356 }, { "epoch": 0.2816521378165214, "grad_norm": 0.722629334914455, "learning_rate": 3.3401491472568843e-07, "loss": 1.4327, "step": 1357 }, { "epoch": 0.2818596928185969, "grad_norm": 1.0383693580401288, "learning_rate": 3.339240685802794e-07, "loss": 1.5016, "step": 1358 }, { "epoch": 0.2820672478206725, "grad_norm": 1.0356899312072005, "learning_rate": 3.338331739973809e-07, "loss": 1.5734, "step": 1359 }, { "epoch": 0.282274802822748, "grad_norm": 0.7177652800960455, "learning_rate": 3.3374223101563894e-07, "loss": 1.578, "step": 1360 }, { "epoch": 0.28248235782482356, "grad_norm": 0.7229667525162314, "learning_rate": 3.3365123967372e-07, "loss": 1.5139, "step": 1361 }, { "epoch": 0.28268991282689915, "grad_norm": 1.4787085351149962, "learning_rate": 3.3356020001031126e-07, "loss": 1.5397, "step": 1362 }, { "epoch": 0.2828974678289747, "grad_norm": 0.6660004355114754, "learning_rate": 3.3346911206412033e-07, "loss": 1.5141, "step": 1363 }, { "epoch": 0.2831050228310502, "grad_norm": 0.6433280477219206, "learning_rate": 3.333779758738754e-07, "loss": 1.5792, "step": 1364 }, { "epoch": 0.2833125778331258, "grad_norm": 0.7489890878045019, "learning_rate": 3.3328679147832516e-07, "loss": 1.5808, "step": 1365 }, { "epoch": 0.28352013283520133, "grad_norm": 0.8010819810767609, "learning_rate": 3.3319555891623864e-07, "loss": 1.5154, "step": 1366 }, { "epoch": 0.28372768783727687, "grad_norm": 0.9181979113229911, "learning_rate": 3.331042782264058e-07, "loss": 1.497, "step": 1367 }, { "epoch": 0.28393524283935245, "grad_norm": 0.7165485941022961, "learning_rate": 3.3301294944763647e-07, "loss": 1.5121, "step": 1368 }, { "epoch": 0.284142797841428, "grad_norm": 0.6830942915816728, "learning_rate": 3.3292157261876146e-07, "loss": 1.482, "step": 1369 }, { "epoch": 0.2843503528435035, "grad_norm": 0.8371948148412527, "learning_rate": 3.3283014777863165e-07, "loss": 1.5007, "step": 1370 }, { "epoch": 0.2845579078455791, "grad_norm": 0.762599666183577, "learning_rate": 3.327386749661185e-07, "loss": 1.4943, "step": 1371 }, { "epoch": 0.28476546284765464, "grad_norm": 0.8275233197463118, "learning_rate": 3.326471542201137e-07, "loss": 1.5951, "step": 1372 }, { "epoch": 0.28497301784973017, "grad_norm": 0.7922012907311398, "learning_rate": 3.3255558557952965e-07, "loss": 1.4832, "step": 1373 }, { "epoch": 0.2851805728518057, "grad_norm": 0.6497828187569625, "learning_rate": 3.324639690832987e-07, "loss": 1.6181, "step": 1374 }, { "epoch": 0.2853881278538813, "grad_norm": 0.8024742814319767, "learning_rate": 3.3237230477037387e-07, "loss": 1.5728, "step": 1375 }, { "epoch": 0.2855956828559568, "grad_norm": 1.2464912098915424, "learning_rate": 3.322805926797284e-07, "loss": 1.5018, "step": 1376 }, { "epoch": 0.28580323785803236, "grad_norm": 1.2118377303976169, "learning_rate": 3.321888328503558e-07, "loss": 1.488, "step": 1377 }, { "epoch": 0.28601079286010794, "grad_norm": 0.8677127039471785, "learning_rate": 3.320970253212699e-07, "loss": 1.52, "step": 1378 }, { "epoch": 0.2862183478621835, "grad_norm": 0.7946896236611976, "learning_rate": 3.3200517013150485e-07, "loss": 1.6039, "step": 1379 }, { "epoch": 0.286425902864259, "grad_norm": 1.0209073010802001, "learning_rate": 3.31913267320115e-07, "loss": 1.51, "step": 1380 }, { "epoch": 0.2866334578663346, "grad_norm": 1.25456686165671, "learning_rate": 3.3182131692617497e-07, "loss": 1.557, "step": 1381 }, { "epoch": 0.28684101286841013, "grad_norm": 1.0615213634109164, "learning_rate": 3.3172931898877976e-07, "loss": 1.5744, "step": 1382 }, { "epoch": 0.28704856787048566, "grad_norm": 0.6858341477285249, "learning_rate": 3.316372735470444e-07, "loss": 1.4688, "step": 1383 }, { "epoch": 0.28725612287256125, "grad_norm": 0.694940603241787, "learning_rate": 3.3154518064010403e-07, "loss": 1.5382, "step": 1384 }, { "epoch": 0.2874636778746368, "grad_norm": 4.924309431518236, "learning_rate": 3.314530403071142e-07, "loss": 1.4993, "step": 1385 }, { "epoch": 0.2876712328767123, "grad_norm": 0.878152732898713, "learning_rate": 3.313608525872506e-07, "loss": 1.5711, "step": 1386 }, { "epoch": 0.2878787878787879, "grad_norm": 0.8255705074112311, "learning_rate": 3.312686175197089e-07, "loss": 1.5321, "step": 1387 }, { "epoch": 0.28808634288086343, "grad_norm": 0.670398028968472, "learning_rate": 3.311763351437051e-07, "loss": 1.529, "step": 1388 }, { "epoch": 0.28829389788293897, "grad_norm": 1.141528302956471, "learning_rate": 3.310840054984751e-07, "loss": 1.4962, "step": 1389 }, { "epoch": 0.28850145288501455, "grad_norm": 0.980374076707978, "learning_rate": 3.3099162862327517e-07, "loss": 1.571, "step": 1390 }, { "epoch": 0.2887090078870901, "grad_norm": 0.7968563122606515, "learning_rate": 3.3089920455738135e-07, "loss": 1.4797, "step": 1391 }, { "epoch": 0.2889165628891656, "grad_norm": 0.7778048330000662, "learning_rate": 3.308067333400899e-07, "loss": 1.5637, "step": 1392 }, { "epoch": 0.2891241178912412, "grad_norm": 0.6469492821123097, "learning_rate": 3.307142150107172e-07, "loss": 1.5405, "step": 1393 }, { "epoch": 0.28933167289331674, "grad_norm": 0.8904714328368389, "learning_rate": 3.306216496085996e-07, "loss": 1.5123, "step": 1394 }, { "epoch": 0.28953922789539227, "grad_norm": 0.841095944065256, "learning_rate": 3.305290371730935e-07, "loss": 1.5405, "step": 1395 }, { "epoch": 0.2897467828974678, "grad_norm": 0.9836085117911635, "learning_rate": 3.304363777435751e-07, "loss": 1.5187, "step": 1396 }, { "epoch": 0.2899543378995434, "grad_norm": 0.6847796469797756, "learning_rate": 3.3034367135944077e-07, "loss": 1.5553, "step": 1397 }, { "epoch": 0.2901618929016189, "grad_norm": 1.3767365081878549, "learning_rate": 3.302509180601069e-07, "loss": 1.4842, "step": 1398 }, { "epoch": 0.29036944790369446, "grad_norm": 1.7179775329560176, "learning_rate": 3.3015811788500965e-07, "loss": 1.5059, "step": 1399 }, { "epoch": 0.29057700290577004, "grad_norm": 0.8127238892282448, "learning_rate": 3.300652708736052e-07, "loss": 1.4776, "step": 1400 }, { "epoch": 0.2907845579078456, "grad_norm": 0.8727007834826351, "learning_rate": 3.299723770653696e-07, "loss": 1.5215, "step": 1401 }, { "epoch": 0.2909921129099211, "grad_norm": 0.6508045803468447, "learning_rate": 3.2987943649979894e-07, "loss": 1.5088, "step": 1402 }, { "epoch": 0.2911996679119967, "grad_norm": 0.8812833851828636, "learning_rate": 3.29786449216409e-07, "loss": 1.5782, "step": 1403 }, { "epoch": 0.29140722291407223, "grad_norm": 1.1967984831572973, "learning_rate": 3.2969341525473545e-07, "loss": 1.5297, "step": 1404 }, { "epoch": 0.29161477791614776, "grad_norm": 2.2821788784399586, "learning_rate": 3.2960033465433404e-07, "loss": 1.5594, "step": 1405 }, { "epoch": 0.29182233291822335, "grad_norm": 0.9436529448984353, "learning_rate": 3.2950720745477995e-07, "loss": 1.5813, "step": 1406 }, { "epoch": 0.2920298879202989, "grad_norm": 0.6671248652370418, "learning_rate": 3.294140336956686e-07, "loss": 1.6244, "step": 1407 }, { "epoch": 0.2922374429223744, "grad_norm": 0.7773983954069197, "learning_rate": 3.293208134166148e-07, "loss": 1.5582, "step": 1408 }, { "epoch": 0.29244499792445, "grad_norm": 1.306306347959827, "learning_rate": 3.292275466572535e-07, "loss": 1.5607, "step": 1409 }, { "epoch": 0.29265255292652553, "grad_norm": 0.7771192019268828, "learning_rate": 3.291342334572392e-07, "loss": 1.5766, "step": 1410 }, { "epoch": 0.29286010792860107, "grad_norm": 1.053851512481581, "learning_rate": 3.290408738562462e-07, "loss": 1.5459, "step": 1411 }, { "epoch": 0.29306766293067665, "grad_norm": 0.7854016789739342, "learning_rate": 3.2894746789396843e-07, "loss": 1.5953, "step": 1412 }, { "epoch": 0.2932752179327522, "grad_norm": 0.8811751599708441, "learning_rate": 3.288540156101197e-07, "loss": 1.5447, "step": 1413 }, { "epoch": 0.2934827729348277, "grad_norm": 0.8532443790437596, "learning_rate": 3.2876051704443356e-07, "loss": 1.504, "step": 1414 }, { "epoch": 0.2936903279369033, "grad_norm": 0.7224694881158574, "learning_rate": 3.286669722366628e-07, "loss": 1.5505, "step": 1415 }, { "epoch": 0.29389788293897884, "grad_norm": 0.754753634337428, "learning_rate": 3.2857338122658054e-07, "loss": 1.4468, "step": 1416 }, { "epoch": 0.29410543794105437, "grad_norm": 1.2362141982583785, "learning_rate": 3.2847974405397904e-07, "loss": 1.4769, "step": 1417 }, { "epoch": 0.2943129929431299, "grad_norm": 0.9164007262195232, "learning_rate": 3.283860607586703e-07, "loss": 1.4946, "step": 1418 }, { "epoch": 0.2945205479452055, "grad_norm": 1.1049340030253059, "learning_rate": 3.28292331380486e-07, "loss": 1.5109, "step": 1419 }, { "epoch": 0.294728102947281, "grad_norm": 1.1513378433777173, "learning_rate": 3.281985559592775e-07, "loss": 1.4919, "step": 1420 }, { "epoch": 0.29493565794935656, "grad_norm": 1.3770131462140454, "learning_rate": 3.281047345349154e-07, "loss": 1.6033, "step": 1421 }, { "epoch": 0.29514321295143214, "grad_norm": 0.7988219249609612, "learning_rate": 3.280108671472902e-07, "loss": 1.524, "step": 1422 }, { "epoch": 0.2953507679535077, "grad_norm": 1.8136097396798223, "learning_rate": 3.279169538363119e-07, "loss": 1.5501, "step": 1423 }, { "epoch": 0.2955583229555832, "grad_norm": 1.0985061222247552, "learning_rate": 3.2782299464190977e-07, "loss": 1.5024, "step": 1424 }, { "epoch": 0.2957658779576588, "grad_norm": 2.274843550556237, "learning_rate": 3.277289896040329e-07, "loss": 1.4161, "step": 1425 }, { "epoch": 0.29597343295973433, "grad_norm": 0.7016579066219234, "learning_rate": 3.276349387626497e-07, "loss": 1.5348, "step": 1426 }, { "epoch": 0.29618098796180986, "grad_norm": 3.9305990413286143, "learning_rate": 3.2754084215774805e-07, "loss": 1.5662, "step": 1427 }, { "epoch": 0.29638854296388545, "grad_norm": 1.3508454303397472, "learning_rate": 3.2744669982933533e-07, "loss": 1.4868, "step": 1428 }, { "epoch": 0.296596097965961, "grad_norm": 0.8133253397220569, "learning_rate": 3.273525118174385e-07, "loss": 1.5166, "step": 1429 }, { "epoch": 0.2968036529680365, "grad_norm": 0.7292492744462185, "learning_rate": 3.272582781621036e-07, "loss": 1.4296, "step": 1430 }, { "epoch": 0.2970112079701121, "grad_norm": 1.1744269924600776, "learning_rate": 3.271639989033964e-07, "loss": 1.5518, "step": 1431 }, { "epoch": 0.29721876297218763, "grad_norm": 0.8617304688474138, "learning_rate": 3.270696740814019e-07, "loss": 1.5373, "step": 1432 }, { "epoch": 0.29742631797426317, "grad_norm": 0.7748498227845443, "learning_rate": 3.2697530373622456e-07, "loss": 1.5572, "step": 1433 }, { "epoch": 0.29763387297633875, "grad_norm": 0.996435065936527, "learning_rate": 3.2688088790798805e-07, "loss": 1.5103, "step": 1434 }, { "epoch": 0.2978414279784143, "grad_norm": 0.7514739154389984, "learning_rate": 3.267864266368356e-07, "loss": 1.5786, "step": 1435 }, { "epoch": 0.2980489829804898, "grad_norm": 0.7802674892361465, "learning_rate": 3.266919199629295e-07, "loss": 1.5494, "step": 1436 }, { "epoch": 0.2982565379825654, "grad_norm": 2.4547405386977643, "learning_rate": 3.265973679264515e-07, "loss": 1.523, "step": 1437 }, { "epoch": 0.29846409298464094, "grad_norm": 1.8008797986363987, "learning_rate": 3.2650277056760277e-07, "loss": 1.5008, "step": 1438 }, { "epoch": 0.29867164798671647, "grad_norm": 0.7646547976048881, "learning_rate": 3.264081279266034e-07, "loss": 1.4195, "step": 1439 }, { "epoch": 0.298879202988792, "grad_norm": 0.8377092749577859, "learning_rate": 3.2631344004369303e-07, "loss": 1.6047, "step": 1440 }, { "epoch": 0.2990867579908676, "grad_norm": 0.7624046233112225, "learning_rate": 3.262187069591304e-07, "loss": 1.5179, "step": 1441 }, { "epoch": 0.2992943129929431, "grad_norm": 1.5539460774173561, "learning_rate": 3.2612392871319356e-07, "loss": 1.5303, "step": 1442 }, { "epoch": 0.29950186799501866, "grad_norm": 1.8339698584285773, "learning_rate": 3.2602910534617966e-07, "loss": 1.5572, "step": 1443 }, { "epoch": 0.29970942299709424, "grad_norm": 0.7229225258599659, "learning_rate": 3.2593423689840504e-07, "loss": 1.5798, "step": 1444 }, { "epoch": 0.2999169779991698, "grad_norm": 1.1951479312285485, "learning_rate": 3.2583932341020524e-07, "loss": 1.4973, "step": 1445 }, { "epoch": 0.3001245330012453, "grad_norm": 0.6510380730429101, "learning_rate": 3.2574436492193507e-07, "loss": 1.4627, "step": 1446 }, { "epoch": 0.3003320880033209, "grad_norm": 8.980845521490364, "learning_rate": 3.2564936147396826e-07, "loss": 1.5012, "step": 1447 }, { "epoch": 0.30053964300539643, "grad_norm": 0.649235346083898, "learning_rate": 3.2555431310669786e-07, "loss": 1.5352, "step": 1448 }, { "epoch": 0.30074719800747196, "grad_norm": 0.6584661656135817, "learning_rate": 3.2545921986053574e-07, "loss": 1.5303, "step": 1449 }, { "epoch": 0.30095475300954755, "grad_norm": 0.8579868121235932, "learning_rate": 3.253640817759132e-07, "loss": 1.4733, "step": 1450 }, { "epoch": 0.3011623080116231, "grad_norm": 0.874759434511237, "learning_rate": 3.252688988932803e-07, "loss": 1.5217, "step": 1451 }, { "epoch": 0.3013698630136986, "grad_norm": 1.8249093374354721, "learning_rate": 3.2517367125310633e-07, "loss": 1.5272, "step": 1452 }, { "epoch": 0.3015774180157742, "grad_norm": 0.9111732378935296, "learning_rate": 3.250783988958795e-07, "loss": 1.5509, "step": 1453 }, { "epoch": 0.30178497301784973, "grad_norm": 6.763506849431699, "learning_rate": 3.2498308186210716e-07, "loss": 1.545, "step": 1454 }, { "epoch": 0.30199252801992527, "grad_norm": 0.6229391503821824, "learning_rate": 3.248877201923156e-07, "loss": 1.4267, "step": 1455 }, { "epoch": 0.30220008302200085, "grad_norm": 0.7965010096817351, "learning_rate": 3.2479231392704994e-07, "loss": 1.5477, "step": 1456 }, { "epoch": 0.3024076380240764, "grad_norm": 0.7047090854295499, "learning_rate": 3.2469686310687453e-07, "loss": 1.5514, "step": 1457 }, { "epoch": 0.3026151930261519, "grad_norm": 1.2636396073348313, "learning_rate": 3.246013677723725e-07, "loss": 1.552, "step": 1458 }, { "epoch": 0.3028227480282275, "grad_norm": 0.6724373666837078, "learning_rate": 3.2450582796414583e-07, "loss": 1.6598, "step": 1459 }, { "epoch": 0.30303030303030304, "grad_norm": 0.8141770331673945, "learning_rate": 3.244102437228157e-07, "loss": 1.5257, "step": 1460 }, { "epoch": 0.30323785803237857, "grad_norm": 0.6351867484208295, "learning_rate": 3.2431461508902177e-07, "loss": 1.4958, "step": 1461 }, { "epoch": 0.3034454130344541, "grad_norm": 1.1014433562112305, "learning_rate": 3.2421894210342294e-07, "loss": 1.5116, "step": 1462 }, { "epoch": 0.3036529680365297, "grad_norm": 0.9439205676959231, "learning_rate": 3.241232248066967e-07, "loss": 1.6225, "step": 1463 }, { "epoch": 0.3038605230386052, "grad_norm": 1.2747688983522443, "learning_rate": 3.2402746323953973e-07, "loss": 1.5608, "step": 1464 }, { "epoch": 0.30406807804068076, "grad_norm": 1.2117643087248915, "learning_rate": 3.239316574426671e-07, "loss": 1.5473, "step": 1465 }, { "epoch": 0.30427563304275634, "grad_norm": 1.4327599611642121, "learning_rate": 3.2383580745681287e-07, "loss": 1.5785, "step": 1466 }, { "epoch": 0.3044831880448319, "grad_norm": 0.7804375176536272, "learning_rate": 3.2373991332273005e-07, "loss": 1.5125, "step": 1467 }, { "epoch": 0.3046907430469074, "grad_norm": 0.8744877261273809, "learning_rate": 3.2364397508119025e-07, "loss": 1.5426, "step": 1468 }, { "epoch": 0.304898298048983, "grad_norm": 0.652419939295772, "learning_rate": 3.235479927729838e-07, "loss": 1.5356, "step": 1469 }, { "epoch": 0.30510585305105853, "grad_norm": 0.7963126907186158, "learning_rate": 3.2345196643891997e-07, "loss": 1.575, "step": 1470 }, { "epoch": 0.30531340805313406, "grad_norm": 0.7956472285325714, "learning_rate": 3.233558961198264e-07, "loss": 1.4802, "step": 1471 }, { "epoch": 0.30552096305520965, "grad_norm": 0.7809079460953622, "learning_rate": 3.2325978185654973e-07, "loss": 1.4406, "step": 1472 }, { "epoch": 0.3057285180572852, "grad_norm": 0.8946802679198177, "learning_rate": 3.2316362368995524e-07, "loss": 1.5148, "step": 1473 }, { "epoch": 0.3059360730593607, "grad_norm": 0.9007462742163707, "learning_rate": 3.230674216609268e-07, "loss": 1.5207, "step": 1474 }, { "epoch": 0.3061436280614363, "grad_norm": 1.1915168831473029, "learning_rate": 3.2297117581036697e-07, "loss": 1.4669, "step": 1475 }, { "epoch": 0.30635118306351183, "grad_norm": 0.7114571722835049, "learning_rate": 3.2287488617919693e-07, "loss": 1.5498, "step": 1476 }, { "epoch": 0.30655873806558737, "grad_norm": 0.6974210022531402, "learning_rate": 3.227785528083564e-07, "loss": 1.5246, "step": 1477 }, { "epoch": 0.30676629306766295, "grad_norm": 0.8418907685530905, "learning_rate": 3.2268217573880387e-07, "loss": 1.508, "step": 1478 }, { "epoch": 0.3069738480697385, "grad_norm": 0.6965700820659818, "learning_rate": 3.225857550115162e-07, "loss": 1.5171, "step": 1479 }, { "epoch": 0.307181403071814, "grad_norm": 0.7194809493418677, "learning_rate": 3.2248929066748906e-07, "loss": 1.6061, "step": 1480 }, { "epoch": 0.3073889580738896, "grad_norm": 0.7097689446817526, "learning_rate": 3.2239278274773644e-07, "loss": 1.4629, "step": 1481 }, { "epoch": 0.30759651307596514, "grad_norm": 0.8917484868027727, "learning_rate": 3.2229623129329104e-07, "loss": 1.552, "step": 1482 }, { "epoch": 0.30780406807804067, "grad_norm": 0.7241034861127932, "learning_rate": 3.2219963634520385e-07, "loss": 1.5119, "step": 1483 }, { "epoch": 0.3080116230801162, "grad_norm": 0.7168320259652754, "learning_rate": 3.221029979445445e-07, "loss": 1.5064, "step": 1484 }, { "epoch": 0.3082191780821918, "grad_norm": 0.7963308947496888, "learning_rate": 3.2200631613240114e-07, "loss": 1.5255, "step": 1485 }, { "epoch": 0.3084267330842673, "grad_norm": 0.6938457222888774, "learning_rate": 3.219095909498803e-07, "loss": 1.5677, "step": 1486 }, { "epoch": 0.30863428808634286, "grad_norm": 1.2338821644468418, "learning_rate": 3.218128224381069e-07, "loss": 1.5321, "step": 1487 }, { "epoch": 0.30884184308841844, "grad_norm": 0.6823592784556315, "learning_rate": 3.217160106382244e-07, "loss": 1.5544, "step": 1488 }, { "epoch": 0.309049398090494, "grad_norm": 0.8444088039991224, "learning_rate": 3.216191555913946e-07, "loss": 1.4365, "step": 1489 }, { "epoch": 0.3092569530925695, "grad_norm": 1.1429283479613144, "learning_rate": 3.215222573387976e-07, "loss": 1.5496, "step": 1490 }, { "epoch": 0.3094645080946451, "grad_norm": 1.0426805694076446, "learning_rate": 3.214253159216321e-07, "loss": 1.5383, "step": 1491 }, { "epoch": 0.30967206309672063, "grad_norm": 0.9045859135706596, "learning_rate": 3.213283313811149e-07, "loss": 1.5721, "step": 1492 }, { "epoch": 0.30987961809879616, "grad_norm": 0.6478698967547891, "learning_rate": 3.2123130375848136e-07, "loss": 1.5669, "step": 1493 }, { "epoch": 0.31008717310087175, "grad_norm": 1.2844247912688618, "learning_rate": 3.21134233094985e-07, "loss": 1.5101, "step": 1494 }, { "epoch": 0.3102947281029473, "grad_norm": 1.26847834702015, "learning_rate": 3.210371194318977e-07, "loss": 1.5249, "step": 1495 }, { "epoch": 0.3105022831050228, "grad_norm": 0.8212468144770589, "learning_rate": 3.2093996281050956e-07, "loss": 1.5571, "step": 1496 }, { "epoch": 0.3107098381070984, "grad_norm": 0.9878428196111082, "learning_rate": 3.2084276327212905e-07, "loss": 1.5576, "step": 1497 }, { "epoch": 0.31091739310917393, "grad_norm": 0.8026416846886552, "learning_rate": 3.207455208580828e-07, "loss": 1.5075, "step": 1498 }, { "epoch": 0.31112494811124947, "grad_norm": 0.6570688907094375, "learning_rate": 3.2064823560971587e-07, "loss": 1.4921, "step": 1499 }, { "epoch": 0.31133250311332505, "grad_norm": 0.6857331667772839, "learning_rate": 3.2055090756839103e-07, "loss": 1.5331, "step": 1500 }, { "epoch": 0.3115400581154006, "grad_norm": 0.9833241251790387, "learning_rate": 3.204535367754899e-07, "loss": 1.4873, "step": 1501 }, { "epoch": 0.3117476131174761, "grad_norm": 0.8292374552917077, "learning_rate": 3.203561232724118e-07, "loss": 1.5424, "step": 1502 }, { "epoch": 0.3119551681195517, "grad_norm": 0.8069331256009402, "learning_rate": 3.202586671005743e-07, "loss": 1.5921, "step": 1503 }, { "epoch": 0.31216272312162724, "grad_norm": 0.6086234165403009, "learning_rate": 3.201611683014133e-07, "loss": 1.5202, "step": 1504 }, { "epoch": 0.31237027812370277, "grad_norm": 0.8160269642831041, "learning_rate": 3.200636269163827e-07, "loss": 1.5802, "step": 1505 }, { "epoch": 0.3125778331257783, "grad_norm": 0.7076949765826489, "learning_rate": 3.1996604298695444e-07, "loss": 1.5407, "step": 1506 }, { "epoch": 0.3127853881278539, "grad_norm": 0.7648446115921089, "learning_rate": 3.198684165546187e-07, "loss": 1.4971, "step": 1507 }, { "epoch": 0.3129929431299294, "grad_norm": 1.723302421532576, "learning_rate": 3.1977074766088355e-07, "loss": 1.5415, "step": 1508 }, { "epoch": 0.31320049813200496, "grad_norm": 0.8775475948486927, "learning_rate": 3.1967303634727525e-07, "loss": 1.5314, "step": 1509 }, { "epoch": 0.31340805313408054, "grad_norm": 0.8967103397146254, "learning_rate": 3.195752826553381e-07, "loss": 1.5252, "step": 1510 }, { "epoch": 0.3136156081361561, "grad_norm": 0.6926574208637071, "learning_rate": 3.194774866266343e-07, "loss": 1.6335, "step": 1511 }, { "epoch": 0.3138231631382316, "grad_norm": 0.6361002593920386, "learning_rate": 3.193796483027442e-07, "loss": 1.5409, "step": 1512 }, { "epoch": 0.3140307181403072, "grad_norm": 0.6470691998681614, "learning_rate": 3.1928176772526597e-07, "loss": 1.5559, "step": 1513 }, { "epoch": 0.31423827314238273, "grad_norm": 0.9979400111895484, "learning_rate": 3.1918384493581603e-07, "loss": 1.5492, "step": 1514 }, { "epoch": 0.31444582814445826, "grad_norm": 0.7579857391826024, "learning_rate": 3.1908587997602824e-07, "loss": 1.5712, "step": 1515 }, { "epoch": 0.31465338314653385, "grad_norm": 1.2558704077900733, "learning_rate": 3.189878728875549e-07, "loss": 1.624, "step": 1516 }, { "epoch": 0.3148609381486094, "grad_norm": 0.6426516950425566, "learning_rate": 3.1888982371206604e-07, "loss": 1.5013, "step": 1517 }, { "epoch": 0.3150684931506849, "grad_norm": 0.6837986299827858, "learning_rate": 3.187917324912494e-07, "loss": 1.5526, "step": 1518 }, { "epoch": 0.3152760481527605, "grad_norm": 0.7238030627995922, "learning_rate": 3.1869359926681097e-07, "loss": 1.5368, "step": 1519 }, { "epoch": 0.31548360315483603, "grad_norm": 0.7227540923543799, "learning_rate": 3.1859542408047435e-07, "loss": 1.5626, "step": 1520 }, { "epoch": 0.31569115815691157, "grad_norm": 0.7174582378218057, "learning_rate": 3.1849720697398093e-07, "loss": 1.5067, "step": 1521 }, { "epoch": 0.31589871315898715, "grad_norm": 0.8053684446427255, "learning_rate": 3.1839894798909e-07, "loss": 1.6452, "step": 1522 }, { "epoch": 0.3161062681610627, "grad_norm": 1.25638468654692, "learning_rate": 3.1830064716757875e-07, "loss": 1.529, "step": 1523 }, { "epoch": 0.3163138231631382, "grad_norm": 0.7546442315082738, "learning_rate": 3.182023045512421e-07, "loss": 1.5167, "step": 1524 }, { "epoch": 0.3165213781652138, "grad_norm": 0.7048190227954557, "learning_rate": 3.181039201818926e-07, "loss": 1.5397, "step": 1525 }, { "epoch": 0.31672893316728934, "grad_norm": 0.7912242124900313, "learning_rate": 3.180054941013608e-07, "loss": 1.5004, "step": 1526 }, { "epoch": 0.31693648816936487, "grad_norm": 0.7566673249850133, "learning_rate": 3.1790702635149483e-07, "loss": 1.4853, "step": 1527 }, { "epoch": 0.3171440431714404, "grad_norm": 1.0302422292310887, "learning_rate": 3.178085169741605e-07, "loss": 1.5521, "step": 1528 }, { "epoch": 0.317351598173516, "grad_norm": 1.5335784294377346, "learning_rate": 3.177099660112414e-07, "loss": 1.5057, "step": 1529 }, { "epoch": 0.3175591531755915, "grad_norm": 0.7161477842631548, "learning_rate": 3.1761137350463883e-07, "loss": 1.4984, "step": 1530 }, { "epoch": 0.31776670817766706, "grad_norm": 1.9737898514115628, "learning_rate": 3.175127394962717e-07, "loss": 1.5586, "step": 1531 }, { "epoch": 0.31797426317974264, "grad_norm": 0.8116458110557672, "learning_rate": 3.1741406402807655e-07, "loss": 1.4885, "step": 1532 }, { "epoch": 0.3181818181818182, "grad_norm": 1.0863419919213408, "learning_rate": 3.173153471420076e-07, "loss": 1.515, "step": 1533 }, { "epoch": 0.3183893731838937, "grad_norm": 0.7121551013623241, "learning_rate": 3.172165888800365e-07, "loss": 1.5156, "step": 1534 }, { "epoch": 0.3185969281859693, "grad_norm": 0.6970222090298633, "learning_rate": 3.171177892841528e-07, "loss": 1.5274, "step": 1535 }, { "epoch": 0.31880448318804483, "grad_norm": 0.7122464096822435, "learning_rate": 3.170189483963635e-07, "loss": 1.6166, "step": 1536 }, { "epoch": 0.31901203819012036, "grad_norm": 0.6782791764309692, "learning_rate": 3.169200662586931e-07, "loss": 1.4854, "step": 1537 }, { "epoch": 0.31921959319219595, "grad_norm": 0.7654414523252254, "learning_rate": 3.168211429131835e-07, "loss": 1.5325, "step": 1538 }, { "epoch": 0.3194271481942715, "grad_norm": 0.6722686560973263, "learning_rate": 3.1672217840189443e-07, "loss": 1.4301, "step": 1539 }, { "epoch": 0.319634703196347, "grad_norm": 0.8152407686796453, "learning_rate": 3.166231727669029e-07, "loss": 1.5538, "step": 1540 }, { "epoch": 0.3198422581984226, "grad_norm": 0.7463891481452275, "learning_rate": 3.165241260503035e-07, "loss": 1.536, "step": 1541 }, { "epoch": 0.32004981320049813, "grad_norm": 0.7667119659520718, "learning_rate": 3.164250382942083e-07, "loss": 1.5675, "step": 1542 }, { "epoch": 0.32025736820257367, "grad_norm": 0.8600403298533835, "learning_rate": 3.163259095407468e-07, "loss": 1.5936, "step": 1543 }, { "epoch": 0.32046492320464925, "grad_norm": 0.843974281229249, "learning_rate": 3.1622673983206577e-07, "loss": 1.5806, "step": 1544 }, { "epoch": 0.3206724782067248, "grad_norm": 0.8409252655849362, "learning_rate": 3.161275292103297e-07, "loss": 1.5206, "step": 1545 }, { "epoch": 0.3208800332088003, "grad_norm": 1.3683881215793297, "learning_rate": 3.160282777177203e-07, "loss": 1.5176, "step": 1546 }, { "epoch": 0.3210875882108759, "grad_norm": 0.8530317832654084, "learning_rate": 3.1592898539643653e-07, "loss": 1.546, "step": 1547 }, { "epoch": 0.32129514321295144, "grad_norm": 1.3046243530778758, "learning_rate": 3.15829652288695e-07, "loss": 1.5187, "step": 1548 }, { "epoch": 0.32150269821502697, "grad_norm": 0.6626824338689945, "learning_rate": 3.157302784367294e-07, "loss": 1.5824, "step": 1549 }, { "epoch": 0.32171025321710256, "grad_norm": 0.7150204682433556, "learning_rate": 3.156308638827909e-07, "loss": 1.5273, "step": 1550 }, { "epoch": 0.3219178082191781, "grad_norm": 0.7082587380156864, "learning_rate": 3.15531408669148e-07, "loss": 1.5714, "step": 1551 }, { "epoch": 0.3221253632212536, "grad_norm": 0.8129077176326319, "learning_rate": 3.1543191283808633e-07, "loss": 1.5416, "step": 1552 }, { "epoch": 0.32233291822332916, "grad_norm": 0.6643762418379271, "learning_rate": 3.153323764319088e-07, "loss": 1.5461, "step": 1553 }, { "epoch": 0.32254047322540474, "grad_norm": 2.719052829084681, "learning_rate": 3.1523279949293584e-07, "loss": 1.5452, "step": 1554 }, { "epoch": 0.3227480282274803, "grad_norm": 0.7982673759207699, "learning_rate": 3.151331820635048e-07, "loss": 1.4871, "step": 1555 }, { "epoch": 0.3229555832295558, "grad_norm": 1.4882366877931574, "learning_rate": 3.1503352418597043e-07, "loss": 1.5315, "step": 1556 }, { "epoch": 0.3231631382316314, "grad_norm": 0.7444536583231158, "learning_rate": 3.149338259027045e-07, "loss": 1.5269, "step": 1557 }, { "epoch": 0.32337069323370693, "grad_norm": 1.538861324119853, "learning_rate": 3.1483408725609615e-07, "loss": 1.4852, "step": 1558 }, { "epoch": 0.32357824823578246, "grad_norm": 0.6568849312673052, "learning_rate": 3.1473430828855164e-07, "loss": 1.5304, "step": 1559 }, { "epoch": 0.32378580323785805, "grad_norm": 1.1182090435908014, "learning_rate": 3.146344890424943e-07, "loss": 1.5594, "step": 1560 }, { "epoch": 0.3239933582399336, "grad_norm": 0.7766036895438667, "learning_rate": 3.145346295603646e-07, "loss": 1.4441, "step": 1561 }, { "epoch": 0.3242009132420091, "grad_norm": 0.9127264212547783, "learning_rate": 3.144347298846202e-07, "loss": 1.5319, "step": 1562 }, { "epoch": 0.3244084682440847, "grad_norm": 0.6593738763657883, "learning_rate": 3.1433479005773567e-07, "loss": 1.5741, "step": 1563 }, { "epoch": 0.32461602324616023, "grad_norm": 0.8911329221554665, "learning_rate": 3.142348101222029e-07, "loss": 1.4774, "step": 1564 }, { "epoch": 0.32482357824823577, "grad_norm": 1.1799757042561831, "learning_rate": 3.1413479012053065e-07, "loss": 1.5811, "step": 1565 }, { "epoch": 0.32503113325031135, "grad_norm": 30.82941877357409, "learning_rate": 3.140347300952448e-07, "loss": 1.6183, "step": 1566 }, { "epoch": 0.3252386882523869, "grad_norm": 0.8686183829245135, "learning_rate": 3.139346300888882e-07, "loss": 1.5173, "step": 1567 }, { "epoch": 0.3254462432544624, "grad_norm": 0.934343222546769, "learning_rate": 3.138344901440207e-07, "loss": 1.4974, "step": 1568 }, { "epoch": 0.325653798256538, "grad_norm": 0.8288251586647396, "learning_rate": 3.137343103032191e-07, "loss": 1.4969, "step": 1569 }, { "epoch": 0.32586135325861354, "grad_norm": 1.1756031358058125, "learning_rate": 3.1363409060907735e-07, "loss": 1.5287, "step": 1570 }, { "epoch": 0.32606890826068907, "grad_norm": 0.6872982056877541, "learning_rate": 3.1353383110420607e-07, "loss": 1.5368, "step": 1571 }, { "epoch": 0.32627646326276466, "grad_norm": 0.7173370722988562, "learning_rate": 3.13433531831233e-07, "loss": 1.518, "step": 1572 }, { "epoch": 0.3264840182648402, "grad_norm": 0.7077151170311493, "learning_rate": 3.1333319283280274e-07, "loss": 1.6073, "step": 1573 }, { "epoch": 0.3266915732669157, "grad_norm": 0.8533943399033741, "learning_rate": 3.1323281415157665e-07, "loss": 1.4611, "step": 1574 }, { "epoch": 0.32689912826899126, "grad_norm": 0.7216180828932892, "learning_rate": 3.1313239583023327e-07, "loss": 1.5433, "step": 1575 }, { "epoch": 0.32710668327106684, "grad_norm": 0.7657188170898415, "learning_rate": 3.1303193791146767e-07, "loss": 1.5461, "step": 1576 }, { "epoch": 0.3273142382731424, "grad_norm": 0.7547586499023344, "learning_rate": 3.129314404379919e-07, "loss": 1.5458, "step": 1577 }, { "epoch": 0.3275217932752179, "grad_norm": 0.6492722320495253, "learning_rate": 3.1283090345253494e-07, "loss": 1.449, "step": 1578 }, { "epoch": 0.3277293482772935, "grad_norm": 0.694932859316317, "learning_rate": 3.1273032699784223e-07, "loss": 1.5038, "step": 1579 }, { "epoch": 0.32793690327936903, "grad_norm": 0.760421015961153, "learning_rate": 3.1262971111667643e-07, "loss": 1.4807, "step": 1580 }, { "epoch": 0.32814445828144456, "grad_norm": 1.0312798410817425, "learning_rate": 3.125290558518166e-07, "loss": 1.5945, "step": 1581 }, { "epoch": 0.32835201328352015, "grad_norm": 1.7143441712325553, "learning_rate": 3.1242836124605866e-07, "loss": 1.4721, "step": 1582 }, { "epoch": 0.3285595682855957, "grad_norm": 0.6655217244718277, "learning_rate": 3.123276273422155e-07, "loss": 1.4986, "step": 1583 }, { "epoch": 0.3287671232876712, "grad_norm": 0.8346966955789498, "learning_rate": 3.1222685418311625e-07, "loss": 1.5709, "step": 1584 }, { "epoch": 0.3289746782897468, "grad_norm": 0.6331326576478944, "learning_rate": 3.121260418116071e-07, "loss": 1.5167, "step": 1585 }, { "epoch": 0.32918223329182233, "grad_norm": 0.7412456391611238, "learning_rate": 3.120251902705508e-07, "loss": 1.5368, "step": 1586 }, { "epoch": 0.32938978829389787, "grad_norm": 0.6753891380104349, "learning_rate": 3.1192429960282666e-07, "loss": 1.5062, "step": 1587 }, { "epoch": 0.32959734329597346, "grad_norm": 0.6640327679259239, "learning_rate": 3.1182336985133083e-07, "loss": 1.5371, "step": 1588 }, { "epoch": 0.329804898298049, "grad_norm": 1.1952829517448376, "learning_rate": 3.1172240105897596e-07, "loss": 1.5394, "step": 1589 }, { "epoch": 0.3300124533001245, "grad_norm": 0.9634946214688638, "learning_rate": 3.116213932686912e-07, "loss": 1.6199, "step": 1590 }, { "epoch": 0.3302200083022001, "grad_norm": 0.8110358709993737, "learning_rate": 3.1152034652342243e-07, "loss": 1.5702, "step": 1591 }, { "epoch": 0.33042756330427564, "grad_norm": 0.6553060442490959, "learning_rate": 3.114192608661321e-07, "loss": 1.5057, "step": 1592 }, { "epoch": 0.3306351183063512, "grad_norm": 0.624511976827147, "learning_rate": 3.1131813633979905e-07, "loss": 1.5156, "step": 1593 }, { "epoch": 0.33084267330842676, "grad_norm": 3.9508105574571273, "learning_rate": 3.1121697298741874e-07, "loss": 1.5963, "step": 1594 }, { "epoch": 0.3310502283105023, "grad_norm": 0.6582180790077864, "learning_rate": 3.1111577085200323e-07, "loss": 1.4956, "step": 1595 }, { "epoch": 0.3312577833125778, "grad_norm": 1.522087814498903, "learning_rate": 3.1101452997658097e-07, "loss": 1.5421, "step": 1596 }, { "epoch": 0.33146533831465336, "grad_norm": 0.7153593782532924, "learning_rate": 3.109132504041968e-07, "loss": 1.4977, "step": 1597 }, { "epoch": 0.33167289331672895, "grad_norm": 0.79503716445836, "learning_rate": 3.1081193217791226e-07, "loss": 1.5354, "step": 1598 }, { "epoch": 0.3318804483188045, "grad_norm": 1.2461131906558136, "learning_rate": 3.10710575340805e-07, "loss": 1.4358, "step": 1599 }, { "epoch": 0.33208800332088, "grad_norm": 0.718938286134833, "learning_rate": 3.1060917993596933e-07, "loss": 1.4774, "step": 1600 }, { "epoch": 0.3322955583229556, "grad_norm": 0.68334836747786, "learning_rate": 3.105077460065159e-07, "loss": 1.5268, "step": 1601 }, { "epoch": 0.33250311332503113, "grad_norm": 2.343952333780335, "learning_rate": 3.1040627359557175e-07, "loss": 1.4907, "step": 1602 }, { "epoch": 0.33271066832710666, "grad_norm": 0.7640612952723437, "learning_rate": 3.103047627462802e-07, "loss": 1.4855, "step": 1603 }, { "epoch": 0.33291822332918225, "grad_norm": 0.7532144797063448, "learning_rate": 3.102032135018009e-07, "loss": 1.5227, "step": 1604 }, { "epoch": 0.3331257783312578, "grad_norm": 1.386706642060875, "learning_rate": 3.101016259053101e-07, "loss": 1.4513, "step": 1605 }, { "epoch": 0.3333333333333333, "grad_norm": 0.7132200211075789, "learning_rate": 3.1e-07, "loss": 1.4837, "step": 1606 }, { "epoch": 0.3335408883354089, "grad_norm": 0.7624633968700086, "learning_rate": 3.098983358290792e-07, "loss": 1.4883, "step": 1607 }, { "epoch": 0.33374844333748444, "grad_norm": 1.6305508823080885, "learning_rate": 3.0979663343577277e-07, "loss": 1.5252, "step": 1608 }, { "epoch": 0.33395599833955997, "grad_norm": 1.8227199718129374, "learning_rate": 3.0969489286332174e-07, "loss": 1.5444, "step": 1609 }, { "epoch": 0.33416355334163556, "grad_norm": 0.7052644520628132, "learning_rate": 3.0959311415498345e-07, "loss": 1.4436, "step": 1610 }, { "epoch": 0.3343711083437111, "grad_norm": 0.9998415197954762, "learning_rate": 3.0949129735403165e-07, "loss": 1.5216, "step": 1611 }, { "epoch": 0.3345786633457866, "grad_norm": 1.9674711545113885, "learning_rate": 3.093894425037561e-07, "loss": 1.4963, "step": 1612 }, { "epoch": 0.3347862183478622, "grad_norm": 1.113441702031003, "learning_rate": 3.092875496474627e-07, "loss": 1.4454, "step": 1613 }, { "epoch": 0.33499377334993774, "grad_norm": 0.8016579227198851, "learning_rate": 3.091856188284736e-07, "loss": 1.5209, "step": 1614 }, { "epoch": 0.3352013283520133, "grad_norm": 0.6890212291227804, "learning_rate": 3.090836500901272e-07, "loss": 1.5137, "step": 1615 }, { "epoch": 0.33540888335408886, "grad_norm": 0.8209015007259222, "learning_rate": 3.0898164347577775e-07, "loss": 1.4832, "step": 1616 }, { "epoch": 0.3356164383561644, "grad_norm": 0.7018157065024992, "learning_rate": 3.0887959902879586e-07, "loss": 1.5736, "step": 1617 }, { "epoch": 0.3358239933582399, "grad_norm": 1.4473239754467748, "learning_rate": 3.087775167925681e-07, "loss": 1.6125, "step": 1618 }, { "epoch": 0.33603154836031546, "grad_norm": 0.7490796166398493, "learning_rate": 3.086753968104971e-07, "loss": 1.4745, "step": 1619 }, { "epoch": 0.33623910336239105, "grad_norm": 1.2519146778893293, "learning_rate": 3.085732391260016e-07, "loss": 1.4962, "step": 1620 }, { "epoch": 0.3364466583644666, "grad_norm": 2.8196203071619137, "learning_rate": 3.0847104378251623e-07, "loss": 1.5235, "step": 1621 }, { "epoch": 0.3366542133665421, "grad_norm": 0.815405779431192, "learning_rate": 3.083688108234919e-07, "loss": 1.6065, "step": 1622 }, { "epoch": 0.3368617683686177, "grad_norm": 0.9621634342903317, "learning_rate": 3.082665402923952e-07, "loss": 1.5325, "step": 1623 }, { "epoch": 0.33706932337069323, "grad_norm": 0.8621212106888736, "learning_rate": 3.08164232232709e-07, "loss": 1.4955, "step": 1624 }, { "epoch": 0.33727687837276876, "grad_norm": 0.6612875148662144, "learning_rate": 3.0806188668793176e-07, "loss": 1.5019, "step": 1625 }, { "epoch": 0.33748443337484435, "grad_norm": 0.663738492465499, "learning_rate": 3.079595037015783e-07, "loss": 1.5535, "step": 1626 }, { "epoch": 0.3376919883769199, "grad_norm": 0.9422035390470533, "learning_rate": 3.07857083317179e-07, "loss": 1.4713, "step": 1627 }, { "epoch": 0.3378995433789954, "grad_norm": 0.7883623781541931, "learning_rate": 3.077546255782804e-07, "loss": 1.5072, "step": 1628 }, { "epoch": 0.338107098381071, "grad_norm": 0.7574632719595523, "learning_rate": 3.076521305284447e-07, "loss": 1.5881, "step": 1629 }, { "epoch": 0.33831465338314654, "grad_norm": 0.7536334871385633, "learning_rate": 3.0754959821125017e-07, "loss": 1.496, "step": 1630 }, { "epoch": 0.33852220838522207, "grad_norm": 0.7909478959005793, "learning_rate": 3.074470286702908e-07, "loss": 1.5074, "step": 1631 }, { "epoch": 0.33872976338729766, "grad_norm": 0.8249333290394759, "learning_rate": 3.073444219491764e-07, "loss": 1.5348, "step": 1632 }, { "epoch": 0.3389373183893732, "grad_norm": 1.023051553917948, "learning_rate": 3.072417780915327e-07, "loss": 1.5532, "step": 1633 }, { "epoch": 0.3391448733914487, "grad_norm": 0.8990458627368273, "learning_rate": 3.07139097141001e-07, "loss": 1.4847, "step": 1634 }, { "epoch": 0.3393524283935243, "grad_norm": 0.7114981462728718, "learning_rate": 3.0703637914123864e-07, "loss": 1.4415, "step": 1635 }, { "epoch": 0.33955998339559984, "grad_norm": 0.9838345121038751, "learning_rate": 3.069336241359186e-07, "loss": 1.5449, "step": 1636 }, { "epoch": 0.3397675383976754, "grad_norm": 0.7081896978793626, "learning_rate": 3.068308321687296e-07, "loss": 1.5356, "step": 1637 }, { "epoch": 0.33997509339975096, "grad_norm": 0.6901392950455878, "learning_rate": 3.0672800328337583e-07, "loss": 1.5497, "step": 1638 }, { "epoch": 0.3401826484018265, "grad_norm": 0.728927656555971, "learning_rate": 3.0662513752357767e-07, "loss": 1.5151, "step": 1639 }, { "epoch": 0.340390203403902, "grad_norm": 0.8221644777235805, "learning_rate": 3.0652223493307066e-07, "loss": 1.51, "step": 1640 }, { "epoch": 0.34059775840597756, "grad_norm": 0.6619593327455161, "learning_rate": 3.064192955556066e-07, "loss": 1.5578, "step": 1641 }, { "epoch": 0.34080531340805315, "grad_norm": 0.6711668860031297, "learning_rate": 3.063163194349522e-07, "loss": 1.5519, "step": 1642 }, { "epoch": 0.3410128684101287, "grad_norm": 1.0665924311144357, "learning_rate": 3.062133066148904e-07, "loss": 1.6023, "step": 1643 }, { "epoch": 0.3412204234122042, "grad_norm": 1.0919493056165805, "learning_rate": 3.061102571392195e-07, "loss": 1.6064, "step": 1644 }, { "epoch": 0.3414279784142798, "grad_norm": 0.8579946574964484, "learning_rate": 3.0600717105175327e-07, "loss": 1.5201, "step": 1645 }, { "epoch": 0.34163553341635533, "grad_norm": 0.8572452091318757, "learning_rate": 3.059040483963214e-07, "loss": 1.5426, "step": 1646 }, { "epoch": 0.34184308841843086, "grad_norm": 0.6299908917917835, "learning_rate": 3.058008892167687e-07, "loss": 1.5534, "step": 1647 }, { "epoch": 0.34205064342050645, "grad_norm": 1.6094766890804149, "learning_rate": 3.0569769355695575e-07, "loss": 1.4546, "step": 1648 }, { "epoch": 0.342258198422582, "grad_norm": 0.94109579604955, "learning_rate": 3.055944614607587e-07, "loss": 1.5961, "step": 1649 }, { "epoch": 0.3424657534246575, "grad_norm": 0.714434693264507, "learning_rate": 3.054911929720691e-07, "loss": 1.5651, "step": 1650 }, { "epoch": 0.3426733084267331, "grad_norm": 0.824520636839947, "learning_rate": 3.053878881347938e-07, "loss": 1.5642, "step": 1651 }, { "epoch": 0.34288086342880864, "grad_norm": 0.7509265291697873, "learning_rate": 3.052845469928554e-07, "loss": 1.4685, "step": 1652 }, { "epoch": 0.34308841843088417, "grad_norm": 1.0078662464984964, "learning_rate": 3.051811695901918e-07, "loss": 1.444, "step": 1653 }, { "epoch": 0.34329597343295976, "grad_norm": 2.3313540365874497, "learning_rate": 3.0507775597075634e-07, "loss": 1.4941, "step": 1654 }, { "epoch": 0.3435035284350353, "grad_norm": 0.813050234345806, "learning_rate": 3.049743061785177e-07, "loss": 1.5321, "step": 1655 }, { "epoch": 0.3437110834371108, "grad_norm": 0.6732430021714549, "learning_rate": 3.0487082025746007e-07, "loss": 1.4882, "step": 1656 }, { "epoch": 0.3439186384391864, "grad_norm": 0.8195534802277931, "learning_rate": 3.047672982515828e-07, "loss": 1.6275, "step": 1657 }, { "epoch": 0.34412619344126194, "grad_norm": 0.8052939454467235, "learning_rate": 3.046637402049008e-07, "loss": 1.5414, "step": 1658 }, { "epoch": 0.3443337484433375, "grad_norm": 0.9324483000140465, "learning_rate": 3.045601461614442e-07, "loss": 1.5398, "step": 1659 }, { "epoch": 0.34454130344541306, "grad_norm": 0.77879291620623, "learning_rate": 3.044565161652583e-07, "loss": 1.5063, "step": 1660 }, { "epoch": 0.3447488584474886, "grad_norm": 0.8697844337300528, "learning_rate": 3.0435285026040393e-07, "loss": 1.4811, "step": 1661 }, { "epoch": 0.3449564134495641, "grad_norm": 0.7403197810690658, "learning_rate": 3.0424914849095715e-07, "loss": 1.4977, "step": 1662 }, { "epoch": 0.34516396845163966, "grad_norm": 1.1406082873430088, "learning_rate": 3.0414541090100907e-07, "loss": 1.5005, "step": 1663 }, { "epoch": 0.34537152345371525, "grad_norm": 0.8772203904166385, "learning_rate": 3.040416375346662e-07, "loss": 1.5712, "step": 1664 }, { "epoch": 0.3455790784557908, "grad_norm": 1.0097804432803577, "learning_rate": 3.0393782843605025e-07, "loss": 1.4485, "step": 1665 }, { "epoch": 0.3457866334578663, "grad_norm": 0.8945710588706391, "learning_rate": 3.0383398364929807e-07, "loss": 1.4628, "step": 1666 }, { "epoch": 0.3459941884599419, "grad_norm": 1.2166364844098123, "learning_rate": 3.0373010321856164e-07, "loss": 1.554, "step": 1667 }, { "epoch": 0.34620174346201743, "grad_norm": 0.8278050827604245, "learning_rate": 3.0362618718800834e-07, "loss": 1.4815, "step": 1668 }, { "epoch": 0.34640929846409296, "grad_norm": 0.6830986360349536, "learning_rate": 3.035222356018203e-07, "loss": 1.5209, "step": 1669 }, { "epoch": 0.34661685346616855, "grad_norm": 0.8092350944352389, "learning_rate": 3.034182485041951e-07, "loss": 1.5253, "step": 1670 }, { "epoch": 0.3468244084682441, "grad_norm": 0.8330013459319859, "learning_rate": 3.033142259393453e-07, "loss": 1.5411, "step": 1671 }, { "epoch": 0.3470319634703196, "grad_norm": 0.752721215090041, "learning_rate": 3.0321016795149847e-07, "loss": 1.5835, "step": 1672 }, { "epoch": 0.3472395184723952, "grad_norm": 0.6675975075083221, "learning_rate": 3.0310607458489734e-07, "loss": 1.4351, "step": 1673 }, { "epoch": 0.34744707347447074, "grad_norm": 0.622488184844189, "learning_rate": 3.0300194588379964e-07, "loss": 1.537, "step": 1674 }, { "epoch": 0.34765462847654627, "grad_norm": 0.8522550700196722, "learning_rate": 3.0289778189247816e-07, "loss": 1.5081, "step": 1675 }, { "epoch": 0.34786218347862186, "grad_norm": 0.69898024471361, "learning_rate": 3.0279358265522053e-07, "loss": 1.4991, "step": 1676 }, { "epoch": 0.3480697384806974, "grad_norm": 0.7433784810098976, "learning_rate": 3.026893482163297e-07, "loss": 1.5236, "step": 1677 }, { "epoch": 0.3482772934827729, "grad_norm": 1.0199962591268212, "learning_rate": 3.025850786201232e-07, "loss": 1.5772, "step": 1678 }, { "epoch": 0.3484848484848485, "grad_norm": 0.6550305302333714, "learning_rate": 3.0248077391093384e-07, "loss": 1.5698, "step": 1679 }, { "epoch": 0.34869240348692404, "grad_norm": 0.7776577145060054, "learning_rate": 3.023764341331092e-07, "loss": 1.5546, "step": 1680 }, { "epoch": 0.3488999584889996, "grad_norm": 1.2816873166807066, "learning_rate": 3.0227205933101166e-07, "loss": 1.5292, "step": 1681 }, { "epoch": 0.34910751349107516, "grad_norm": 1.261557960183847, "learning_rate": 3.0216764954901865e-07, "loss": 1.4805, "step": 1682 }, { "epoch": 0.3493150684931507, "grad_norm": 0.6774303188709085, "learning_rate": 3.020632048315226e-07, "loss": 1.5187, "step": 1683 }, { "epoch": 0.3495226234952262, "grad_norm": 2.035685522152664, "learning_rate": 3.019587252229304e-07, "loss": 1.5804, "step": 1684 }, { "epoch": 0.34973017849730176, "grad_norm": 1.1254319215830477, "learning_rate": 3.018542107676642e-07, "loss": 1.5053, "step": 1685 }, { "epoch": 0.34993773349937735, "grad_norm": 0.8275366226382337, "learning_rate": 3.0174966151016064e-07, "loss": 1.5517, "step": 1686 }, { "epoch": 0.3501452885014529, "grad_norm": 0.6859446592128088, "learning_rate": 3.016450774948713e-07, "loss": 1.5276, "step": 1687 }, { "epoch": 0.3503528435035284, "grad_norm": 0.8553843053007176, "learning_rate": 3.0154045876626264e-07, "loss": 1.5983, "step": 1688 }, { "epoch": 0.350560398505604, "grad_norm": 0.869081237501683, "learning_rate": 3.014358053688157e-07, "loss": 1.6072, "step": 1689 }, { "epoch": 0.35076795350767953, "grad_norm": 0.7909907178079894, "learning_rate": 3.013311173470262e-07, "loss": 1.5368, "step": 1690 }, { "epoch": 0.35097550850975506, "grad_norm": 0.6394119879036785, "learning_rate": 3.0122639474540493e-07, "loss": 1.4718, "step": 1691 }, { "epoch": 0.35118306351183065, "grad_norm": 0.9759307754666919, "learning_rate": 3.01121637608477e-07, "loss": 1.5045, "step": 1692 }, { "epoch": 0.3513906185139062, "grad_norm": 0.8695526425880886, "learning_rate": 3.0101684598078244e-07, "loss": 1.5284, "step": 1693 }, { "epoch": 0.3515981735159817, "grad_norm": 0.7888488891482031, "learning_rate": 3.0091201990687586e-07, "loss": 1.5402, "step": 1694 }, { "epoch": 0.3518057285180573, "grad_norm": 0.6291405641894751, "learning_rate": 3.0080715943132646e-07, "loss": 1.5051, "step": 1695 }, { "epoch": 0.35201328352013284, "grad_norm": 0.7103108192927109, "learning_rate": 3.007022645987182e-07, "loss": 1.5028, "step": 1696 }, { "epoch": 0.35222083852220837, "grad_norm": 0.7627372914788422, "learning_rate": 3.005973354536496e-07, "loss": 1.5835, "step": 1697 }, { "epoch": 0.35242839352428396, "grad_norm": 0.725042565721643, "learning_rate": 3.004923720407336e-07, "loss": 1.5028, "step": 1698 }, { "epoch": 0.3526359485263595, "grad_norm": 0.8297371250754101, "learning_rate": 3.00387374404598e-07, "loss": 1.5794, "step": 1699 }, { "epoch": 0.352843503528435, "grad_norm": 1.247375235901272, "learning_rate": 3.0028234258988503e-07, "loss": 1.501, "step": 1700 }, { "epoch": 0.3530510585305106, "grad_norm": 0.7979091860000276, "learning_rate": 3.001772766412513e-07, "loss": 1.5222, "step": 1701 }, { "epoch": 0.35325861353258614, "grad_norm": 0.7936083419914423, "learning_rate": 3.0007217660336816e-07, "loss": 1.5614, "step": 1702 }, { "epoch": 0.3534661685346617, "grad_norm": 0.8009676053899694, "learning_rate": 2.9996704252092137e-07, "loss": 1.5254, "step": 1703 }, { "epoch": 0.35367372353673726, "grad_norm": 1.043574240180602, "learning_rate": 2.9986187443861103e-07, "loss": 1.5266, "step": 1704 }, { "epoch": 0.3538812785388128, "grad_norm": 0.7379523630474946, "learning_rate": 2.997566724011519e-07, "loss": 1.5135, "step": 1705 }, { "epoch": 0.3540888335408883, "grad_norm": 0.6707427472142247, "learning_rate": 2.996514364532731e-07, "loss": 1.598, "step": 1706 }, { "epoch": 0.35429638854296386, "grad_norm": 0.7105516763227429, "learning_rate": 2.995461666397181e-07, "loss": 1.472, "step": 1707 }, { "epoch": 0.35450394354503945, "grad_norm": 0.9825429304315128, "learning_rate": 2.9944086300524493e-07, "loss": 1.5137, "step": 1708 }, { "epoch": 0.354711498547115, "grad_norm": 0.8266271207201454, "learning_rate": 2.9933552559462586e-07, "loss": 1.5352, "step": 1709 }, { "epoch": 0.3549190535491905, "grad_norm": 0.8420143656260886, "learning_rate": 2.9923015445264746e-07, "loss": 1.5245, "step": 1710 }, { "epoch": 0.3551266085512661, "grad_norm": 0.9319206561434165, "learning_rate": 2.991247496241109e-07, "loss": 1.5242, "step": 1711 }, { "epoch": 0.35533416355334163, "grad_norm": 0.7284856736795287, "learning_rate": 2.9901931115383143e-07, "loss": 1.5128, "step": 1712 }, { "epoch": 0.35554171855541716, "grad_norm": 1.1061317297106878, "learning_rate": 2.989138390866388e-07, "loss": 1.5403, "step": 1713 }, { "epoch": 0.35574927355749275, "grad_norm": 0.8702660793802259, "learning_rate": 2.9880833346737664e-07, "loss": 1.4661, "step": 1714 }, { "epoch": 0.3559568285595683, "grad_norm": 3.1766649706898957, "learning_rate": 2.9870279434090346e-07, "loss": 1.5005, "step": 1715 }, { "epoch": 0.3561643835616438, "grad_norm": 0.7205636259189994, "learning_rate": 2.9859722175209153e-07, "loss": 1.5109, "step": 1716 }, { "epoch": 0.3563719385637194, "grad_norm": 0.6283346875682314, "learning_rate": 2.984916157458275e-07, "loss": 1.4877, "step": 1717 }, { "epoch": 0.35657949356579494, "grad_norm": 1.6563760549803141, "learning_rate": 2.983859763670123e-07, "loss": 1.5204, "step": 1718 }, { "epoch": 0.35678704856787047, "grad_norm": 1.1189558043, "learning_rate": 2.9828030366056106e-07, "loss": 1.5492, "step": 1719 }, { "epoch": 0.35699460356994606, "grad_norm": 0.7216092395063066, "learning_rate": 2.9817459767140286e-07, "loss": 1.5167, "step": 1720 }, { "epoch": 0.3572021585720216, "grad_norm": 0.7441271563187284, "learning_rate": 2.980688584444812e-07, "loss": 1.4984, "step": 1721 }, { "epoch": 0.3574097135740971, "grad_norm": 0.6889872910853054, "learning_rate": 2.979630860247535e-07, "loss": 1.5651, "step": 1722 }, { "epoch": 0.3576172685761727, "grad_norm": 0.6704762098524681, "learning_rate": 2.978572804571914e-07, "loss": 1.4289, "step": 1723 }, { "epoch": 0.35782482357824824, "grad_norm": 0.958615776488211, "learning_rate": 2.977514417867807e-07, "loss": 1.5443, "step": 1724 }, { "epoch": 0.3580323785803238, "grad_norm": 0.7406615626568862, "learning_rate": 2.9764557005852113e-07, "loss": 1.4956, "step": 1725 }, { "epoch": 0.35823993358239936, "grad_norm": 0.8771154807976586, "learning_rate": 2.9753966531742645e-07, "loss": 1.4966, "step": 1726 }, { "epoch": 0.3584474885844749, "grad_norm": 0.7483783007618476, "learning_rate": 2.974337276085248e-07, "loss": 1.5266, "step": 1727 }, { "epoch": 0.3586550435865504, "grad_norm": 0.754628660201579, "learning_rate": 2.973277569768578e-07, "loss": 1.5666, "step": 1728 }, { "epoch": 0.35886259858862596, "grad_norm": 0.8747559113838734, "learning_rate": 2.972217534674815e-07, "loss": 1.6055, "step": 1729 }, { "epoch": 0.35907015359070155, "grad_norm": 1.116113725806758, "learning_rate": 2.971157171254658e-07, "loss": 1.5103, "step": 1730 }, { "epoch": 0.3592777085927771, "grad_norm": 0.7861015005845284, "learning_rate": 2.970096479958944e-07, "loss": 1.6098, "step": 1731 }, { "epoch": 0.3594852635948526, "grad_norm": 0.9617823634146951, "learning_rate": 2.969035461238652e-07, "loss": 1.5086, "step": 1732 }, { "epoch": 0.3596928185969282, "grad_norm": 0.961114820817053, "learning_rate": 2.9679741155448983e-07, "loss": 1.4787, "step": 1733 }, { "epoch": 0.35990037359900373, "grad_norm": 0.855547225438654, "learning_rate": 2.9669124433289396e-07, "loss": 1.5384, "step": 1734 }, { "epoch": 0.36010792860107926, "grad_norm": 0.7805276087870763, "learning_rate": 2.96585044504217e-07, "loss": 1.5389, "step": 1735 }, { "epoch": 0.36031548360315485, "grad_norm": 0.8136550380958589, "learning_rate": 2.9647881211361237e-07, "loss": 1.5576, "step": 1736 }, { "epoch": 0.3605230386052304, "grad_norm": 0.66497298643468, "learning_rate": 2.963725472062472e-07, "loss": 1.454, "step": 1737 }, { "epoch": 0.3607305936073059, "grad_norm": 0.6276381689953632, "learning_rate": 2.962662498273026e-07, "loss": 1.4555, "step": 1738 }, { "epoch": 0.3609381486093815, "grad_norm": 0.6614671787854753, "learning_rate": 2.9615992002197325e-07, "loss": 1.5379, "step": 1739 }, { "epoch": 0.36114570361145704, "grad_norm": 0.8525112543558966, "learning_rate": 2.9605355783546787e-07, "loss": 1.5815, "step": 1740 }, { "epoch": 0.36135325861353257, "grad_norm": 0.7129837198729112, "learning_rate": 2.959471633130088e-07, "loss": 1.4539, "step": 1741 }, { "epoch": 0.36156081361560816, "grad_norm": 0.9130059709848856, "learning_rate": 2.958407364998322e-07, "loss": 1.4706, "step": 1742 }, { "epoch": 0.3617683686176837, "grad_norm": 0.8027343517265195, "learning_rate": 2.957342774411878e-07, "loss": 1.5139, "step": 1743 }, { "epoch": 0.3619759236197592, "grad_norm": 0.7397254986736014, "learning_rate": 2.956277861823394e-07, "loss": 1.5687, "step": 1744 }, { "epoch": 0.3621834786218348, "grad_norm": 0.9829383375014513, "learning_rate": 2.9552126276856404e-07, "loss": 1.5216, "step": 1745 }, { "epoch": 0.36239103362391034, "grad_norm": 0.6952958906345301, "learning_rate": 2.954147072451527e-07, "loss": 1.4976, "step": 1746 }, { "epoch": 0.3625985886259859, "grad_norm": 0.6378002366491281, "learning_rate": 2.9530811965741003e-07, "loss": 1.577, "step": 1747 }, { "epoch": 0.36280614362806146, "grad_norm": 0.6219270941064697, "learning_rate": 2.9520150005065414e-07, "loss": 1.4592, "step": 1748 }, { "epoch": 0.363013698630137, "grad_norm": 0.8451233477653258, "learning_rate": 2.9509484847021704e-07, "loss": 1.5592, "step": 1749 }, { "epoch": 0.3632212536322125, "grad_norm": 1.2275080116382218, "learning_rate": 2.9498816496144394e-07, "loss": 1.5534, "step": 1750 }, { "epoch": 0.36342880863428806, "grad_norm": 1.1182092905517573, "learning_rate": 2.9488144956969394e-07, "loss": 1.4897, "step": 1751 }, { "epoch": 0.36363636363636365, "grad_norm": 0.7721664941862921, "learning_rate": 2.947747023403396e-07, "loss": 1.5315, "step": 1752 }, { "epoch": 0.3638439186384392, "grad_norm": 0.7818740065443556, "learning_rate": 2.946679233187669e-07, "loss": 1.5258, "step": 1753 }, { "epoch": 0.3640514736405147, "grad_norm": 0.7051419859263878, "learning_rate": 2.9456111255037556e-07, "loss": 1.5029, "step": 1754 }, { "epoch": 0.3642590286425903, "grad_norm": 0.747064840265756, "learning_rate": 2.944542700805787e-07, "loss": 1.4954, "step": 1755 }, { "epoch": 0.36446658364466583, "grad_norm": 0.707961041524801, "learning_rate": 2.943473959548028e-07, "loss": 1.5186, "step": 1756 }, { "epoch": 0.36467413864674136, "grad_norm": 0.884401846257606, "learning_rate": 2.942404902184879e-07, "loss": 1.4792, "step": 1757 }, { "epoch": 0.36488169364881695, "grad_norm": 0.7312295811536884, "learning_rate": 2.941335529170876e-07, "loss": 1.4889, "step": 1758 }, { "epoch": 0.3650892486508925, "grad_norm": 1.0756486533974505, "learning_rate": 2.940265840960687e-07, "loss": 1.5116, "step": 1759 }, { "epoch": 0.365296803652968, "grad_norm": 3.969806928744894, "learning_rate": 2.939195838009116e-07, "loss": 1.5481, "step": 1760 }, { "epoch": 0.3655043586550436, "grad_norm": 7.209525118202673, "learning_rate": 2.938125520771099e-07, "loss": 1.5164, "step": 1761 }, { "epoch": 0.36571191365711914, "grad_norm": 0.7153874088280192, "learning_rate": 2.937054889701706e-07, "loss": 1.4693, "step": 1762 }, { "epoch": 0.36591946865919467, "grad_norm": 0.850598206314216, "learning_rate": 2.9359839452561426e-07, "loss": 1.5628, "step": 1763 }, { "epoch": 0.36612702366127026, "grad_norm": 0.8585994069962113, "learning_rate": 2.934912687889744e-07, "loss": 1.4284, "step": 1764 }, { "epoch": 0.3663345786633458, "grad_norm": 1.0627420158289687, "learning_rate": 2.933841118057982e-07, "loss": 1.5067, "step": 1765 }, { "epoch": 0.3665421336654213, "grad_norm": 1.1408889396014805, "learning_rate": 2.932769236216459e-07, "loss": 1.5725, "step": 1766 }, { "epoch": 0.3667496886674969, "grad_norm": 0.7987928242629375, "learning_rate": 2.9316970428209104e-07, "loss": 1.5193, "step": 1767 }, { "epoch": 0.36695724366957244, "grad_norm": 0.7225197506249644, "learning_rate": 2.930624538327205e-07, "loss": 1.5201, "step": 1768 }, { "epoch": 0.367164798671648, "grad_norm": 0.8497929654463315, "learning_rate": 2.9295517231913423e-07, "loss": 1.5435, "step": 1769 }, { "epoch": 0.36737235367372356, "grad_norm": 0.6124953585769698, "learning_rate": 2.928478597869456e-07, "loss": 1.5074, "step": 1770 }, { "epoch": 0.3675799086757991, "grad_norm": 0.8525349756883807, "learning_rate": 2.927405162817809e-07, "loss": 1.5803, "step": 1771 }, { "epoch": 0.3677874636778746, "grad_norm": 3.031277381744521, "learning_rate": 2.9263314184927987e-07, "loss": 1.559, "step": 1772 }, { "epoch": 0.3679950186799502, "grad_norm": 0.6602677669129073, "learning_rate": 2.925257365350952e-07, "loss": 1.4774, "step": 1773 }, { "epoch": 0.36820257368202575, "grad_norm": 0.6551522227737647, "learning_rate": 2.9241830038489293e-07, "loss": 1.5337, "step": 1774 }, { "epoch": 0.3684101286841013, "grad_norm": 0.8985469487129819, "learning_rate": 2.9231083344435185e-07, "loss": 1.4968, "step": 1775 }, { "epoch": 0.3686176836861768, "grad_norm": 0.6463507985368129, "learning_rate": 2.9220333575916414e-07, "loss": 1.5401, "step": 1776 }, { "epoch": 0.3688252386882524, "grad_norm": 1.0489510793874743, "learning_rate": 2.920958073750349e-07, "loss": 1.4496, "step": 1777 }, { "epoch": 0.36903279369032793, "grad_norm": 0.9971658252145071, "learning_rate": 2.9198824833768245e-07, "loss": 1.4645, "step": 1778 }, { "epoch": 0.36924034869240346, "grad_norm": 0.6773521974366477, "learning_rate": 2.91880658692838e-07, "loss": 1.4861, "step": 1779 }, { "epoch": 0.36944790369447905, "grad_norm": 0.797381790314639, "learning_rate": 2.9177303848624576e-07, "loss": 1.4955, "step": 1780 }, { "epoch": 0.3696554586965546, "grad_norm": 0.7243669833038442, "learning_rate": 2.9166538776366305e-07, "loss": 1.5523, "step": 1781 }, { "epoch": 0.3698630136986301, "grad_norm": 0.8073564724124318, "learning_rate": 2.915577065708601e-07, "loss": 1.4681, "step": 1782 }, { "epoch": 0.3700705687007057, "grad_norm": 0.7486457762552564, "learning_rate": 2.9144999495361993e-07, "loss": 1.492, "step": 1783 }, { "epoch": 0.37027812370278124, "grad_norm": 0.73969202222816, "learning_rate": 2.913422529577389e-07, "loss": 1.5026, "step": 1784 }, { "epoch": 0.37048567870485677, "grad_norm": 0.7982331611979465, "learning_rate": 2.91234480629026e-07, "loss": 1.4837, "step": 1785 }, { "epoch": 0.37069323370693236, "grad_norm": 1.433331735456798, "learning_rate": 2.91126678013303e-07, "loss": 1.5643, "step": 1786 }, { "epoch": 0.3709007887090079, "grad_norm": 0.6966227458426625, "learning_rate": 2.9101884515640486e-07, "loss": 1.5435, "step": 1787 }, { "epoch": 0.3711083437110834, "grad_norm": 1.1889136660738293, "learning_rate": 2.909109821041792e-07, "loss": 1.48, "step": 1788 }, { "epoch": 0.371315898713159, "grad_norm": 1.2852233645627547, "learning_rate": 2.9080308890248646e-07, "loss": 1.4536, "step": 1789 }, { "epoch": 0.37152345371523454, "grad_norm": 0.7116942681505585, "learning_rate": 2.906951655972001e-07, "loss": 1.449, "step": 1790 }, { "epoch": 0.3717310087173101, "grad_norm": 0.7306489325876637, "learning_rate": 2.905872122342062e-07, "loss": 1.5153, "step": 1791 }, { "epoch": 0.37193856371938566, "grad_norm": 0.6744167899478379, "learning_rate": 2.904792288594036e-07, "loss": 1.5627, "step": 1792 }, { "epoch": 0.3721461187214612, "grad_norm": 0.9241052132026019, "learning_rate": 2.9037121551870406e-07, "loss": 1.4503, "step": 1793 }, { "epoch": 0.3723536737235367, "grad_norm": 0.6830456692624526, "learning_rate": 2.902631722580319e-07, "loss": 1.5196, "step": 1794 }, { "epoch": 0.3725612287256123, "grad_norm": 0.7699863146828911, "learning_rate": 2.9015509912332425e-07, "loss": 1.4719, "step": 1795 }, { "epoch": 0.37276878372768785, "grad_norm": 0.9407019799499728, "learning_rate": 2.9004699616053094e-07, "loss": 1.5128, "step": 1796 }, { "epoch": 0.3729763387297634, "grad_norm": 0.7331956559267344, "learning_rate": 2.899388634156146e-07, "loss": 1.5367, "step": 1797 }, { "epoch": 0.3731838937318389, "grad_norm": 1.5790529025933957, "learning_rate": 2.8983070093455024e-07, "loss": 1.475, "step": 1798 }, { "epoch": 0.3733914487339145, "grad_norm": 0.9006149220411652, "learning_rate": 2.8972250876332573e-07, "loss": 1.5965, "step": 1799 }, { "epoch": 0.37359900373599003, "grad_norm": 1.3755555683212195, "learning_rate": 2.8961428694794156e-07, "loss": 1.535, "step": 1800 }, { "epoch": 0.37380655873806556, "grad_norm": 0.7353782961824376, "learning_rate": 2.8950603553441073e-07, "loss": 1.417, "step": 1801 }, { "epoch": 0.37401411374014115, "grad_norm": 1.0600416247613942, "learning_rate": 2.893977545687589e-07, "loss": 1.54, "step": 1802 }, { "epoch": 0.3742216687422167, "grad_norm": 0.8104960875457892, "learning_rate": 2.8928944409702414e-07, "loss": 1.5143, "step": 1803 }, { "epoch": 0.3744292237442922, "grad_norm": 0.747597815078873, "learning_rate": 2.891811041652574e-07, "loss": 1.5215, "step": 1804 }, { "epoch": 0.3746367787463678, "grad_norm": 2.5472919710932693, "learning_rate": 2.890727348195217e-07, "loss": 1.5684, "step": 1805 }, { "epoch": 0.37484433374844334, "grad_norm": 0.6653594327993576, "learning_rate": 2.88964336105893e-07, "loss": 1.4674, "step": 1806 }, { "epoch": 0.37505188875051887, "grad_norm": 0.7682038926295779, "learning_rate": 2.888559080704595e-07, "loss": 1.5117, "step": 1807 }, { "epoch": 0.37525944375259446, "grad_norm": 0.7781683832905401, "learning_rate": 2.8874745075932184e-07, "loss": 1.5887, "step": 1808 }, { "epoch": 0.37546699875467, "grad_norm": 0.7733843964903611, "learning_rate": 2.886389642185934e-07, "loss": 1.6289, "step": 1809 }, { "epoch": 0.3756745537567455, "grad_norm": 0.7425122137061511, "learning_rate": 2.8853044849439946e-07, "loss": 1.546, "step": 1810 }, { "epoch": 0.3758821087588211, "grad_norm": 0.7630938308509255, "learning_rate": 2.884219036328783e-07, "loss": 1.5082, "step": 1811 }, { "epoch": 0.37608966376089664, "grad_norm": 0.7544870434777171, "learning_rate": 2.883133296801802e-07, "loss": 1.51, "step": 1812 }, { "epoch": 0.3762972187629722, "grad_norm": 0.701721632843437, "learning_rate": 2.8820472668246794e-07, "loss": 1.5119, "step": 1813 }, { "epoch": 0.37650477376504776, "grad_norm": 0.8288652246980378, "learning_rate": 2.880960946859166e-07, "loss": 1.4584, "step": 1814 }, { "epoch": 0.3767123287671233, "grad_norm": 0.6831096352226285, "learning_rate": 2.8798743373671366e-07, "loss": 1.5326, "step": 1815 }, { "epoch": 0.3769198837691988, "grad_norm": 0.7037553431320874, "learning_rate": 2.878787438810589e-07, "loss": 1.5215, "step": 1816 }, { "epoch": 0.3771274387712744, "grad_norm": 0.9937775764151584, "learning_rate": 2.8777002516516425e-07, "loss": 1.4809, "step": 1817 }, { "epoch": 0.37733499377334995, "grad_norm": 0.8753637791110365, "learning_rate": 2.8766127763525417e-07, "loss": 1.4619, "step": 1818 }, { "epoch": 0.3775425487754255, "grad_norm": 1.381331686046734, "learning_rate": 2.875525013375651e-07, "loss": 1.4532, "step": 1819 }, { "epoch": 0.377750103777501, "grad_norm": 5.828872507809322, "learning_rate": 2.874436963183459e-07, "loss": 1.4929, "step": 1820 }, { "epoch": 0.3779576587795766, "grad_norm": 0.8000148440892989, "learning_rate": 2.873348626238575e-07, "loss": 1.5261, "step": 1821 }, { "epoch": 0.37816521378165213, "grad_norm": 1.4545838082783766, "learning_rate": 2.8722600030037314e-07, "loss": 1.5992, "step": 1822 }, { "epoch": 0.37837276878372766, "grad_norm": 0.7454331605890411, "learning_rate": 2.871171093941782e-07, "loss": 1.4785, "step": 1823 }, { "epoch": 0.37858032378580325, "grad_norm": 0.8913187537978086, "learning_rate": 2.870081899515703e-07, "loss": 1.5635, "step": 1824 }, { "epoch": 0.3787878787878788, "grad_norm": 2.1580098516173813, "learning_rate": 2.8689924201885894e-07, "loss": 1.4963, "step": 1825 }, { "epoch": 0.3789954337899543, "grad_norm": 1.2800966783503454, "learning_rate": 2.8679026564236596e-07, "loss": 1.5661, "step": 1826 }, { "epoch": 0.3792029887920299, "grad_norm": 0.6756002911824516, "learning_rate": 2.8668126086842523e-07, "loss": 1.4652, "step": 1827 }, { "epoch": 0.37941054379410544, "grad_norm": 1.3902061654048554, "learning_rate": 2.865722277433826e-07, "loss": 1.552, "step": 1828 }, { "epoch": 0.37961809879618097, "grad_norm": 3.58459254832797, "learning_rate": 2.864631663135962e-07, "loss": 1.5641, "step": 1829 }, { "epoch": 0.37982565379825656, "grad_norm": 0.6543936366061491, "learning_rate": 2.863540766254359e-07, "loss": 1.5607, "step": 1830 }, { "epoch": 0.3800332088003321, "grad_norm": 0.6606988767087205, "learning_rate": 2.862449587252839e-07, "loss": 1.5429, "step": 1831 }, { "epoch": 0.3802407638024076, "grad_norm": 1.3028334828837254, "learning_rate": 2.861358126595341e-07, "loss": 1.5575, "step": 1832 }, { "epoch": 0.3804483188044832, "grad_norm": 0.878861104681702, "learning_rate": 2.860266384745925e-07, "loss": 1.5603, "step": 1833 }, { "epoch": 0.38065587380655874, "grad_norm": 0.7172936096231933, "learning_rate": 2.859174362168773e-07, "loss": 1.5112, "step": 1834 }, { "epoch": 0.3808634288086343, "grad_norm": 0.672063637043049, "learning_rate": 2.8580820593281816e-07, "loss": 1.5849, "step": 1835 }, { "epoch": 0.38107098381070986, "grad_norm": 0.7124377997873157, "learning_rate": 2.8569894766885694e-07, "loss": 1.5265, "step": 1836 }, { "epoch": 0.3812785388127854, "grad_norm": 0.7413567235389821, "learning_rate": 2.8558966147144736e-07, "loss": 1.5913, "step": 1837 }, { "epoch": 0.3814860938148609, "grad_norm": 0.9272380461197702, "learning_rate": 2.8548034738705507e-07, "loss": 1.5424, "step": 1838 }, { "epoch": 0.3816936488169365, "grad_norm": 1.8804828007492838, "learning_rate": 2.853710054621574e-07, "loss": 1.5533, "step": 1839 }, { "epoch": 0.38190120381901205, "grad_norm": 0.7636573500978951, "learning_rate": 2.852616357432438e-07, "loss": 1.5569, "step": 1840 }, { "epoch": 0.3821087588210876, "grad_norm": 1.1772095457271807, "learning_rate": 2.851522382768153e-07, "loss": 1.592, "step": 1841 }, { "epoch": 0.3823163138231631, "grad_norm": 0.7493612440366139, "learning_rate": 2.8504281310938467e-07, "loss": 1.5642, "step": 1842 }, { "epoch": 0.3825238688252387, "grad_norm": 0.9451664195050361, "learning_rate": 2.849333602874768e-07, "loss": 1.5107, "step": 1843 }, { "epoch": 0.38273142382731423, "grad_norm": 0.6392717217497093, "learning_rate": 2.848238798576279e-07, "loss": 1.5097, "step": 1844 }, { "epoch": 0.38293897882938976, "grad_norm": 1.1872601313715576, "learning_rate": 2.8471437186638637e-07, "loss": 1.4252, "step": 1845 }, { "epoch": 0.38314653383146535, "grad_norm": 0.6530782862274989, "learning_rate": 2.846048363603119e-07, "loss": 1.4733, "step": 1846 }, { "epoch": 0.3833540888335409, "grad_norm": 0.8818869721313423, "learning_rate": 2.844952733859763e-07, "loss": 1.6101, "step": 1847 }, { "epoch": 0.3835616438356164, "grad_norm": 1.9324674517805251, "learning_rate": 2.8438568298996264e-07, "loss": 1.5288, "step": 1848 }, { "epoch": 0.383769198837692, "grad_norm": 0.6803426081958092, "learning_rate": 2.842760652188658e-07, "loss": 1.5297, "step": 1849 }, { "epoch": 0.38397675383976754, "grad_norm": 0.7028778133935468, "learning_rate": 2.841664201192926e-07, "loss": 1.5611, "step": 1850 }, { "epoch": 0.38418430884184307, "grad_norm": 0.6692229665364089, "learning_rate": 2.84056747737861e-07, "loss": 1.4909, "step": 1851 }, { "epoch": 0.38439186384391866, "grad_norm": 0.83404468757173, "learning_rate": 2.83947048121201e-07, "loss": 1.5241, "step": 1852 }, { "epoch": 0.3845994188459942, "grad_norm": 0.7449454132251969, "learning_rate": 2.838373213159537e-07, "loss": 1.5242, "step": 1853 }, { "epoch": 0.3848069738480697, "grad_norm": 0.7572259514250277, "learning_rate": 2.8372756736877223e-07, "loss": 1.577, "step": 1854 }, { "epoch": 0.3850145288501453, "grad_norm": 0.7814337178458187, "learning_rate": 2.83617786326321e-07, "loss": 1.5791, "step": 1855 }, { "epoch": 0.38522208385222084, "grad_norm": 0.9747877083420584, "learning_rate": 2.8350797823527595e-07, "loss": 1.5588, "step": 1856 }, { "epoch": 0.3854296388542964, "grad_norm": 1.0560047301941708, "learning_rate": 2.8339814314232467e-07, "loss": 1.4953, "step": 1857 }, { "epoch": 0.38563719385637196, "grad_norm": 1.0473786774544744, "learning_rate": 2.832882810941659e-07, "loss": 1.5532, "step": 1858 }, { "epoch": 0.3858447488584475, "grad_norm": 0.967706897548344, "learning_rate": 2.8317839213751036e-07, "loss": 1.559, "step": 1859 }, { "epoch": 0.386052303860523, "grad_norm": 1.1212019870201413, "learning_rate": 2.830684763190797e-07, "loss": 1.4703, "step": 1860 }, { "epoch": 0.3862598588625986, "grad_norm": 0.8705750913419449, "learning_rate": 2.829585336856073e-07, "loss": 1.5377, "step": 1861 }, { "epoch": 0.38646741386467415, "grad_norm": 0.8890857236355498, "learning_rate": 2.8284856428383783e-07, "loss": 1.4978, "step": 1862 }, { "epoch": 0.3866749688667497, "grad_norm": 0.7373074560343306, "learning_rate": 2.827385681605273e-07, "loss": 1.5216, "step": 1863 }, { "epoch": 0.3868825238688252, "grad_norm": 0.7450661492329643, "learning_rate": 2.8262854536244333e-07, "loss": 1.4973, "step": 1864 }, { "epoch": 0.3870900788709008, "grad_norm": 0.6472947892747584, "learning_rate": 2.8251849593636444e-07, "loss": 1.5811, "step": 1865 }, { "epoch": 0.38729763387297633, "grad_norm": 0.8515492354658755, "learning_rate": 2.8240841992908093e-07, "loss": 1.5378, "step": 1866 }, { "epoch": 0.38750518887505186, "grad_norm": 1.3033141826532952, "learning_rate": 2.822983173873941e-07, "loss": 1.5013, "step": 1867 }, { "epoch": 0.38771274387712745, "grad_norm": 0.9766285733196798, "learning_rate": 2.8218818835811664e-07, "loss": 1.6529, "step": 1868 }, { "epoch": 0.387920298879203, "grad_norm": 0.6403998153684116, "learning_rate": 2.820780328880725e-07, "loss": 1.583, "step": 1869 }, { "epoch": 0.3881278538812785, "grad_norm": 0.773413876447274, "learning_rate": 2.8196785102409683e-07, "loss": 1.4987, "step": 1870 }, { "epoch": 0.3883354088833541, "grad_norm": 1.7284876751288045, "learning_rate": 2.81857642813036e-07, "loss": 1.5257, "step": 1871 }, { "epoch": 0.38854296388542964, "grad_norm": 0.7333806711353849, "learning_rate": 2.8174740830174777e-07, "loss": 1.5563, "step": 1872 }, { "epoch": 0.38875051888750517, "grad_norm": 1.001070885952592, "learning_rate": 2.8163714753710084e-07, "loss": 1.5177, "step": 1873 }, { "epoch": 0.38895807388958076, "grad_norm": 0.8328437223603238, "learning_rate": 2.815268605659751e-07, "loss": 1.5247, "step": 1874 }, { "epoch": 0.3891656288916563, "grad_norm": 0.7583358914287197, "learning_rate": 2.814165474352617e-07, "loss": 1.4546, "step": 1875 }, { "epoch": 0.3893731838937318, "grad_norm": 0.7124746928026011, "learning_rate": 2.8130620819186284e-07, "loss": 1.507, "step": 1876 }, { "epoch": 0.3895807388958074, "grad_norm": 0.9292279294119694, "learning_rate": 2.811958428826918e-07, "loss": 1.492, "step": 1877 }, { "epoch": 0.38978829389788294, "grad_norm": 0.665360962298018, "learning_rate": 2.810854515546731e-07, "loss": 1.5523, "step": 1878 }, { "epoch": 0.3899958488999585, "grad_norm": 0.9000411119834659, "learning_rate": 2.8097503425474215e-07, "loss": 1.5552, "step": 1879 }, { "epoch": 0.39020340390203406, "grad_norm": 0.8156011417190381, "learning_rate": 2.808645910298454e-07, "loss": 1.5059, "step": 1880 }, { "epoch": 0.3904109589041096, "grad_norm": 0.6787718400614382, "learning_rate": 2.807541219269404e-07, "loss": 1.4733, "step": 1881 }, { "epoch": 0.3906185139061851, "grad_norm": 0.7672358766353656, "learning_rate": 2.8064362699299565e-07, "loss": 1.5063, "step": 1882 }, { "epoch": 0.3908260689082607, "grad_norm": 0.6275728138528871, "learning_rate": 2.805331062749907e-07, "loss": 1.5565, "step": 1883 }, { "epoch": 0.39103362391033625, "grad_norm": 0.9776960823012801, "learning_rate": 2.8042255981991607e-07, "loss": 1.5652, "step": 1884 }, { "epoch": 0.3912411789124118, "grad_norm": 0.6852027388058879, "learning_rate": 2.8031198767477314e-07, "loss": 1.5819, "step": 1885 }, { "epoch": 0.3914487339144873, "grad_norm": 0.9193455622299851, "learning_rate": 2.8020138988657424e-07, "loss": 1.5018, "step": 1886 }, { "epoch": 0.3916562889165629, "grad_norm": 0.777764201613916, "learning_rate": 2.800907665023426e-07, "loss": 1.5728, "step": 1887 }, { "epoch": 0.39186384391863843, "grad_norm": 0.7520658019294972, "learning_rate": 2.7998011756911233e-07, "loss": 1.5694, "step": 1888 }, { "epoch": 0.39207139892071396, "grad_norm": 0.6706414072368084, "learning_rate": 2.798694431339285e-07, "loss": 1.5159, "step": 1889 }, { "epoch": 0.39227895392278955, "grad_norm": 0.9576753099157093, "learning_rate": 2.797587432438468e-07, "loss": 1.5203, "step": 1890 }, { "epoch": 0.3924865089248651, "grad_norm": 0.7720234725496633, "learning_rate": 2.796480179459341e-07, "loss": 1.4727, "step": 1891 }, { "epoch": 0.3926940639269406, "grad_norm": 1.0040258258328383, "learning_rate": 2.795372672872677e-07, "loss": 1.5072, "step": 1892 }, { "epoch": 0.3929016189290162, "grad_norm": 0.8197918622449006, "learning_rate": 2.7942649131493583e-07, "loss": 1.5386, "step": 1893 }, { "epoch": 0.39310917393109174, "grad_norm": 0.8466677779127744, "learning_rate": 2.793156900760376e-07, "loss": 1.5824, "step": 1894 }, { "epoch": 0.39331672893316727, "grad_norm": 0.9189457018412223, "learning_rate": 2.7920486361768265e-07, "loss": 1.4849, "step": 1895 }, { "epoch": 0.39352428393524286, "grad_norm": 1.8072961179005242, "learning_rate": 2.7909401198699147e-07, "loss": 1.5832, "step": 1896 }, { "epoch": 0.3937318389373184, "grad_norm": 1.3560238159837612, "learning_rate": 2.789831352310953e-07, "loss": 1.5138, "step": 1897 }, { "epoch": 0.3939393939393939, "grad_norm": 1.2251198407633839, "learning_rate": 2.788722333971359e-07, "loss": 1.5635, "step": 1898 }, { "epoch": 0.3941469489414695, "grad_norm": 0.7103893180844721, "learning_rate": 2.7876130653226583e-07, "loss": 1.5849, "step": 1899 }, { "epoch": 0.39435450394354504, "grad_norm": 0.6425399457519182, "learning_rate": 2.786503546836482e-07, "loss": 1.5406, "step": 1900 }, { "epoch": 0.3945620589456206, "grad_norm": 0.8424368335860033, "learning_rate": 2.7853937789845703e-07, "loss": 1.5178, "step": 1901 }, { "epoch": 0.39476961394769616, "grad_norm": 0.7443914766213149, "learning_rate": 2.7842837622387634e-07, "loss": 1.559, "step": 1902 }, { "epoch": 0.3949771689497717, "grad_norm": 0.9345212891903224, "learning_rate": 2.7831734970710124e-07, "loss": 1.5163, "step": 1903 }, { "epoch": 0.3951847239518472, "grad_norm": 1.101698639321631, "learning_rate": 2.7820629839533735e-07, "loss": 1.5418, "step": 1904 }, { "epoch": 0.3953922789539228, "grad_norm": 0.7369936160309475, "learning_rate": 2.7809522233580067e-07, "loss": 1.5049, "step": 1905 }, { "epoch": 0.39559983395599835, "grad_norm": 0.82063854054892, "learning_rate": 2.779841215757178e-07, "loss": 1.476, "step": 1906 }, { "epoch": 0.3958073889580739, "grad_norm": 1.2804337591301285, "learning_rate": 2.7787299616232587e-07, "loss": 1.5276, "step": 1907 }, { "epoch": 0.3960149439601494, "grad_norm": 0.7294762129030232, "learning_rate": 2.777618461428723e-07, "loss": 1.5479, "step": 1908 }, { "epoch": 0.396222498962225, "grad_norm": 0.6339456556839771, "learning_rate": 2.7765067156461533e-07, "loss": 1.5121, "step": 1909 }, { "epoch": 0.39643005396430053, "grad_norm": 0.9371184268948276, "learning_rate": 2.775394724748233e-07, "loss": 1.568, "step": 1910 }, { "epoch": 0.39663760896637607, "grad_norm": 1.1074910091988812, "learning_rate": 2.774282489207752e-07, "loss": 1.5305, "step": 1911 }, { "epoch": 0.39684516396845165, "grad_norm": 1.3944087746190492, "learning_rate": 2.773170009497602e-07, "loss": 1.498, "step": 1912 }, { "epoch": 0.3970527189705272, "grad_norm": 0.7427476163230735, "learning_rate": 2.7720572860907825e-07, "loss": 1.546, "step": 1913 }, { "epoch": 0.3972602739726027, "grad_norm": 0.7726285525939837, "learning_rate": 2.770944319460391e-07, "loss": 1.5231, "step": 1914 }, { "epoch": 0.3974678289746783, "grad_norm": 0.798868942757301, "learning_rate": 2.769831110079632e-07, "loss": 1.5249, "step": 1915 }, { "epoch": 0.39767538397675384, "grad_norm": 1.0416876829616981, "learning_rate": 2.7687176584218137e-07, "loss": 1.507, "step": 1916 }, { "epoch": 0.39788293897882937, "grad_norm": 0.9686546628175399, "learning_rate": 2.767603964960346e-07, "loss": 1.503, "step": 1917 }, { "epoch": 0.39809049398090496, "grad_norm": 0.8320277908681334, "learning_rate": 2.766490030168742e-07, "loss": 1.5501, "step": 1918 }, { "epoch": 0.3982980489829805, "grad_norm": 3.0362071168980322, "learning_rate": 2.765375854520616e-07, "loss": 1.5429, "step": 1919 }, { "epoch": 0.398505603985056, "grad_norm": 2.383344141689591, "learning_rate": 2.764261438489686e-07, "loss": 1.4693, "step": 1920 }, { "epoch": 0.3987131589871316, "grad_norm": 0.9330224023721052, "learning_rate": 2.763146782549773e-07, "loss": 1.5399, "step": 1921 }, { "epoch": 0.39892071398920714, "grad_norm": 0.6333921145130824, "learning_rate": 2.7620318871747986e-07, "loss": 1.4676, "step": 1922 }, { "epoch": 0.3991282689912827, "grad_norm": 0.7247784637507091, "learning_rate": 2.7609167528387877e-07, "loss": 1.6343, "step": 1923 }, { "epoch": 0.39933582399335826, "grad_norm": 0.7223408769449733, "learning_rate": 2.7598013800158637e-07, "loss": 1.531, "step": 1924 }, { "epoch": 0.3995433789954338, "grad_norm": 0.9010314709398364, "learning_rate": 2.758685769180256e-07, "loss": 1.599, "step": 1925 }, { "epoch": 0.39975093399750933, "grad_norm": 0.6944061389856688, "learning_rate": 2.7575699208062914e-07, "loss": 1.4845, "step": 1926 }, { "epoch": 0.3999584889995849, "grad_norm": 0.8364905591527232, "learning_rate": 2.7564538353683984e-07, "loss": 1.553, "step": 1927 }, { "epoch": 0.40016604400166045, "grad_norm": 0.7529657059424397, "learning_rate": 2.7553375133411075e-07, "loss": 1.5283, "step": 1928 }, { "epoch": 0.400373599003736, "grad_norm": 0.7190180928510772, "learning_rate": 2.7542209551990495e-07, "loss": 1.5077, "step": 1929 }, { "epoch": 0.4005811540058115, "grad_norm": 0.6925289951992526, "learning_rate": 2.7531041614169556e-07, "loss": 1.5486, "step": 1930 }, { "epoch": 0.4007887090078871, "grad_norm": 0.7915887904090121, "learning_rate": 2.751987132469656e-07, "loss": 1.5124, "step": 1931 }, { "epoch": 0.40099626400996263, "grad_norm": 0.7627898228572834, "learning_rate": 2.750869868832082e-07, "loss": 1.5642, "step": 1932 }, { "epoch": 0.40120381901203817, "grad_norm": 2.3036117094911464, "learning_rate": 2.7497523709792656e-07, "loss": 1.5635, "step": 1933 }, { "epoch": 0.40141137401411375, "grad_norm": 1.1773529869650565, "learning_rate": 2.7486346393863345e-07, "loss": 1.5154, "step": 1934 }, { "epoch": 0.4016189290161893, "grad_norm": 0.9991890160932483, "learning_rate": 2.747516674528521e-07, "loss": 1.5523, "step": 1935 }, { "epoch": 0.4018264840182648, "grad_norm": 0.7512122792523664, "learning_rate": 2.7463984768811533e-07, "loss": 1.5325, "step": 1936 }, { "epoch": 0.4020340390203404, "grad_norm": 0.8008766142155629, "learning_rate": 2.745280046919659e-07, "loss": 1.5534, "step": 1937 }, { "epoch": 0.40224159402241594, "grad_norm": 0.939625216040225, "learning_rate": 2.7441613851195657e-07, "loss": 1.5254, "step": 1938 }, { "epoch": 0.40244914902449147, "grad_norm": 0.9794917016143329, "learning_rate": 2.7430424919564976e-07, "loss": 1.4973, "step": 1939 }, { "epoch": 0.40265670402656706, "grad_norm": 0.9013638156949378, "learning_rate": 2.7419233679061785e-07, "loss": 1.475, "step": 1940 }, { "epoch": 0.4028642590286426, "grad_norm": 0.7953984772173054, "learning_rate": 2.740804013444431e-07, "loss": 1.4647, "step": 1941 }, { "epoch": 0.4030718140307181, "grad_norm": 0.7251998686131645, "learning_rate": 2.7396844290471745e-07, "loss": 1.5457, "step": 1942 }, { "epoch": 0.4032793690327937, "grad_norm": 0.6345492526265151, "learning_rate": 2.7385646151904264e-07, "loss": 1.4983, "step": 1943 }, { "epoch": 0.40348692403486924, "grad_norm": 1.1620281048635444, "learning_rate": 2.7374445723503024e-07, "loss": 1.5516, "step": 1944 }, { "epoch": 0.4036944790369448, "grad_norm": 0.8362171382145378, "learning_rate": 2.7363243010030143e-07, "loss": 1.5405, "step": 1945 }, { "epoch": 0.40390203403902036, "grad_norm": 1.0620848685033106, "learning_rate": 2.735203801624872e-07, "loss": 1.5088, "step": 1946 }, { "epoch": 0.4041095890410959, "grad_norm": 0.8707960728684162, "learning_rate": 2.7340830746922826e-07, "loss": 1.5262, "step": 1947 }, { "epoch": 0.40431714404317143, "grad_norm": 1.071232075744944, "learning_rate": 2.7329621206817484e-07, "loss": 1.4723, "step": 1948 }, { "epoch": 0.404524699045247, "grad_norm": 0.7257287686710381, "learning_rate": 2.7318409400698695e-07, "loss": 1.5015, "step": 1949 }, { "epoch": 0.40473225404732255, "grad_norm": 0.8668421279176032, "learning_rate": 2.7307195333333434e-07, "loss": 1.5108, "step": 1950 }, { "epoch": 0.4049398090493981, "grad_norm": 0.7627042084590963, "learning_rate": 2.7295979009489613e-07, "loss": 1.5181, "step": 1951 }, { "epoch": 0.4051473640514736, "grad_norm": 0.8092281435814975, "learning_rate": 2.7284760433936116e-07, "loss": 1.5521, "step": 1952 }, { "epoch": 0.4053549190535492, "grad_norm": 1.0549612633542718, "learning_rate": 2.727353961144278e-07, "loss": 1.4943, "step": 1953 }, { "epoch": 0.40556247405562473, "grad_norm": 0.8105043827481816, "learning_rate": 2.726231654678041e-07, "loss": 1.5111, "step": 1954 }, { "epoch": 0.40577002905770027, "grad_norm": 0.7174589377144522, "learning_rate": 2.725109124472075e-07, "loss": 1.4832, "step": 1955 }, { "epoch": 0.40597758405977585, "grad_norm": 0.9245630002901994, "learning_rate": 2.72398637100365e-07, "loss": 1.5428, "step": 1956 }, { "epoch": 0.4061851390618514, "grad_norm": 0.9163491355752184, "learning_rate": 2.7228633947501313e-07, "loss": 1.5254, "step": 1957 }, { "epoch": 0.4063926940639269, "grad_norm": 1.108975707723247, "learning_rate": 2.721740196188978e-07, "loss": 1.5512, "step": 1958 }, { "epoch": 0.4066002490660025, "grad_norm": 0.7944624451383683, "learning_rate": 2.7206167757977453e-07, "loss": 1.5106, "step": 1959 }, { "epoch": 0.40680780406807804, "grad_norm": 5.609741949134925, "learning_rate": 2.719493134054081e-07, "loss": 1.5096, "step": 1960 }, { "epoch": 0.40701535907015357, "grad_norm": 1.1187070928880356, "learning_rate": 2.718369271435728e-07, "loss": 1.5925, "step": 1961 }, { "epoch": 0.40722291407222916, "grad_norm": 1.0546010614843362, "learning_rate": 2.7172451884205216e-07, "loss": 1.5623, "step": 1962 }, { "epoch": 0.4074304690743047, "grad_norm": 0.9062508446385186, "learning_rate": 2.716120885486395e-07, "loss": 1.5115, "step": 1963 }, { "epoch": 0.4076380240763802, "grad_norm": 0.6314903926596144, "learning_rate": 2.714996363111369e-07, "loss": 1.557, "step": 1964 }, { "epoch": 0.4078455790784558, "grad_norm": 0.795655228469788, "learning_rate": 2.713871621773562e-07, "loss": 1.5546, "step": 1965 }, { "epoch": 0.40805313408053134, "grad_norm": 1.370501547675247, "learning_rate": 2.712746661951184e-07, "loss": 1.5506, "step": 1966 }, { "epoch": 0.4082606890826069, "grad_norm": 0.7452116526347807, "learning_rate": 2.7116214841225375e-07, "loss": 1.4974, "step": 1967 }, { "epoch": 0.40846824408468246, "grad_norm": 0.7842228040721118, "learning_rate": 2.710496088766019e-07, "loss": 1.5392, "step": 1968 }, { "epoch": 0.408675799086758, "grad_norm": 0.8855829100805783, "learning_rate": 2.709370476360116e-07, "loss": 1.5472, "step": 1969 }, { "epoch": 0.40888335408883353, "grad_norm": 0.7929509687283204, "learning_rate": 2.70824464738341e-07, "loss": 1.4534, "step": 1970 }, { "epoch": 0.4090909090909091, "grad_norm": 0.69688109019725, "learning_rate": 2.7071186023145736e-07, "loss": 1.6032, "step": 1971 }, { "epoch": 0.40929846409298465, "grad_norm": 0.8672389131683208, "learning_rate": 2.7059923416323694e-07, "loss": 1.5137, "step": 1972 }, { "epoch": 0.4095060190950602, "grad_norm": 2.161778956729785, "learning_rate": 2.704865865815656e-07, "loss": 1.5691, "step": 1973 }, { "epoch": 0.40971357409713577, "grad_norm": 0.9402531109283854, "learning_rate": 2.70373917534338e-07, "loss": 1.5073, "step": 1974 }, { "epoch": 0.4099211290992113, "grad_norm": 0.7410762861775534, "learning_rate": 2.7026122706945796e-07, "loss": 1.5349, "step": 1975 }, { "epoch": 0.41012868410128683, "grad_norm": 0.8531186208854797, "learning_rate": 2.7014851523483854e-07, "loss": 1.478, "step": 1976 }, { "epoch": 0.41033623910336237, "grad_norm": 1.0683833121668473, "learning_rate": 2.7003578207840185e-07, "loss": 1.553, "step": 1977 }, { "epoch": 0.41054379410543795, "grad_norm": 0.7647988716687542, "learning_rate": 2.699230276480789e-07, "loss": 1.4995, "step": 1978 }, { "epoch": 0.4107513491075135, "grad_norm": 0.7178294451724859, "learning_rate": 2.6981025199181e-07, "loss": 1.5437, "step": 1979 }, { "epoch": 0.410958904109589, "grad_norm": 0.7650489216282447, "learning_rate": 2.6969745515754444e-07, "loss": 1.4938, "step": 1980 }, { "epoch": 0.4111664591116646, "grad_norm": 0.8511815549654166, "learning_rate": 2.695846371932402e-07, "loss": 1.4929, "step": 1981 }, { "epoch": 0.41137401411374014, "grad_norm": 0.789166193611733, "learning_rate": 2.694717981468647e-07, "loss": 1.5835, "step": 1982 }, { "epoch": 0.41158156911581567, "grad_norm": 0.9913750998034185, "learning_rate": 2.69358938066394e-07, "loss": 1.5683, "step": 1983 }, { "epoch": 0.41178912411789126, "grad_norm": 0.7305718241286214, "learning_rate": 2.692460569998133e-07, "loss": 1.574, "step": 1984 }, { "epoch": 0.4119966791199668, "grad_norm": 0.7436803249449166, "learning_rate": 2.6913315499511647e-07, "loss": 1.5329, "step": 1985 }, { "epoch": 0.4122042341220423, "grad_norm": 1.0178942563673272, "learning_rate": 2.690202321003067e-07, "loss": 1.4834, "step": 1986 }, { "epoch": 0.4124117891241179, "grad_norm": 0.7376959817155728, "learning_rate": 2.6890728836339545e-07, "loss": 1.5433, "step": 1987 }, { "epoch": 0.41261934412619344, "grad_norm": 0.9555009382855892, "learning_rate": 2.6879432383240376e-07, "loss": 1.5354, "step": 1988 }, { "epoch": 0.412826899128269, "grad_norm": 0.6662939897421363, "learning_rate": 2.68681338555361e-07, "loss": 1.4966, "step": 1989 }, { "epoch": 0.41303445413034456, "grad_norm": 0.843215642797735, "learning_rate": 2.6856833258030536e-07, "loss": 1.4844, "step": 1990 }, { "epoch": 0.4132420091324201, "grad_norm": 0.9991433835316956, "learning_rate": 2.6845530595528426e-07, "loss": 1.5481, "step": 1991 }, { "epoch": 0.41344956413449563, "grad_norm": 0.8435969999898885, "learning_rate": 2.6834225872835343e-07, "loss": 1.4435, "step": 1992 }, { "epoch": 0.4136571191365712, "grad_norm": 0.6791292461181895, "learning_rate": 2.682291909475776e-07, "loss": 1.516, "step": 1993 }, { "epoch": 0.41386467413864675, "grad_norm": 0.9605644876741145, "learning_rate": 2.6811610266103027e-07, "loss": 1.4849, "step": 1994 }, { "epoch": 0.4140722291407223, "grad_norm": 1.0086035047298292, "learning_rate": 2.680029939167934e-07, "loss": 1.4928, "step": 1995 }, { "epoch": 0.41427978414279787, "grad_norm": 0.8368771415626136, "learning_rate": 2.678898647629579e-07, "loss": 1.5075, "step": 1996 }, { "epoch": 0.4144873391448734, "grad_norm": 0.8690980104636004, "learning_rate": 2.6777671524762333e-07, "loss": 1.5928, "step": 1997 }, { "epoch": 0.41469489414694893, "grad_norm": 0.720130372239292, "learning_rate": 2.6766354541889787e-07, "loss": 1.4572, "step": 1998 }, { "epoch": 0.41490244914902447, "grad_norm": 0.7549670726723419, "learning_rate": 2.6755035532489833e-07, "loss": 1.5779, "step": 1999 }, { "epoch": 0.41511000415110005, "grad_norm": 0.9098930657658568, "learning_rate": 2.6743714501375003e-07, "loss": 1.5239, "step": 2000 }, { "epoch": 0.4153175591531756, "grad_norm": 1.2050494538479672, "learning_rate": 2.6732391453358713e-07, "loss": 1.462, "step": 2001 }, { "epoch": 0.4155251141552511, "grad_norm": 0.8173804395085473, "learning_rate": 2.672106639325521e-07, "loss": 1.5721, "step": 2002 }, { "epoch": 0.4157326691573267, "grad_norm": 0.7557662067098537, "learning_rate": 2.670973932587961e-07, "loss": 1.5223, "step": 2003 }, { "epoch": 0.41594022415940224, "grad_norm": 0.8747910090242246, "learning_rate": 2.669841025604789e-07, "loss": 1.5398, "step": 2004 }, { "epoch": 0.41614777916147777, "grad_norm": 1.1885443368681692, "learning_rate": 2.668707918857687e-07, "loss": 1.4637, "step": 2005 }, { "epoch": 0.41635533416355336, "grad_norm": 0.8769914767750531, "learning_rate": 2.66757461282842e-07, "loss": 1.5319, "step": 2006 }, { "epoch": 0.4165628891656289, "grad_norm": 0.789150243313903, "learning_rate": 2.666441107998842e-07, "loss": 1.5372, "step": 2007 }, { "epoch": 0.4167704441677044, "grad_norm": 0.9155065626204896, "learning_rate": 2.665307404850887e-07, "loss": 1.5586, "step": 2008 }, { "epoch": 0.41697799916978, "grad_norm": 0.6258097564708011, "learning_rate": 2.664173503866578e-07, "loss": 1.5735, "step": 2009 }, { "epoch": 0.41718555417185554, "grad_norm": 0.8174737773932771, "learning_rate": 2.6630394055280175e-07, "loss": 1.5316, "step": 2010 }, { "epoch": 0.4173931091739311, "grad_norm": 0.7038536993377364, "learning_rate": 2.6619051103173957e-07, "loss": 1.4509, "step": 2011 }, { "epoch": 0.41760066417600666, "grad_norm": 0.7542523517708427, "learning_rate": 2.660770618716983e-07, "loss": 1.5315, "step": 2012 }, { "epoch": 0.4178082191780822, "grad_norm": 0.8299481166178969, "learning_rate": 2.659635931209137e-07, "loss": 1.5386, "step": 2013 }, { "epoch": 0.41801577418015773, "grad_norm": 0.6961370689734985, "learning_rate": 2.6585010482762946e-07, "loss": 1.4669, "step": 2014 }, { "epoch": 0.4182233291822333, "grad_norm": 0.7680902273366674, "learning_rate": 2.6573659704009794e-07, "loss": 1.5235, "step": 2015 }, { "epoch": 0.41843088418430885, "grad_norm": 0.8476145513507719, "learning_rate": 2.656230698065796e-07, "loss": 1.5182, "step": 2016 }, { "epoch": 0.4186384391863844, "grad_norm": 0.7733746156412612, "learning_rate": 2.655095231753432e-07, "loss": 1.543, "step": 2017 }, { "epoch": 0.41884599418845997, "grad_norm": 1.7757191044600509, "learning_rate": 2.653959571946657e-07, "loss": 1.5023, "step": 2018 }, { "epoch": 0.4190535491905355, "grad_norm": 0.6452839633580992, "learning_rate": 2.652823719128325e-07, "loss": 1.4772, "step": 2019 }, { "epoch": 0.41926110419261103, "grad_norm": 0.7799227453446673, "learning_rate": 2.6516876737813685e-07, "loss": 1.475, "step": 2020 }, { "epoch": 0.41946865919468657, "grad_norm": 0.8006026534273608, "learning_rate": 2.6505514363888056e-07, "loss": 1.5116, "step": 2021 }, { "epoch": 0.41967621419676215, "grad_norm": 1.5066166955921338, "learning_rate": 2.6494150074337324e-07, "loss": 1.5478, "step": 2022 }, { "epoch": 0.4198837691988377, "grad_norm": 0.7222997155125618, "learning_rate": 2.64827838739933e-07, "loss": 1.4958, "step": 2023 }, { "epoch": 0.4200913242009132, "grad_norm": 0.8223025333169117, "learning_rate": 2.647141576768858e-07, "loss": 1.494, "step": 2024 }, { "epoch": 0.4202988792029888, "grad_norm": 0.6656864767856296, "learning_rate": 2.646004576025659e-07, "loss": 1.546, "step": 2025 }, { "epoch": 0.42050643420506434, "grad_norm": 0.7805902033504355, "learning_rate": 2.6448673856531543e-07, "loss": 1.552, "step": 2026 }, { "epoch": 0.42071398920713987, "grad_norm": 1.272531855540796, "learning_rate": 2.6437300061348485e-07, "loss": 1.6448, "step": 2027 }, { "epoch": 0.42092154420921546, "grad_norm": 0.6355549472224202, "learning_rate": 2.642592437954324e-07, "loss": 1.5084, "step": 2028 }, { "epoch": 0.421129099211291, "grad_norm": 1.3291418867193618, "learning_rate": 2.641454681595246e-07, "loss": 1.5703, "step": 2029 }, { "epoch": 0.4213366542133665, "grad_norm": 0.789434902757764, "learning_rate": 2.640316737541356e-07, "loss": 1.4814, "step": 2030 }, { "epoch": 0.4215442092154421, "grad_norm": 1.2506765962226911, "learning_rate": 2.6391786062764794e-07, "loss": 1.4988, "step": 2031 }, { "epoch": 0.42175176421751764, "grad_norm": 1.1450936888881258, "learning_rate": 2.638040288284519e-07, "loss": 1.6556, "step": 2032 }, { "epoch": 0.4219593192195932, "grad_norm": 3.6865029122549933, "learning_rate": 2.6369017840494576e-07, "loss": 1.511, "step": 2033 }, { "epoch": 0.42216687422166876, "grad_norm": 0.8230006227676449, "learning_rate": 2.6357630940553564e-07, "loss": 1.5834, "step": 2034 }, { "epoch": 0.4223744292237443, "grad_norm": 0.9182976130910437, "learning_rate": 2.634624218786356e-07, "loss": 1.4871, "step": 2035 }, { "epoch": 0.42258198422581983, "grad_norm": 0.7335896230065206, "learning_rate": 2.633485158726677e-07, "loss": 1.4475, "step": 2036 }, { "epoch": 0.4227895392278954, "grad_norm": 1.4666424136570828, "learning_rate": 2.632345914360617e-07, "loss": 1.5274, "step": 2037 }, { "epoch": 0.42299709422997095, "grad_norm": 2.4023424201431913, "learning_rate": 2.6312064861725526e-07, "loss": 1.4725, "step": 2038 }, { "epoch": 0.4232046492320465, "grad_norm": 0.7697042861056618, "learning_rate": 2.630066874646938e-07, "loss": 1.5226, "step": 2039 }, { "epoch": 0.42341220423412207, "grad_norm": 0.6845749982480749, "learning_rate": 2.628927080268305e-07, "loss": 1.55, "step": 2040 }, { "epoch": 0.4236197592361976, "grad_norm": 0.7471716141410968, "learning_rate": 2.6277871035212653e-07, "loss": 1.5378, "step": 2041 }, { "epoch": 0.42382731423827313, "grad_norm": 0.7208714626204153, "learning_rate": 2.626646944890507e-07, "loss": 1.4736, "step": 2042 }, { "epoch": 0.42403486924034867, "grad_norm": 3.9840396195222674, "learning_rate": 2.625506604860794e-07, "loss": 1.4753, "step": 2043 }, { "epoch": 0.42424242424242425, "grad_norm": 1.0022284636716088, "learning_rate": 2.624366083916969e-07, "loss": 1.5241, "step": 2044 }, { "epoch": 0.4244499792444998, "grad_norm": 0.706265138873214, "learning_rate": 2.6232253825439515e-07, "loss": 1.4645, "step": 2045 }, { "epoch": 0.4246575342465753, "grad_norm": 0.6291291912454011, "learning_rate": 2.622084501226737e-07, "loss": 1.4578, "step": 2046 }, { "epoch": 0.4248650892486509, "grad_norm": 0.876011593692041, "learning_rate": 2.6209434404503994e-07, "loss": 1.5483, "step": 2047 }, { "epoch": 0.42507264425072644, "grad_norm": 1.0912952966883218, "learning_rate": 2.619802200700085e-07, "loss": 1.493, "step": 2048 }, { "epoch": 0.42528019925280197, "grad_norm": 0.6980180837308734, "learning_rate": 2.618660782461021e-07, "loss": 1.5547, "step": 2049 }, { "epoch": 0.42548775425487756, "grad_norm": 0.9037102877983608, "learning_rate": 2.6175191862185066e-07, "loss": 1.483, "step": 2050 }, { "epoch": 0.4256953092569531, "grad_norm": 0.7792476610232986, "learning_rate": 2.616377412457919e-07, "loss": 1.5847, "step": 2051 }, { "epoch": 0.4259028642590286, "grad_norm": 1.0935208527142122, "learning_rate": 2.61523546166471e-07, "loss": 1.534, "step": 2052 }, { "epoch": 0.4261104192611042, "grad_norm": 0.7221317998016549, "learning_rate": 2.6140933343244057e-07, "loss": 1.4952, "step": 2053 }, { "epoch": 0.42631797426317974, "grad_norm": 0.8560585391970406, "learning_rate": 2.61295103092261e-07, "loss": 1.5385, "step": 2054 }, { "epoch": 0.4265255292652553, "grad_norm": 0.839086834158676, "learning_rate": 2.6118085519449993e-07, "loss": 1.4509, "step": 2055 }, { "epoch": 0.42673308426733086, "grad_norm": 0.6664727499837579, "learning_rate": 2.6106658978773244e-07, "loss": 1.5664, "step": 2056 }, { "epoch": 0.4269406392694064, "grad_norm": 0.6553495126242809, "learning_rate": 2.609523069205413e-07, "loss": 1.4837, "step": 2057 }, { "epoch": 0.42714819427148193, "grad_norm": 0.64964437906978, "learning_rate": 2.608380066415164e-07, "loss": 1.5497, "step": 2058 }, { "epoch": 0.4273557492735575, "grad_norm": 0.6071201962002736, "learning_rate": 2.6072368899925536e-07, "loss": 1.5691, "step": 2059 }, { "epoch": 0.42756330427563305, "grad_norm": 1.5713414629083455, "learning_rate": 2.6060935404236286e-07, "loss": 1.4795, "step": 2060 }, { "epoch": 0.4277708592777086, "grad_norm": 1.1262097489189138, "learning_rate": 2.6049500181945113e-07, "loss": 1.576, "step": 2061 }, { "epoch": 0.42797841427978417, "grad_norm": 0.79002612241337, "learning_rate": 2.603806323791397e-07, "loss": 1.5142, "step": 2062 }, { "epoch": 0.4281859692818597, "grad_norm": 1.4422448117375584, "learning_rate": 2.6026624577005546e-07, "loss": 1.5416, "step": 2063 }, { "epoch": 0.42839352428393523, "grad_norm": 0.7503413597555127, "learning_rate": 2.601518420408325e-07, "loss": 1.5221, "step": 2064 }, { "epoch": 0.42860107928601077, "grad_norm": 0.7857775711499824, "learning_rate": 2.600374212401123e-07, "loss": 1.55, "step": 2065 }, { "epoch": 0.42880863428808635, "grad_norm": 0.6273231122356161, "learning_rate": 2.599229834165436e-07, "loss": 1.5003, "step": 2066 }, { "epoch": 0.4290161892901619, "grad_norm": 1.1792941125455756, "learning_rate": 2.5980852861878213e-07, "loss": 1.4907, "step": 2067 }, { "epoch": 0.4292237442922374, "grad_norm": 0.7812811257183906, "learning_rate": 2.5969405689549113e-07, "loss": 1.4668, "step": 2068 }, { "epoch": 0.429431299294313, "grad_norm": 0.7665258415497694, "learning_rate": 2.59579568295341e-07, "loss": 1.5086, "step": 2069 }, { "epoch": 0.42963885429638854, "grad_norm": 0.6803406900341635, "learning_rate": 2.594650628670092e-07, "loss": 1.5059, "step": 2070 }, { "epoch": 0.42984640929846407, "grad_norm": 0.9726321494992176, "learning_rate": 2.5935054065918047e-07, "loss": 1.4858, "step": 2071 }, { "epoch": 0.43005396430053966, "grad_norm": 1.1539561214168268, "learning_rate": 2.5923600172054645e-07, "loss": 1.5302, "step": 2072 }, { "epoch": 0.4302615193026152, "grad_norm": 0.7474077553400984, "learning_rate": 2.591214460998062e-07, "loss": 1.4986, "step": 2073 }, { "epoch": 0.4304690743046907, "grad_norm": 0.921469828434731, "learning_rate": 2.5900687384566565e-07, "loss": 1.5212, "step": 2074 }, { "epoch": 0.4306766293067663, "grad_norm": 1.1254870648032753, "learning_rate": 2.588922850068379e-07, "loss": 1.5289, "step": 2075 }, { "epoch": 0.43088418430884184, "grad_norm": 2.8821663495470937, "learning_rate": 2.587776796320432e-07, "loss": 1.5083, "step": 2076 }, { "epoch": 0.4310917393109174, "grad_norm": 0.7817431035285098, "learning_rate": 2.586630577700086e-07, "loss": 1.4893, "step": 2077 }, { "epoch": 0.43129929431299296, "grad_norm": 1.3322400918914807, "learning_rate": 2.585484194694682e-07, "loss": 1.5307, "step": 2078 }, { "epoch": 0.4315068493150685, "grad_norm": 0.8341806325564548, "learning_rate": 2.584337647791633e-07, "loss": 1.5426, "step": 2079 }, { "epoch": 0.43171440431714403, "grad_norm": 0.692326746479079, "learning_rate": 2.5831909374784194e-07, "loss": 1.4693, "step": 2080 }, { "epoch": 0.4319219593192196, "grad_norm": 0.6304855163810402, "learning_rate": 2.5820440642425923e-07, "loss": 1.5104, "step": 2081 }, { "epoch": 0.43212951432129515, "grad_norm": 0.6651898945885343, "learning_rate": 2.580897028571772e-07, "loss": 1.5929, "step": 2082 }, { "epoch": 0.4323370693233707, "grad_norm": 0.9950781603408949, "learning_rate": 2.5797498309536474e-07, "loss": 1.5657, "step": 2083 }, { "epoch": 0.43254462432544627, "grad_norm": 1.0425445371864661, "learning_rate": 2.5786024718759763e-07, "loss": 1.5491, "step": 2084 }, { "epoch": 0.4327521793275218, "grad_norm": 0.6945210803612328, "learning_rate": 2.577454951826586e-07, "loss": 1.4589, "step": 2085 }, { "epoch": 0.43295973432959733, "grad_norm": 1.1443682126659054, "learning_rate": 2.5763072712933706e-07, "loss": 1.4681, "step": 2086 }, { "epoch": 0.43316728933167287, "grad_norm": 0.6723891645248855, "learning_rate": 2.575159430764293e-07, "loss": 1.537, "step": 2087 }, { "epoch": 0.43337484433374845, "grad_norm": 0.7076545854764452, "learning_rate": 2.5740114307273867e-07, "loss": 1.5585, "step": 2088 }, { "epoch": 0.433582399335824, "grad_norm": 0.8414062525776694, "learning_rate": 2.5728632716707493e-07, "loss": 1.5487, "step": 2089 }, { "epoch": 0.4337899543378995, "grad_norm": 0.6796298457061739, "learning_rate": 2.5717149540825473e-07, "loss": 1.5151, "step": 2090 }, { "epoch": 0.4339975093399751, "grad_norm": 0.6982565888292679, "learning_rate": 2.570566478451015e-07, "loss": 1.5584, "step": 2091 }, { "epoch": 0.43420506434205064, "grad_norm": 2.2419501741257237, "learning_rate": 2.5694178452644547e-07, "loss": 1.4493, "step": 2092 }, { "epoch": 0.43441261934412617, "grad_norm": 1.2497028443646439, "learning_rate": 2.5682690550112343e-07, "loss": 1.5084, "step": 2093 }, { "epoch": 0.43462017434620176, "grad_norm": 0.6264273778094389, "learning_rate": 2.5671201081797883e-07, "loss": 1.535, "step": 2094 }, { "epoch": 0.4348277293482773, "grad_norm": 0.7809303293601078, "learning_rate": 2.5659710052586185e-07, "loss": 1.5293, "step": 2095 }, { "epoch": 0.4350352843503528, "grad_norm": 0.9091501470396954, "learning_rate": 2.564821746736294e-07, "loss": 1.5058, "step": 2096 }, { "epoch": 0.4352428393524284, "grad_norm": 0.6747206005797378, "learning_rate": 2.563672333101447e-07, "loss": 1.4499, "step": 2097 }, { "epoch": 0.43545039435450394, "grad_norm": 1.2416595502264431, "learning_rate": 2.5625227648427794e-07, "loss": 1.581, "step": 2098 }, { "epoch": 0.4356579493565795, "grad_norm": 0.6952150313047731, "learning_rate": 2.5613730424490574e-07, "loss": 1.5475, "step": 2099 }, { "epoch": 0.43586550435865506, "grad_norm": 0.7732241194368042, "learning_rate": 2.560223166409111e-07, "loss": 1.6035, "step": 2100 }, { "epoch": 0.4360730593607306, "grad_norm": 0.6977680960106218, "learning_rate": 2.5590731372118377e-07, "loss": 1.5819, "step": 2101 }, { "epoch": 0.43628061436280613, "grad_norm": 0.942322246097702, "learning_rate": 2.5579229553461994e-07, "loss": 1.4804, "step": 2102 }, { "epoch": 0.4364881693648817, "grad_norm": 0.9454414137347728, "learning_rate": 2.556772621301223e-07, "loss": 1.4412, "step": 2103 }, { "epoch": 0.43669572436695725, "grad_norm": 0.6823896129596078, "learning_rate": 2.555622135566e-07, "loss": 1.5715, "step": 2104 }, { "epoch": 0.4369032793690328, "grad_norm": 2.5678452853475626, "learning_rate": 2.554471498629685e-07, "loss": 1.5717, "step": 2105 }, { "epoch": 0.43711083437110837, "grad_norm": 1.0413133255808267, "learning_rate": 2.5533207109815004e-07, "loss": 1.5249, "step": 2106 }, { "epoch": 0.4373183893731839, "grad_norm": 0.873315196618782, "learning_rate": 2.55216977311073e-07, "loss": 1.5106, "step": 2107 }, { "epoch": 0.43752594437525943, "grad_norm": 1.2472732961964537, "learning_rate": 2.5510186855067205e-07, "loss": 1.5647, "step": 2108 }, { "epoch": 0.43773349937733497, "grad_norm": 1.1743021102139204, "learning_rate": 2.5498674486588857e-07, "loss": 1.5248, "step": 2109 }, { "epoch": 0.43794105437941055, "grad_norm": 0.6864692264212933, "learning_rate": 2.5487160630567e-07, "loss": 1.4887, "step": 2110 }, { "epoch": 0.4381486093814861, "grad_norm": 1.7098830341219857, "learning_rate": 2.547564529189702e-07, "loss": 1.4521, "step": 2111 }, { "epoch": 0.4383561643835616, "grad_norm": 2.1376306290838745, "learning_rate": 2.5464128475474935e-07, "loss": 1.5057, "step": 2112 }, { "epoch": 0.4385637193856372, "grad_norm": 0.7218449571633895, "learning_rate": 2.54526101861974e-07, "loss": 1.4417, "step": 2113 }, { "epoch": 0.43877127438771274, "grad_norm": 0.7117824999076906, "learning_rate": 2.544109042896166e-07, "loss": 1.5356, "step": 2114 }, { "epoch": 0.43897882938978827, "grad_norm": 0.9469277854892874, "learning_rate": 2.542956920866564e-07, "loss": 1.5012, "step": 2115 }, { "epoch": 0.43918638439186386, "grad_norm": 1.49389575265253, "learning_rate": 2.5418046530207827e-07, "loss": 1.4665, "step": 2116 }, { "epoch": 0.4393939393939394, "grad_norm": 1.178001804103002, "learning_rate": 2.5406522398487383e-07, "loss": 1.5106, "step": 2117 }, { "epoch": 0.4396014943960149, "grad_norm": 0.7205061218601415, "learning_rate": 2.539499681840405e-07, "loss": 1.4869, "step": 2118 }, { "epoch": 0.4398090493980905, "grad_norm": 0.7064580384542527, "learning_rate": 2.53834697948582e-07, "loss": 1.5286, "step": 2119 }, { "epoch": 0.44001660440016604, "grad_norm": 1.057498935743862, "learning_rate": 2.5371941332750827e-07, "loss": 1.4963, "step": 2120 }, { "epoch": 0.4402241594022416, "grad_norm": 0.7914071785461088, "learning_rate": 2.536041143698351e-07, "loss": 1.4887, "step": 2121 }, { "epoch": 0.44043171440431717, "grad_norm": 0.8098510614416544, "learning_rate": 2.5348880112458475e-07, "loss": 1.4643, "step": 2122 }, { "epoch": 0.4406392694063927, "grad_norm": 0.8920768556497111, "learning_rate": 2.5337347364078513e-07, "loss": 1.5333, "step": 2123 }, { "epoch": 0.44084682440846823, "grad_norm": 0.8260414924562738, "learning_rate": 2.5325813196747063e-07, "loss": 1.5388, "step": 2124 }, { "epoch": 0.4410543794105438, "grad_norm": 1.2216266672317042, "learning_rate": 2.531427761536813e-07, "loss": 1.5691, "step": 2125 }, { "epoch": 0.44126193441261935, "grad_norm": 1.019132578632894, "learning_rate": 2.5302740624846345e-07, "loss": 1.4846, "step": 2126 }, { "epoch": 0.4414694894146949, "grad_norm": 0.6730609713596192, "learning_rate": 2.529120223008693e-07, "loss": 1.6148, "step": 2127 }, { "epoch": 0.44167704441677047, "grad_norm": 0.7364599063826658, "learning_rate": 2.52796624359957e-07, "loss": 1.4845, "step": 2128 }, { "epoch": 0.441884599418846, "grad_norm": 1.4282277236522039, "learning_rate": 2.526812124747907e-07, "loss": 1.5591, "step": 2129 }, { "epoch": 0.44209215442092153, "grad_norm": 0.8735489865557484, "learning_rate": 2.525657866944406e-07, "loss": 1.5901, "step": 2130 }, { "epoch": 0.44229970942299707, "grad_norm": 1.314174621489636, "learning_rate": 2.5245034706798255e-07, "loss": 1.4845, "step": 2131 }, { "epoch": 0.44250726442507266, "grad_norm": 0.7844126704488753, "learning_rate": 2.523348936444984e-07, "loss": 1.4985, "step": 2132 }, { "epoch": 0.4427148194271482, "grad_norm": 1.29598726479746, "learning_rate": 2.5221942647307595e-07, "loss": 1.4843, "step": 2133 }, { "epoch": 0.4429223744292237, "grad_norm": 0.881226393157972, "learning_rate": 2.521039456028087e-07, "loss": 1.5349, "step": 2134 }, { "epoch": 0.4431299294312993, "grad_norm": 1.2013944878876999, "learning_rate": 2.5198845108279606e-07, "loss": 1.4395, "step": 2135 }, { "epoch": 0.44333748443337484, "grad_norm": 1.792177002966969, "learning_rate": 2.518729429621433e-07, "loss": 1.4914, "step": 2136 }, { "epoch": 0.4435450394354504, "grad_norm": 0.7623098932584398, "learning_rate": 2.517574212899614e-07, "loss": 1.5324, "step": 2137 }, { "epoch": 0.44375259443752596, "grad_norm": 1.3308952326930836, "learning_rate": 2.51641886115367e-07, "loss": 1.4793, "step": 2138 }, { "epoch": 0.4439601494396015, "grad_norm": 1.0284826176503037, "learning_rate": 2.5152633748748274e-07, "loss": 1.5472, "step": 2139 }, { "epoch": 0.444167704441677, "grad_norm": 0.6638555943882263, "learning_rate": 2.5141077545543676e-07, "loss": 1.4954, "step": 2140 }, { "epoch": 0.4443752594437526, "grad_norm": 0.7581208311184571, "learning_rate": 2.512952000683629e-07, "loss": 1.4955, "step": 2141 }, { "epoch": 0.44458281444582815, "grad_norm": 0.772749048068988, "learning_rate": 2.511796113754009e-07, "loss": 1.5363, "step": 2142 }, { "epoch": 0.4447903694479037, "grad_norm": 0.8878423028150323, "learning_rate": 2.5106400942569585e-07, "loss": 1.5375, "step": 2143 }, { "epoch": 0.44499792444997927, "grad_norm": 0.8595138628111212, "learning_rate": 2.509483942683987e-07, "loss": 1.5544, "step": 2144 }, { "epoch": 0.4452054794520548, "grad_norm": 1.1120578591520924, "learning_rate": 2.5083276595266595e-07, "loss": 1.588, "step": 2145 }, { "epoch": 0.44541303445413033, "grad_norm": 0.9408018476804988, "learning_rate": 2.507171245276597e-07, "loss": 1.4777, "step": 2146 }, { "epoch": 0.4456205894562059, "grad_norm": 1.0914886847384382, "learning_rate": 2.5060147004254754e-07, "loss": 1.536, "step": 2147 }, { "epoch": 0.44582814445828145, "grad_norm": 1.0940914759700866, "learning_rate": 2.504858025465028e-07, "loss": 1.5379, "step": 2148 }, { "epoch": 0.446035699460357, "grad_norm": 1.0911655521561265, "learning_rate": 2.503701220887042e-07, "loss": 1.5028, "step": 2149 }, { "epoch": 0.44624325446243257, "grad_norm": 0.6978686356143317, "learning_rate": 2.5025442871833585e-07, "loss": 1.4942, "step": 2150 }, { "epoch": 0.4464508094645081, "grad_norm": 0.9637473364927642, "learning_rate": 2.5013872248458777e-07, "loss": 1.5235, "step": 2151 }, { "epoch": 0.44665836446658364, "grad_norm": 0.73052714759084, "learning_rate": 2.5002300343665485e-07, "loss": 1.6125, "step": 2152 }, { "epoch": 0.44686591946865917, "grad_norm": 0.9468713510678555, "learning_rate": 2.4990727162373806e-07, "loss": 1.5183, "step": 2153 }, { "epoch": 0.44707347447073476, "grad_norm": 0.6504356193109583, "learning_rate": 2.4979152709504334e-07, "loss": 1.4517, "step": 2154 }, { "epoch": 0.4472810294728103, "grad_norm": 0.7420101022035055, "learning_rate": 2.496757698997822e-07, "loss": 1.4707, "step": 2155 }, { "epoch": 0.4474885844748858, "grad_norm": 0.6549410960572217, "learning_rate": 2.495600000871716e-07, "loss": 1.5719, "step": 2156 }, { "epoch": 0.4476961394769614, "grad_norm": 2.359159900418399, "learning_rate": 2.494442177064336e-07, "loss": 1.4731, "step": 2157 }, { "epoch": 0.44790369447903694, "grad_norm": 0.7331777173551153, "learning_rate": 2.493284228067961e-07, "loss": 1.4802, "step": 2158 }, { "epoch": 0.4481112494811125, "grad_norm": 1.2440970935529934, "learning_rate": 2.492126154374917e-07, "loss": 1.5337, "step": 2159 }, { "epoch": 0.44831880448318806, "grad_norm": 0.7913351001443824, "learning_rate": 2.490967956477589e-07, "loss": 1.4839, "step": 2160 }, { "epoch": 0.4485263594852636, "grad_norm": 0.7738195761239326, "learning_rate": 2.48980963486841e-07, "loss": 1.5918, "step": 2161 }, { "epoch": 0.4487339144873391, "grad_norm": 0.7822292770972306, "learning_rate": 2.4886511900398683e-07, "loss": 1.432, "step": 2162 }, { "epoch": 0.4489414694894147, "grad_norm": 0.9965774970451594, "learning_rate": 2.487492622484504e-07, "loss": 1.5116, "step": 2163 }, { "epoch": 0.44914902449149025, "grad_norm": 0.8766209811602343, "learning_rate": 2.4863339326949094e-07, "loss": 1.441, "step": 2164 }, { "epoch": 0.4493565794935658, "grad_norm": 0.652786520871881, "learning_rate": 2.485175121163727e-07, "loss": 1.5011, "step": 2165 }, { "epoch": 0.44956413449564137, "grad_norm": 0.7154615701241811, "learning_rate": 2.4840161883836554e-07, "loss": 1.583, "step": 2166 }, { "epoch": 0.4497716894977169, "grad_norm": 0.7150291241211217, "learning_rate": 2.48285713484744e-07, "loss": 1.5129, "step": 2167 }, { "epoch": 0.44997924449979243, "grad_norm": 0.7413374877217682, "learning_rate": 2.481697961047881e-07, "loss": 1.4973, "step": 2168 }, { "epoch": 0.450186799501868, "grad_norm": 0.936433186319151, "learning_rate": 2.480538667477827e-07, "loss": 1.5095, "step": 2169 }, { "epoch": 0.45039435450394355, "grad_norm": 0.9069472615583587, "learning_rate": 2.4793792546301804e-07, "loss": 1.5027, "step": 2170 }, { "epoch": 0.4506019095060191, "grad_norm": 1.1355643095112342, "learning_rate": 2.478219722997891e-07, "loss": 1.5675, "step": 2171 }, { "epoch": 0.45080946450809467, "grad_norm": 0.7132307173851706, "learning_rate": 2.4770600730739615e-07, "loss": 1.5252, "step": 2172 }, { "epoch": 0.4510170195101702, "grad_norm": 0.8272218787985647, "learning_rate": 2.475900305351445e-07, "loss": 1.5477, "step": 2173 }, { "epoch": 0.45122457451224574, "grad_norm": 0.7242767194465567, "learning_rate": 2.474740420323443e-07, "loss": 1.5261, "step": 2174 }, { "epoch": 0.45143212951432127, "grad_norm": 0.8973888682735535, "learning_rate": 2.4735804184831086e-07, "loss": 1.5265, "step": 2175 }, { "epoch": 0.45163968451639686, "grad_norm": 0.6911003042600978, "learning_rate": 2.472420300323643e-07, "loss": 1.4984, "step": 2176 }, { "epoch": 0.4518472395184724, "grad_norm": 0.671625565554024, "learning_rate": 2.471260066338298e-07, "loss": 1.5356, "step": 2177 }, { "epoch": 0.4520547945205479, "grad_norm": 0.7630655099457171, "learning_rate": 2.4700997170203746e-07, "loss": 1.496, "step": 2178 }, { "epoch": 0.4522623495226235, "grad_norm": 0.7700659474829851, "learning_rate": 2.468939252863223e-07, "loss": 1.6678, "step": 2179 }, { "epoch": 0.45246990452469904, "grad_norm": 1.4565397812827277, "learning_rate": 2.4677786743602396e-07, "loss": 1.4823, "step": 2180 }, { "epoch": 0.4526774595267746, "grad_norm": 1.0367986862582488, "learning_rate": 2.466617982004874e-07, "loss": 1.489, "step": 2181 }, { "epoch": 0.45288501452885016, "grad_norm": 0.651934882544263, "learning_rate": 2.46545717629062e-07, "loss": 1.4984, "step": 2182 }, { "epoch": 0.4530925695309257, "grad_norm": 1.1317650152368461, "learning_rate": 2.4642962577110225e-07, "loss": 1.578, "step": 2183 }, { "epoch": 0.4533001245330012, "grad_norm": 0.8274764600224538, "learning_rate": 2.4631352267596734e-07, "loss": 1.4889, "step": 2184 }, { "epoch": 0.4535076795350768, "grad_norm": 0.7508653095657366, "learning_rate": 2.4619740839302105e-07, "loss": 1.4549, "step": 2185 }, { "epoch": 0.45371523453715235, "grad_norm": 0.7464091447727699, "learning_rate": 2.460812829716323e-07, "loss": 1.5054, "step": 2186 }, { "epoch": 0.4539227895392279, "grad_norm": 3.2129031051020234, "learning_rate": 2.459651464611745e-07, "loss": 1.5716, "step": 2187 }, { "epoch": 0.45413034454130347, "grad_norm": 2.439099305498632, "learning_rate": 2.4584899891102566e-07, "loss": 1.5479, "step": 2188 }, { "epoch": 0.454337899543379, "grad_norm": 0.8031207600909883, "learning_rate": 2.4573284037056876e-07, "loss": 1.4635, "step": 2189 }, { "epoch": 0.45454545454545453, "grad_norm": 1.0006031458842293, "learning_rate": 2.4561667088919135e-07, "loss": 1.6125, "step": 2190 }, { "epoch": 0.4547530095475301, "grad_norm": 0.810242147883144, "learning_rate": 2.455004905162855e-07, "loss": 1.4435, "step": 2191 }, { "epoch": 0.45496056454960565, "grad_norm": 2.0254887730141404, "learning_rate": 2.4538429930124814e-07, "loss": 1.5417, "step": 2192 }, { "epoch": 0.4551681195516812, "grad_norm": 0.7985861811236729, "learning_rate": 2.4526809729348056e-07, "loss": 1.4958, "step": 2193 }, { "epoch": 0.45537567455375677, "grad_norm": 0.9310305620406012, "learning_rate": 2.4515188454238887e-07, "loss": 1.5145, "step": 2194 }, { "epoch": 0.4555832295558323, "grad_norm": 0.8109195266052031, "learning_rate": 2.450356610973836e-07, "loss": 1.5166, "step": 2195 }, { "epoch": 0.45579078455790784, "grad_norm": 0.9150030933398595, "learning_rate": 2.4491942700787993e-07, "loss": 1.5094, "step": 2196 }, { "epoch": 0.4559983395599834, "grad_norm": 0.951889116884637, "learning_rate": 2.4480318232329746e-07, "loss": 1.5348, "step": 2197 }, { "epoch": 0.45620589456205896, "grad_norm": 1.1850439055436415, "learning_rate": 2.4468692709306036e-07, "loss": 1.4985, "step": 2198 }, { "epoch": 0.4564134495641345, "grad_norm": 0.8749768594294257, "learning_rate": 2.445706613665973e-07, "loss": 1.4924, "step": 2199 }, { "epoch": 0.45662100456621, "grad_norm": 0.9163160438438379, "learning_rate": 2.4445438519334127e-07, "loss": 1.4742, "step": 2200 }, { "epoch": 0.4568285595682856, "grad_norm": 0.643252125770128, "learning_rate": 2.443380986227299e-07, "loss": 1.4366, "step": 2201 }, { "epoch": 0.45703611457036114, "grad_norm": 0.7997659922866655, "learning_rate": 2.442218017042051e-07, "loss": 1.4971, "step": 2202 }, { "epoch": 0.4572436695724367, "grad_norm": 1.0279960693569334, "learning_rate": 2.4410549448721334e-07, "loss": 1.5281, "step": 2203 }, { "epoch": 0.45745122457451226, "grad_norm": 0.8381311521165757, "learning_rate": 2.439891770212053e-07, "loss": 1.574, "step": 2204 }, { "epoch": 0.4576587795765878, "grad_norm": 0.9408991150849904, "learning_rate": 2.438728493556359e-07, "loss": 1.471, "step": 2205 }, { "epoch": 0.4578663345786633, "grad_norm": 0.9170328175042292, "learning_rate": 2.437565115399649e-07, "loss": 1.4747, "step": 2206 }, { "epoch": 0.4580738895807389, "grad_norm": 0.6427472013442638, "learning_rate": 2.436401636236559e-07, "loss": 1.4991, "step": 2207 }, { "epoch": 0.45828144458281445, "grad_norm": 0.7063233564871986, "learning_rate": 2.435238056561769e-07, "loss": 1.5374, "step": 2208 }, { "epoch": 0.45848899958489, "grad_norm": 0.8076142970763248, "learning_rate": 2.4340743768700026e-07, "loss": 1.4997, "step": 2209 }, { "epoch": 0.45869655458696557, "grad_norm": 1.3361724639534662, "learning_rate": 2.432910597656025e-07, "loss": 1.5836, "step": 2210 }, { "epoch": 0.4589041095890411, "grad_norm": 0.7501759904041626, "learning_rate": 2.4317467194146455e-07, "loss": 1.5164, "step": 2211 }, { "epoch": 0.45911166459111663, "grad_norm": 1.0543792063166513, "learning_rate": 2.430582742640713e-07, "loss": 1.545, "step": 2212 }, { "epoch": 0.4593192195931922, "grad_norm": 0.8380079336488809, "learning_rate": 2.4294186678291194e-07, "loss": 1.5199, "step": 2213 }, { "epoch": 0.45952677459526775, "grad_norm": 0.9021296785095388, "learning_rate": 2.4282544954748003e-07, "loss": 1.5034, "step": 2214 }, { "epoch": 0.4597343295973433, "grad_norm": 0.7521408333219058, "learning_rate": 2.4270902260727284e-07, "loss": 1.5231, "step": 2215 }, { "epoch": 0.45994188459941887, "grad_norm": 0.844388611353209, "learning_rate": 2.4259258601179214e-07, "loss": 1.5478, "step": 2216 }, { "epoch": 0.4601494396014944, "grad_norm": 0.9820550374511889, "learning_rate": 2.4247613981054373e-07, "loss": 1.5211, "step": 2217 }, { "epoch": 0.46035699460356994, "grad_norm": 0.7235320752272337, "learning_rate": 2.423596840530373e-07, "loss": 1.5455, "step": 2218 }, { "epoch": 0.4605645496056455, "grad_norm": 0.8324983002567111, "learning_rate": 2.4224321878878694e-07, "loss": 1.5362, "step": 2219 }, { "epoch": 0.46077210460772106, "grad_norm": 1.0399830447329832, "learning_rate": 2.421267440673103e-07, "loss": 1.5326, "step": 2220 }, { "epoch": 0.4609796596097966, "grad_norm": 1.0575362914644075, "learning_rate": 2.4201025993812967e-07, "loss": 1.5321, "step": 2221 }, { "epoch": 0.4611872146118721, "grad_norm": 0.6324880735125413, "learning_rate": 2.4189376645077077e-07, "loss": 1.4759, "step": 2222 }, { "epoch": 0.4613947696139477, "grad_norm": 9.613455962360714, "learning_rate": 2.417772636547637e-07, "loss": 1.6108, "step": 2223 }, { "epoch": 0.46160232461602324, "grad_norm": 0.796027678126697, "learning_rate": 2.4166075159964224e-07, "loss": 1.5208, "step": 2224 }, { "epoch": 0.4618098796180988, "grad_norm": 1.340168327814824, "learning_rate": 2.4154423033494427e-07, "loss": 1.5384, "step": 2225 }, { "epoch": 0.46201743462017436, "grad_norm": 0.743504323645428, "learning_rate": 2.4142769991021147e-07, "loss": 1.5876, "step": 2226 }, { "epoch": 0.4622249896222499, "grad_norm": 0.7251114209535581, "learning_rate": 2.413111603749896e-07, "loss": 1.5301, "step": 2227 }, { "epoch": 0.4624325446243254, "grad_norm": 0.8329547937009193, "learning_rate": 2.4119461177882816e-07, "loss": 1.5098, "step": 2228 }, { "epoch": 0.462640099626401, "grad_norm": 0.7595128985859629, "learning_rate": 2.4107805417128035e-07, "loss": 1.535, "step": 2229 }, { "epoch": 0.46284765462847655, "grad_norm": 1.0289447997454348, "learning_rate": 2.409614876019036e-07, "loss": 1.5338, "step": 2230 }, { "epoch": 0.4630552096305521, "grad_norm": 0.876545684650963, "learning_rate": 2.4084491212025873e-07, "loss": 1.5588, "step": 2231 }, { "epoch": 0.46326276463262767, "grad_norm": 0.8742852720639952, "learning_rate": 2.4072832777591075e-07, "loss": 1.4348, "step": 2232 }, { "epoch": 0.4634703196347032, "grad_norm": 0.7026562412805656, "learning_rate": 2.40611734618428e-07, "loss": 1.457, "step": 2233 }, { "epoch": 0.46367787463677873, "grad_norm": 1.1270635462841694, "learning_rate": 2.404951326973829e-07, "loss": 1.551, "step": 2234 }, { "epoch": 0.4638854296388543, "grad_norm": 0.8683412207048049, "learning_rate": 2.403785220623515e-07, "loss": 1.4483, "step": 2235 }, { "epoch": 0.46409298464092985, "grad_norm": 0.7555098610212173, "learning_rate": 2.402619027629136e-07, "loss": 1.5824, "step": 2236 }, { "epoch": 0.4643005396430054, "grad_norm": 0.7981558658921464, "learning_rate": 2.401452748486525e-07, "loss": 1.4537, "step": 2237 }, { "epoch": 0.46450809464508097, "grad_norm": 1.3371005366076087, "learning_rate": 2.400286383691554e-07, "loss": 1.5096, "step": 2238 }, { "epoch": 0.4647156496471565, "grad_norm": 0.8455773124255425, "learning_rate": 2.39911993374013e-07, "loss": 1.5191, "step": 2239 }, { "epoch": 0.46492320464923204, "grad_norm": 0.7691116102035135, "learning_rate": 2.3979533991281966e-07, "loss": 1.5462, "step": 2240 }, { "epoch": 0.4651307596513076, "grad_norm": 0.6431765198247197, "learning_rate": 2.396786780351733e-07, "loss": 1.4937, "step": 2241 }, { "epoch": 0.46533831465338316, "grad_norm": 0.7657358944398197, "learning_rate": 2.395620077906755e-07, "loss": 1.4717, "step": 2242 }, { "epoch": 0.4655458696554587, "grad_norm": 0.9294180369349442, "learning_rate": 2.394453292289313e-07, "loss": 1.4809, "step": 2243 }, { "epoch": 0.4657534246575342, "grad_norm": 11.50667086537391, "learning_rate": 2.3932864239954937e-07, "loss": 1.6035, "step": 2244 }, { "epoch": 0.4659609796596098, "grad_norm": 1.1673548490255723, "learning_rate": 2.3921194735214183e-07, "loss": 1.5091, "step": 2245 }, { "epoch": 0.46616853466168534, "grad_norm": 0.9266484129151237, "learning_rate": 2.390952441363243e-07, "loss": 1.5089, "step": 2246 }, { "epoch": 0.4663760896637609, "grad_norm": 2.0752055352963095, "learning_rate": 2.38978532801716e-07, "loss": 1.5774, "step": 2247 }, { "epoch": 0.46658364466583646, "grad_norm": 0.7924802957558427, "learning_rate": 2.388618133979393e-07, "loss": 1.5107, "step": 2248 }, { "epoch": 0.466791199667912, "grad_norm": 0.6988179244314432, "learning_rate": 2.3874508597462036e-07, "loss": 1.4901, "step": 2249 }, { "epoch": 0.4669987546699875, "grad_norm": 0.7851342111636496, "learning_rate": 2.386283505813885e-07, "loss": 1.5156, "step": 2250 }, { "epoch": 0.4672063096720631, "grad_norm": 0.7180013615537107, "learning_rate": 2.385116072678765e-07, "loss": 1.5447, "step": 2251 }, { "epoch": 0.46741386467413865, "grad_norm": 0.6499516346801097, "learning_rate": 2.383948560837206e-07, "loss": 1.5142, "step": 2252 }, { "epoch": 0.4676214196762142, "grad_norm": 0.6610478488921054, "learning_rate": 2.3827809707856023e-07, "loss": 1.482, "step": 2253 }, { "epoch": 0.46782897467828977, "grad_norm": 1.2337671553158112, "learning_rate": 2.3816133030203818e-07, "loss": 1.4226, "step": 2254 }, { "epoch": 0.4680365296803653, "grad_norm": 0.6947826865818518, "learning_rate": 2.3804455580380072e-07, "loss": 1.5019, "step": 2255 }, { "epoch": 0.46824408468244083, "grad_norm": 0.6997069130088122, "learning_rate": 2.3792777363349722e-07, "loss": 1.5007, "step": 2256 }, { "epoch": 0.4684516396845164, "grad_norm": 0.7088211810414526, "learning_rate": 2.378109838407802e-07, "loss": 1.5571, "step": 2257 }, { "epoch": 0.46865919468659195, "grad_norm": 0.9213998911953192, "learning_rate": 2.3769418647530576e-07, "loss": 1.5353, "step": 2258 }, { "epoch": 0.4688667496886675, "grad_norm": 4.383534052373395, "learning_rate": 2.3757738158673302e-07, "loss": 1.5843, "step": 2259 }, { "epoch": 0.46907430469074307, "grad_norm": 0.9149189232816582, "learning_rate": 2.3746056922472423e-07, "loss": 1.4917, "step": 2260 }, { "epoch": 0.4692818596928186, "grad_norm": 0.8751886951499699, "learning_rate": 2.3734374943894502e-07, "loss": 1.4453, "step": 2261 }, { "epoch": 0.46948941469489414, "grad_norm": 0.8068067117599571, "learning_rate": 2.3722692227906394e-07, "loss": 1.4656, "step": 2262 }, { "epoch": 0.4696969696969697, "grad_norm": 0.7183821599326902, "learning_rate": 2.3711008779475286e-07, "loss": 1.5502, "step": 2263 }, { "epoch": 0.46990452469904526, "grad_norm": 1.4283606380429645, "learning_rate": 2.3699324603568674e-07, "loss": 1.5851, "step": 2264 }, { "epoch": 0.4701120797011208, "grad_norm": 0.9085930663017272, "learning_rate": 2.3687639705154354e-07, "loss": 1.4868, "step": 2265 }, { "epoch": 0.4703196347031963, "grad_norm": 0.9242844557414177, "learning_rate": 2.3675954089200437e-07, "loss": 1.4919, "step": 2266 }, { "epoch": 0.4705271897052719, "grad_norm": 0.721031895461535, "learning_rate": 2.366426776067533e-07, "loss": 1.488, "step": 2267 }, { "epoch": 0.47073474470734744, "grad_norm": 0.9094884078174456, "learning_rate": 2.3652580724547758e-07, "loss": 1.5261, "step": 2268 }, { "epoch": 0.470942299709423, "grad_norm": 0.8241169253099095, "learning_rate": 2.3640892985786736e-07, "loss": 1.4752, "step": 2269 }, { "epoch": 0.47114985471149856, "grad_norm": 0.8928604733558316, "learning_rate": 2.362920454936158e-07, "loss": 1.455, "step": 2270 }, { "epoch": 0.4713574097135741, "grad_norm": 0.7805080742315295, "learning_rate": 2.3617515420241897e-07, "loss": 1.4613, "step": 2271 }, { "epoch": 0.4715649647156496, "grad_norm": 0.6460143432090598, "learning_rate": 2.3605825603397606e-07, "loss": 1.4943, "step": 2272 }, { "epoch": 0.4717725197177252, "grad_norm": 1.1613912271518472, "learning_rate": 2.3594135103798894e-07, "loss": 1.5032, "step": 2273 }, { "epoch": 0.47198007471980075, "grad_norm": 1.3145593790529866, "learning_rate": 2.3582443926416267e-07, "loss": 1.5537, "step": 2274 }, { "epoch": 0.4721876297218763, "grad_norm": 0.9746060269879496, "learning_rate": 2.357075207622049e-07, "loss": 1.5431, "step": 2275 }, { "epoch": 0.47239518472395187, "grad_norm": 0.7274668284374193, "learning_rate": 2.3559059558182626e-07, "loss": 1.5456, "step": 2276 }, { "epoch": 0.4726027397260274, "grad_norm": 0.7978097033622888, "learning_rate": 2.3547366377274035e-07, "loss": 1.509, "step": 2277 }, { "epoch": 0.47281029472810293, "grad_norm": 0.6792619886445909, "learning_rate": 2.3535672538466343e-07, "loss": 1.5635, "step": 2278 }, { "epoch": 0.4730178497301785, "grad_norm": 0.7510328569408615, "learning_rate": 2.352397804673145e-07, "loss": 1.5883, "step": 2279 }, { "epoch": 0.47322540473225405, "grad_norm": 0.8196013965152724, "learning_rate": 2.3512282907041557e-07, "loss": 1.4882, "step": 2280 }, { "epoch": 0.4734329597343296, "grad_norm": 0.9124355227306552, "learning_rate": 2.3500587124369124e-07, "loss": 1.4656, "step": 2281 }, { "epoch": 0.47364051473640517, "grad_norm": 0.7035988412176885, "learning_rate": 2.348889070368687e-07, "loss": 1.5247, "step": 2282 }, { "epoch": 0.4738480697384807, "grad_norm": 0.8640509539676204, "learning_rate": 2.347719364996783e-07, "loss": 1.5071, "step": 2283 }, { "epoch": 0.47405562474055624, "grad_norm": 0.6564767860950007, "learning_rate": 2.3465495968185257e-07, "loss": 1.5044, "step": 2284 }, { "epoch": 0.4742631797426318, "grad_norm": 1.1517678340453643, "learning_rate": 2.345379766331271e-07, "loss": 1.507, "step": 2285 }, { "epoch": 0.47447073474470736, "grad_norm": 0.6889288333671003, "learning_rate": 2.344209874032399e-07, "loss": 1.5451, "step": 2286 }, { "epoch": 0.4746782897467829, "grad_norm": 0.6398958340841971, "learning_rate": 2.3430399204193176e-07, "loss": 1.5083, "step": 2287 }, { "epoch": 0.4748858447488584, "grad_norm": 0.9696278280102735, "learning_rate": 2.3418699059894596e-07, "loss": 1.5542, "step": 2288 }, { "epoch": 0.475093399750934, "grad_norm": 0.7131244976565894, "learning_rate": 2.3406998312402836e-07, "loss": 1.5225, "step": 2289 }, { "epoch": 0.47530095475300954, "grad_norm": 0.8488901136050451, "learning_rate": 2.3395296966692753e-07, "loss": 1.5077, "step": 2290 }, { "epoch": 0.4755085097550851, "grad_norm": 0.7116597973479669, "learning_rate": 2.3383595027739438e-07, "loss": 1.5291, "step": 2291 }, { "epoch": 0.47571606475716066, "grad_norm": 0.7912824666543066, "learning_rate": 2.337189250051825e-07, "loss": 1.4718, "step": 2292 }, { "epoch": 0.4759236197592362, "grad_norm": 0.7762120187147161, "learning_rate": 2.3360189390004788e-07, "loss": 1.5113, "step": 2293 }, { "epoch": 0.4761311747613117, "grad_norm": 1.029205861466796, "learning_rate": 2.3348485701174918e-07, "loss": 1.5491, "step": 2294 }, { "epoch": 0.4763387297633873, "grad_norm": 5.880820379340486, "learning_rate": 2.3336781439004717e-07, "loss": 1.4188, "step": 2295 }, { "epoch": 0.47654628476546285, "grad_norm": 0.8120818762675087, "learning_rate": 2.3325076608470544e-07, "loss": 1.4372, "step": 2296 }, { "epoch": 0.4767538397675384, "grad_norm": 0.7106779144237448, "learning_rate": 2.3313371214548976e-07, "loss": 1.5736, "step": 2297 }, { "epoch": 0.47696139476961397, "grad_norm": 1.2711830984132426, "learning_rate": 2.3301665262216837e-07, "loss": 1.5762, "step": 2298 }, { "epoch": 0.4771689497716895, "grad_norm": 0.7380329117026444, "learning_rate": 2.3289958756451176e-07, "loss": 1.5254, "step": 2299 }, { "epoch": 0.47737650477376503, "grad_norm": 0.6689751624502145, "learning_rate": 2.327825170222931e-07, "loss": 1.5081, "step": 2300 }, { "epoch": 0.4775840597758406, "grad_norm": 0.8735488480645404, "learning_rate": 2.3266544104528747e-07, "loss": 1.5229, "step": 2301 }, { "epoch": 0.47779161477791615, "grad_norm": 7.491064486529877, "learning_rate": 2.3254835968327263e-07, "loss": 1.5328, "step": 2302 }, { "epoch": 0.4779991697799917, "grad_norm": 0.9445451691796192, "learning_rate": 2.324312729860284e-07, "loss": 1.5257, "step": 2303 }, { "epoch": 0.47820672478206727, "grad_norm": 0.8216093833592433, "learning_rate": 2.3231418100333688e-07, "loss": 1.5427, "step": 2304 }, { "epoch": 0.4784142797841428, "grad_norm": 0.8910714939683786, "learning_rate": 2.3219708378498258e-07, "loss": 1.4807, "step": 2305 }, { "epoch": 0.47862183478621834, "grad_norm": 0.7606860366842499, "learning_rate": 2.32079981380752e-07, "loss": 1.502, "step": 2306 }, { "epoch": 0.4788293897882939, "grad_norm": 0.9012764651918588, "learning_rate": 2.3196287384043404e-07, "loss": 1.5503, "step": 2307 }, { "epoch": 0.47903694479036946, "grad_norm": 0.9579651908088471, "learning_rate": 2.3184576121381973e-07, "loss": 1.5675, "step": 2308 }, { "epoch": 0.479244499792445, "grad_norm": 0.7836072654754713, "learning_rate": 2.317286435507023e-07, "loss": 1.4791, "step": 2309 }, { "epoch": 0.4794520547945205, "grad_norm": 0.8674005482187821, "learning_rate": 2.3161152090087695e-07, "loss": 1.5291, "step": 2310 }, { "epoch": 0.4796596097965961, "grad_norm": 0.7839208017615653, "learning_rate": 2.3149439331414116e-07, "loss": 1.5115, "step": 2311 }, { "epoch": 0.47986716479867164, "grad_norm": 0.8487706287298489, "learning_rate": 2.3137726084029455e-07, "loss": 1.5373, "step": 2312 }, { "epoch": 0.4800747198007472, "grad_norm": 0.6871791952873016, "learning_rate": 2.3126012352913867e-07, "loss": 1.5507, "step": 2313 }, { "epoch": 0.48028227480282276, "grad_norm": 0.8373394184859119, "learning_rate": 2.3114298143047718e-07, "loss": 1.5851, "step": 2314 }, { "epoch": 0.4804898298048983, "grad_norm": 1.515144902140093, "learning_rate": 2.310258345941158e-07, "loss": 1.5506, "step": 2315 }, { "epoch": 0.4806973848069738, "grad_norm": 1.6261595708990992, "learning_rate": 2.3090868306986233e-07, "loss": 1.5354, "step": 2316 }, { "epoch": 0.4809049398090494, "grad_norm": 1.4336662702773295, "learning_rate": 2.3079152690752637e-07, "loss": 1.514, "step": 2317 }, { "epoch": 0.48111249481112495, "grad_norm": 0.6615473570534988, "learning_rate": 2.306743661569197e-07, "loss": 1.4657, "step": 2318 }, { "epoch": 0.4813200498132005, "grad_norm": 0.7773467156877616, "learning_rate": 2.3055720086785598e-07, "loss": 1.4369, "step": 2319 }, { "epoch": 0.48152760481527607, "grad_norm": 1.194637578620655, "learning_rate": 2.304400310901506e-07, "loss": 1.5464, "step": 2320 }, { "epoch": 0.4817351598173516, "grad_norm": 0.6918974180078192, "learning_rate": 2.3032285687362126e-07, "loss": 1.5278, "step": 2321 }, { "epoch": 0.48194271481942713, "grad_norm": 1.142409739512506, "learning_rate": 2.3020567826808724e-07, "loss": 1.5155, "step": 2322 }, { "epoch": 0.4821502698215027, "grad_norm": 0.9249476922835445, "learning_rate": 2.3008849532336971e-07, "loss": 1.4607, "step": 2323 }, { "epoch": 0.48235782482357825, "grad_norm": 0.7017221801410658, "learning_rate": 2.2997130808929183e-07, "loss": 1.4927, "step": 2324 }, { "epoch": 0.4825653798256538, "grad_norm": 2.9660027003228544, "learning_rate": 2.2985411661567843e-07, "loss": 1.5249, "step": 2325 }, { "epoch": 0.48277293482772937, "grad_norm": 0.6642418128238853, "learning_rate": 2.297369209523563e-07, "loss": 1.5503, "step": 2326 }, { "epoch": 0.4829804898298049, "grad_norm": 0.9172176372091863, "learning_rate": 2.296197211491539e-07, "loss": 1.5265, "step": 2327 }, { "epoch": 0.48318804483188044, "grad_norm": 0.938666710961786, "learning_rate": 2.2950251725590145e-07, "loss": 1.4752, "step": 2328 }, { "epoch": 0.483395599833956, "grad_norm": 0.6923747271119006, "learning_rate": 2.293853093224309e-07, "loss": 1.4951, "step": 2329 }, { "epoch": 0.48360315483603156, "grad_norm": 3.7339901418898993, "learning_rate": 2.2926809739857606e-07, "loss": 1.5496, "step": 2330 }, { "epoch": 0.4838107098381071, "grad_norm": 0.9441949551559098, "learning_rate": 2.291508815341722e-07, "loss": 1.5485, "step": 2331 }, { "epoch": 0.4840182648401826, "grad_norm": 0.7341260522535987, "learning_rate": 2.290336617790565e-07, "loss": 1.5495, "step": 2332 }, { "epoch": 0.4842258198422582, "grad_norm": 0.643687181295844, "learning_rate": 2.2891643818306757e-07, "loss": 1.541, "step": 2333 }, { "epoch": 0.48443337484433374, "grad_norm": 0.7378296106714559, "learning_rate": 2.287992107960459e-07, "loss": 1.5333, "step": 2334 }, { "epoch": 0.4846409298464093, "grad_norm": 0.6845583694851329, "learning_rate": 2.2868197966783343e-07, "loss": 1.4853, "step": 2335 }, { "epoch": 0.48484848484848486, "grad_norm": 0.7279902969198789, "learning_rate": 2.2856474484827367e-07, "loss": 1.5799, "step": 2336 }, { "epoch": 0.4850560398505604, "grad_norm": 0.7684932353486686, "learning_rate": 2.2844750638721168e-07, "loss": 1.5329, "step": 2337 }, { "epoch": 0.4852635948526359, "grad_norm": 0.6606907016769956, "learning_rate": 2.2833026433449438e-07, "loss": 1.5158, "step": 2338 }, { "epoch": 0.4854711498547115, "grad_norm": 0.6035452251027119, "learning_rate": 2.2821301873996978e-07, "loss": 1.5473, "step": 2339 }, { "epoch": 0.48567870485678705, "grad_norm": 1.649840124667926, "learning_rate": 2.280957696534877e-07, "loss": 1.499, "step": 2340 }, { "epoch": 0.4858862598588626, "grad_norm": 0.6911183197901467, "learning_rate": 2.2797851712489927e-07, "loss": 1.5669, "step": 2341 }, { "epoch": 0.48609381486093817, "grad_norm": 0.8624375163771072, "learning_rate": 2.2786126120405714e-07, "loss": 1.548, "step": 2342 }, { "epoch": 0.4863013698630137, "grad_norm": 0.6485684695885501, "learning_rate": 2.277440019408155e-07, "loss": 1.5588, "step": 2343 }, { "epoch": 0.48650892486508923, "grad_norm": 1.0583012242280798, "learning_rate": 2.276267393850298e-07, "loss": 1.5101, "step": 2344 }, { "epoch": 0.4867164798671648, "grad_norm": 0.7303335413368193, "learning_rate": 2.27509473586557e-07, "loss": 1.4889, "step": 2345 }, { "epoch": 0.48692403486924035, "grad_norm": 0.6507484640374903, "learning_rate": 2.273922045952554e-07, "loss": 1.5165, "step": 2346 }, { "epoch": 0.4871315898713159, "grad_norm": 0.7355554998343299, "learning_rate": 2.2727493246098465e-07, "loss": 1.5499, "step": 2347 }, { "epoch": 0.48733914487339147, "grad_norm": 0.8334077385243107, "learning_rate": 2.2715765723360576e-07, "loss": 1.5599, "step": 2348 }, { "epoch": 0.487546699875467, "grad_norm": 0.6835985906661994, "learning_rate": 2.2704037896298106e-07, "loss": 1.5884, "step": 2349 }, { "epoch": 0.48775425487754254, "grad_norm": 0.7642188923655947, "learning_rate": 2.2692309769897408e-07, "loss": 1.4738, "step": 2350 }, { "epoch": 0.4879618098796181, "grad_norm": 0.7961778772622933, "learning_rate": 2.268058134914498e-07, "loss": 1.548, "step": 2351 }, { "epoch": 0.48816936488169366, "grad_norm": 0.7717140197464556, "learning_rate": 2.266885263902743e-07, "loss": 1.4434, "step": 2352 }, { "epoch": 0.4883769198837692, "grad_norm": 1.1344999282159138, "learning_rate": 2.2657123644531495e-07, "loss": 1.5362, "step": 2353 }, { "epoch": 0.4885844748858447, "grad_norm": 0.8086824019107909, "learning_rate": 2.2645394370644033e-07, "loss": 1.5124, "step": 2354 }, { "epoch": 0.4887920298879203, "grad_norm": 0.6734997115762309, "learning_rate": 2.2633664822352015e-07, "loss": 1.4235, "step": 2355 }, { "epoch": 0.48899958488999584, "grad_norm": 0.926579261439982, "learning_rate": 2.2621935004642542e-07, "loss": 1.542, "step": 2356 }, { "epoch": 0.4892071398920714, "grad_norm": 1.2807860653760692, "learning_rate": 2.2610204922502816e-07, "loss": 1.4462, "step": 2357 }, { "epoch": 0.48941469489414696, "grad_norm": 0.6648117251299703, "learning_rate": 2.2598474580920154e-07, "loss": 1.4751, "step": 2358 }, { "epoch": 0.4896222498962225, "grad_norm": 0.7426686724046041, "learning_rate": 2.2586743984881992e-07, "loss": 1.5033, "step": 2359 }, { "epoch": 0.489829804898298, "grad_norm": 0.9369948294317266, "learning_rate": 2.2575013139375864e-07, "loss": 1.5495, "step": 2360 }, { "epoch": 0.4900373599003736, "grad_norm": 0.7104454748457598, "learning_rate": 2.2563282049389411e-07, "loss": 1.5369, "step": 2361 }, { "epoch": 0.49024491490244915, "grad_norm": 0.666662957904917, "learning_rate": 2.255155071991039e-07, "loss": 1.5705, "step": 2362 }, { "epoch": 0.4904524699045247, "grad_norm": 0.76992241718709, "learning_rate": 2.253981915592664e-07, "loss": 1.5185, "step": 2363 }, { "epoch": 0.49066002490660027, "grad_norm": 1.3641406282741546, "learning_rate": 2.252808736242612e-07, "loss": 1.4954, "step": 2364 }, { "epoch": 0.4908675799086758, "grad_norm": 0.6534250671595311, "learning_rate": 2.2516355344396873e-07, "loss": 1.5424, "step": 2365 }, { "epoch": 0.49107513491075133, "grad_norm": 1.3755655313249537, "learning_rate": 2.2504623106827046e-07, "loss": 1.5184, "step": 2366 }, { "epoch": 0.4912826899128269, "grad_norm": 0.8016967943967964, "learning_rate": 2.2492890654704862e-07, "loss": 1.5138, "step": 2367 }, { "epoch": 0.49149024491490245, "grad_norm": 0.7283545108228767, "learning_rate": 2.2481157993018667e-07, "loss": 1.4976, "step": 2368 }, { "epoch": 0.491697799916978, "grad_norm": 0.7184022988714208, "learning_rate": 2.2469425126756865e-07, "loss": 1.5948, "step": 2369 }, { "epoch": 0.4919053549190536, "grad_norm": 0.7466552043569856, "learning_rate": 2.2457692060907952e-07, "loss": 1.4851, "step": 2370 }, { "epoch": 0.4921129099211291, "grad_norm": 0.7263271990122764, "learning_rate": 2.2445958800460537e-07, "loss": 1.4852, "step": 2371 }, { "epoch": 0.49232046492320464, "grad_norm": 1.0766862289288277, "learning_rate": 2.2434225350403278e-07, "loss": 1.5401, "step": 2372 }, { "epoch": 0.4925280199252802, "grad_norm": 0.9796289910443803, "learning_rate": 2.2422491715724923e-07, "loss": 1.4356, "step": 2373 }, { "epoch": 0.49273557492735576, "grad_norm": 0.810236605123269, "learning_rate": 2.241075790141431e-07, "loss": 1.5169, "step": 2374 }, { "epoch": 0.4929431299294313, "grad_norm": 1.0078173013194331, "learning_rate": 2.2399023912460345e-07, "loss": 1.509, "step": 2375 }, { "epoch": 0.4931506849315068, "grad_norm": 1.2735091932440763, "learning_rate": 2.2387289753852002e-07, "loss": 1.5409, "step": 2376 }, { "epoch": 0.4933582399335824, "grad_norm": 0.7829239059246687, "learning_rate": 2.2375555430578332e-07, "loss": 1.5498, "step": 2377 }, { "epoch": 0.49356579493565794, "grad_norm": 0.7209757964077856, "learning_rate": 2.2363820947628472e-07, "loss": 1.4812, "step": 2378 }, { "epoch": 0.4937733499377335, "grad_norm": 1.0953402268958026, "learning_rate": 2.2352086309991605e-07, "loss": 1.5725, "step": 2379 }, { "epoch": 0.49398090493980906, "grad_norm": 0.7140442658739737, "learning_rate": 2.2340351522656982e-07, "loss": 1.5611, "step": 2380 }, { "epoch": 0.4941884599418846, "grad_norm": 0.6439189073376079, "learning_rate": 2.2328616590613927e-07, "loss": 1.5731, "step": 2381 }, { "epoch": 0.4943960149439601, "grad_norm": 0.8369451397426825, "learning_rate": 2.2316881518851827e-07, "loss": 1.5482, "step": 2382 }, { "epoch": 0.4946035699460357, "grad_norm": 1.3822314719291078, "learning_rate": 2.2305146312360113e-07, "loss": 1.5507, "step": 2383 }, { "epoch": 0.49481112494811125, "grad_norm": 0.7388750690291305, "learning_rate": 2.229341097612829e-07, "loss": 1.5374, "step": 2384 }, { "epoch": 0.4950186799501868, "grad_norm": 0.7308634363153971, "learning_rate": 2.2281675515145911e-07, "loss": 1.4615, "step": 2385 }, { "epoch": 0.49522623495226237, "grad_norm": 0.6561389768958427, "learning_rate": 2.2269939934402579e-07, "loss": 1.55, "step": 2386 }, { "epoch": 0.4954337899543379, "grad_norm": 0.6491519946471916, "learning_rate": 2.2258204238887952e-07, "loss": 1.4425, "step": 2387 }, { "epoch": 0.49564134495641343, "grad_norm": 0.7899172122781211, "learning_rate": 2.2246468433591738e-07, "loss": 1.4685, "step": 2388 }, { "epoch": 0.495848899958489, "grad_norm": 0.9312882067013943, "learning_rate": 2.223473252350369e-07, "loss": 1.5024, "step": 2389 }, { "epoch": 0.49605645496056455, "grad_norm": 0.92402736165806, "learning_rate": 2.2222996513613607e-07, "loss": 1.6208, "step": 2390 }, { "epoch": 0.4962640099626401, "grad_norm": 0.8201952361478746, "learning_rate": 2.2211260408911326e-07, "loss": 1.5908, "step": 2391 }, { "epoch": 0.4964715649647157, "grad_norm": 0.7475155499084326, "learning_rate": 2.2199524214386725e-07, "loss": 1.5815, "step": 2392 }, { "epoch": 0.4966791199667912, "grad_norm": 1.888323899346627, "learning_rate": 2.2187787935029729e-07, "loss": 1.4394, "step": 2393 }, { "epoch": 0.49688667496886674, "grad_norm": 0.6832350446707858, "learning_rate": 2.2176051575830287e-07, "loss": 1.4805, "step": 2394 }, { "epoch": 0.4970942299709423, "grad_norm": 0.8165171691449675, "learning_rate": 2.2164315141778385e-07, "loss": 1.4602, "step": 2395 }, { "epoch": 0.49730178497301786, "grad_norm": 1.0322843220373226, "learning_rate": 2.2152578637864052e-07, "loss": 1.5236, "step": 2396 }, { "epoch": 0.4975093399750934, "grad_norm": 0.7792230662731928, "learning_rate": 2.2140842069077332e-07, "loss": 1.5274, "step": 2397 }, { "epoch": 0.4977168949771689, "grad_norm": 0.7198823116536779, "learning_rate": 2.2129105440408305e-07, "loss": 1.4906, "step": 2398 }, { "epoch": 0.4979244499792445, "grad_norm": 0.6890951297207466, "learning_rate": 2.211736875684707e-07, "loss": 1.5051, "step": 2399 }, { "epoch": 0.49813200498132004, "grad_norm": 0.6984522177679134, "learning_rate": 2.2105632023383754e-07, "loss": 1.5105, "step": 2400 }, { "epoch": 0.4983395599833956, "grad_norm": 0.6524769170303281, "learning_rate": 2.209389524500851e-07, "loss": 1.5476, "step": 2401 }, { "epoch": 0.49854711498547116, "grad_norm": 0.851430811558582, "learning_rate": 2.2082158426711498e-07, "loss": 1.5251, "step": 2402 }, { "epoch": 0.4987546699875467, "grad_norm": 1.8419335179908904, "learning_rate": 2.2070421573482898e-07, "loss": 1.4877, "step": 2403 }, { "epoch": 0.4989622249896222, "grad_norm": 0.8383365556589939, "learning_rate": 2.205868469031292e-07, "loss": 1.4606, "step": 2404 }, { "epoch": 0.4991697799916978, "grad_norm": 1.0272126896521991, "learning_rate": 2.204694778219177e-07, "loss": 1.4775, "step": 2405 }, { "epoch": 0.49937733499377335, "grad_norm": 1.0330333576597883, "learning_rate": 2.2035210854109672e-07, "loss": 1.5555, "step": 2406 }, { "epoch": 0.4995848899958489, "grad_norm": 0.7326413275343511, "learning_rate": 2.2023473911056852e-07, "loss": 1.536, "step": 2407 }, { "epoch": 0.49979244499792447, "grad_norm": 0.8270757016848693, "learning_rate": 2.2011736958023546e-07, "loss": 1.5758, "step": 2408 }, { "epoch": 0.5, "grad_norm": 0.9499466976856001, "learning_rate": 2.2e-07, "loss": 1.5102, "step": 2409 }, { "epoch": 0.5002075550020756, "grad_norm": 0.7388147924857219, "learning_rate": 2.198826304197645e-07, "loss": 1.5057, "step": 2410 }, { "epoch": 0.5004151100041511, "grad_norm": 1.8802440139792262, "learning_rate": 2.197652608894315e-07, "loss": 1.5136, "step": 2411 }, { "epoch": 0.5006226650062267, "grad_norm": 0.744049750632982, "learning_rate": 2.1964789145890327e-07, "loss": 1.5254, "step": 2412 }, { "epoch": 0.5008302200083022, "grad_norm": 0.9103896865032503, "learning_rate": 2.1953052217808227e-07, "loss": 1.6083, "step": 2413 }, { "epoch": 0.5010377750103777, "grad_norm": 15.482267004015949, "learning_rate": 2.194131530968708e-07, "loss": 1.5511, "step": 2414 }, { "epoch": 0.5012453300124533, "grad_norm": 0.6691221392789174, "learning_rate": 2.1929578426517104e-07, "loss": 1.4803, "step": 2415 }, { "epoch": 0.5014528850145289, "grad_norm": 0.8649620252717285, "learning_rate": 2.1917841573288504e-07, "loss": 1.5206, "step": 2416 }, { "epoch": 0.5016604400166044, "grad_norm": 0.7945827938797361, "learning_rate": 2.1906104754991496e-07, "loss": 1.5048, "step": 2417 }, { "epoch": 0.50186799501868, "grad_norm": 0.8105239033777475, "learning_rate": 2.1894367976616248e-07, "loss": 1.4898, "step": 2418 }, { "epoch": 0.5020755500207555, "grad_norm": 0.7066074602117844, "learning_rate": 2.1882631243152932e-07, "loss": 1.4823, "step": 2419 }, { "epoch": 0.502283105022831, "grad_norm": 1.6046386942377815, "learning_rate": 2.1870894559591702e-07, "loss": 1.5445, "step": 2420 }, { "epoch": 0.5024906600249066, "grad_norm": 0.9681527331244452, "learning_rate": 2.185915793092267e-07, "loss": 1.5275, "step": 2421 }, { "epoch": 0.5026982150269822, "grad_norm": 0.708434096699083, "learning_rate": 2.1847421362135945e-07, "loss": 1.5532, "step": 2422 }, { "epoch": 0.5029057700290577, "grad_norm": 0.713965479785203, "learning_rate": 2.1835684858221618e-07, "loss": 1.4534, "step": 2423 }, { "epoch": 0.5031133250311333, "grad_norm": 1.2962602570015131, "learning_rate": 2.1823948424169715e-07, "loss": 1.5067, "step": 2424 }, { "epoch": 0.5033208800332089, "grad_norm": 0.7339499594838154, "learning_rate": 2.181221206497027e-07, "loss": 1.667, "step": 2425 }, { "epoch": 0.5035284350352843, "grad_norm": 0.6634648430427076, "learning_rate": 2.1800475785613277e-07, "loss": 1.5251, "step": 2426 }, { "epoch": 0.5037359900373599, "grad_norm": 3.0871159735918665, "learning_rate": 2.1788739591088677e-07, "loss": 1.5387, "step": 2427 }, { "epoch": 0.5039435450394355, "grad_norm": 0.7638438643105517, "learning_rate": 2.177700348638639e-07, "loss": 1.5176, "step": 2428 }, { "epoch": 0.504151100041511, "grad_norm": 0.8473858309718701, "learning_rate": 2.1765267476496308e-07, "loss": 1.5254, "step": 2429 }, { "epoch": 0.5043586550435866, "grad_norm": 0.8177431453202664, "learning_rate": 2.175353156640826e-07, "loss": 1.5109, "step": 2430 }, { "epoch": 0.5045662100456622, "grad_norm": 1.0904511203969505, "learning_rate": 2.174179576111205e-07, "loss": 1.4932, "step": 2431 }, { "epoch": 0.5047737650477376, "grad_norm": 0.6510980267204981, "learning_rate": 2.1730060065597424e-07, "loss": 1.4741, "step": 2432 }, { "epoch": 0.5049813200498132, "grad_norm": 0.9533777191581878, "learning_rate": 2.1718324484854088e-07, "loss": 1.5348, "step": 2433 }, { "epoch": 0.5051888750518887, "grad_norm": 0.6783813190698431, "learning_rate": 2.1706589023871714e-07, "loss": 1.5372, "step": 2434 }, { "epoch": 0.5053964300539643, "grad_norm": 0.7938763627829857, "learning_rate": 2.169485368763989e-07, "loss": 1.5557, "step": 2435 }, { "epoch": 0.5056039850560399, "grad_norm": 0.7996241166136091, "learning_rate": 2.1683118481148175e-07, "loss": 1.5017, "step": 2436 }, { "epoch": 0.5058115400581153, "grad_norm": 0.884268792283763, "learning_rate": 2.1671383409386075e-07, "loss": 1.5164, "step": 2437 }, { "epoch": 0.5060190950601909, "grad_norm": 0.6716713247883358, "learning_rate": 2.165964847734302e-07, "loss": 1.4735, "step": 2438 }, { "epoch": 0.5062266500622665, "grad_norm": 1.1598691760507256, "learning_rate": 2.1647913690008398e-07, "loss": 1.4989, "step": 2439 }, { "epoch": 0.506434205064342, "grad_norm": 1.2055544529619706, "learning_rate": 2.1636179052371525e-07, "loss": 1.4991, "step": 2440 }, { "epoch": 0.5066417600664176, "grad_norm": 0.7595865867619096, "learning_rate": 2.1624444569421665e-07, "loss": 1.5393, "step": 2441 }, { "epoch": 0.5068493150684932, "grad_norm": 0.6702215983733609, "learning_rate": 2.1612710246148e-07, "loss": 1.5017, "step": 2442 }, { "epoch": 0.5070568700705687, "grad_norm": 0.9219748956820397, "learning_rate": 2.1600976087539663e-07, "loss": 1.5682, "step": 2443 }, { "epoch": 0.5072644250726442, "grad_norm": 0.6697674762341783, "learning_rate": 2.1589242098585688e-07, "loss": 1.5021, "step": 2444 }, { "epoch": 0.5074719800747198, "grad_norm": 0.6356975413297162, "learning_rate": 2.1577508284275074e-07, "loss": 1.4923, "step": 2445 }, { "epoch": 0.5076795350767953, "grad_norm": 1.473554865655388, "learning_rate": 2.156577464959673e-07, "loss": 1.4707, "step": 2446 }, { "epoch": 0.5078870900788709, "grad_norm": 0.8449252727151781, "learning_rate": 2.1554041199539465e-07, "loss": 1.5072, "step": 2447 }, { "epoch": 0.5080946450809465, "grad_norm": 0.9176107996429508, "learning_rate": 2.1542307939092043e-07, "loss": 1.4434, "step": 2448 }, { "epoch": 0.508302200083022, "grad_norm": 0.9618820285083549, "learning_rate": 2.1530574873243142e-07, "loss": 1.4842, "step": 2449 }, { "epoch": 0.5085097550850975, "grad_norm": 0.9252681176960654, "learning_rate": 2.1518842006981335e-07, "loss": 1.5339, "step": 2450 }, { "epoch": 0.5087173100871731, "grad_norm": 0.7061334241468691, "learning_rate": 2.1507109345295135e-07, "loss": 1.5524, "step": 2451 }, { "epoch": 0.5089248650892486, "grad_norm": 0.7019032867443913, "learning_rate": 2.149537689317296e-07, "loss": 1.4591, "step": 2452 }, { "epoch": 0.5091324200913242, "grad_norm": 0.8982633216221373, "learning_rate": 2.148364465560313e-07, "loss": 1.5385, "step": 2453 }, { "epoch": 0.5093399750933998, "grad_norm": 0.766863641714324, "learning_rate": 2.1471912637573877e-07, "loss": 1.5031, "step": 2454 }, { "epoch": 0.5095475300954753, "grad_norm": 0.7062281624244494, "learning_rate": 2.1460180844073358e-07, "loss": 1.4823, "step": 2455 }, { "epoch": 0.5097550850975509, "grad_norm": 0.8723446422793907, "learning_rate": 2.144844928008961e-07, "loss": 1.5528, "step": 2456 }, { "epoch": 0.5099626400996264, "grad_norm": 0.7422745245421455, "learning_rate": 2.143671795061059e-07, "loss": 1.5478, "step": 2457 }, { "epoch": 0.5101701951017019, "grad_norm": 0.7799161085115599, "learning_rate": 2.142498686062414e-07, "loss": 1.5311, "step": 2458 }, { "epoch": 0.5103777501037775, "grad_norm": 0.6945089826557154, "learning_rate": 2.1413256015118008e-07, "loss": 1.5037, "step": 2459 }, { "epoch": 0.5105853051058531, "grad_norm": 0.7138520715435961, "learning_rate": 2.1401525419079846e-07, "loss": 1.491, "step": 2460 }, { "epoch": 0.5107928601079286, "grad_norm": 0.6366437637946678, "learning_rate": 2.1389795077497192e-07, "loss": 1.4952, "step": 2461 }, { "epoch": 0.5110004151100042, "grad_norm": 0.7726024325988109, "learning_rate": 2.1378064995357458e-07, "loss": 1.4565, "step": 2462 }, { "epoch": 0.5112079701120797, "grad_norm": 0.8521489148994181, "learning_rate": 2.1366335177647982e-07, "loss": 1.5389, "step": 2463 }, { "epoch": 0.5114155251141552, "grad_norm": 1.21942086179602, "learning_rate": 2.1354605629355972e-07, "loss": 1.5497, "step": 2464 }, { "epoch": 0.5116230801162308, "grad_norm": 0.8609674445921635, "learning_rate": 2.1342876355468507e-07, "loss": 1.5593, "step": 2465 }, { "epoch": 0.5118306351183064, "grad_norm": 3.2921579618116916, "learning_rate": 2.1331147360972567e-07, "loss": 1.4479, "step": 2466 }, { "epoch": 0.5120381901203819, "grad_norm": 0.7598708051121843, "learning_rate": 2.131941865085502e-07, "loss": 1.5294, "step": 2467 }, { "epoch": 0.5122457451224575, "grad_norm": 0.6713978721464637, "learning_rate": 2.1307690230102594e-07, "loss": 1.5361, "step": 2468 }, { "epoch": 0.512453300124533, "grad_norm": 0.8762878443709896, "learning_rate": 2.1295962103701894e-07, "loss": 1.497, "step": 2469 }, { "epoch": 0.5126608551266085, "grad_norm": 0.9048152481656528, "learning_rate": 2.1284234276639426e-07, "loss": 1.4793, "step": 2470 }, { "epoch": 0.5128684101286841, "grad_norm": 0.7156511024565448, "learning_rate": 2.1272506753901534e-07, "loss": 1.5092, "step": 2471 }, { "epoch": 0.5130759651307597, "grad_norm": 0.696532616525458, "learning_rate": 2.1260779540474457e-07, "loss": 1.5417, "step": 2472 }, { "epoch": 0.5132835201328352, "grad_norm": 0.6927337066286205, "learning_rate": 2.1249052641344302e-07, "loss": 1.5053, "step": 2473 }, { "epoch": 0.5134910751349108, "grad_norm": 0.7442089820585011, "learning_rate": 2.1237326061497017e-07, "loss": 1.4757, "step": 2474 }, { "epoch": 0.5136986301369864, "grad_norm": 0.8246407513578571, "learning_rate": 2.1225599805918448e-07, "loss": 1.5193, "step": 2475 }, { "epoch": 0.5139061851390618, "grad_norm": 0.8641013857303477, "learning_rate": 2.1213873879594288e-07, "loss": 1.5032, "step": 2476 }, { "epoch": 0.5141137401411374, "grad_norm": 2.247438914337527, "learning_rate": 2.1202148287510075e-07, "loss": 1.4839, "step": 2477 }, { "epoch": 0.5143212951432129, "grad_norm": 0.6932341896726947, "learning_rate": 2.119042303465124e-07, "loss": 1.5523, "step": 2478 }, { "epoch": 0.5145288501452885, "grad_norm": 0.7732484587753624, "learning_rate": 2.1178698126003027e-07, "loss": 1.4939, "step": 2479 }, { "epoch": 0.5147364051473641, "grad_norm": 0.9041529393067483, "learning_rate": 2.1166973566550564e-07, "loss": 1.5704, "step": 2480 }, { "epoch": 0.5149439601494396, "grad_norm": 0.8410247703754223, "learning_rate": 2.1155249361278832e-07, "loss": 1.5702, "step": 2481 }, { "epoch": 0.5151515151515151, "grad_norm": 0.8103445323108253, "learning_rate": 2.114352551517264e-07, "loss": 1.5048, "step": 2482 }, { "epoch": 0.5153590701535907, "grad_norm": 1.1187607543104976, "learning_rate": 2.113180203321666e-07, "loss": 1.5225, "step": 2483 }, { "epoch": 0.5155666251556662, "grad_norm": 0.7353532302101807, "learning_rate": 2.1120078920395414e-07, "loss": 1.4607, "step": 2484 }, { "epoch": 0.5157741801577418, "grad_norm": 1.0769957396925915, "learning_rate": 2.1108356181693242e-07, "loss": 1.4906, "step": 2485 }, { "epoch": 0.5159817351598174, "grad_norm": 1.527339486433276, "learning_rate": 2.109663382209435e-07, "loss": 1.5283, "step": 2486 }, { "epoch": 0.5161892901618929, "grad_norm": 0.7213794450479653, "learning_rate": 2.1084911846582782e-07, "loss": 1.581, "step": 2487 }, { "epoch": 0.5163968451639684, "grad_norm": 1.005543951952498, "learning_rate": 2.10731902601424e-07, "loss": 1.5208, "step": 2488 }, { "epoch": 0.516604400166044, "grad_norm": 0.826196582424875, "learning_rate": 2.1061469067756907e-07, "loss": 1.4791, "step": 2489 }, { "epoch": 0.5168119551681195, "grad_norm": 1.0287051244940715, "learning_rate": 2.1049748274409863e-07, "loss": 1.5513, "step": 2490 }, { "epoch": 0.5170195101701951, "grad_norm": 0.9585164554639976, "learning_rate": 2.1038027885084612e-07, "loss": 1.5184, "step": 2491 }, { "epoch": 0.5172270651722707, "grad_norm": 0.7865200612734766, "learning_rate": 2.1026307904764367e-07, "loss": 1.4974, "step": 2492 }, { "epoch": 0.5174346201743462, "grad_norm": 1.3845069727907573, "learning_rate": 2.1014588338432157e-07, "loss": 1.4595, "step": 2493 }, { "epoch": 0.5176421751764217, "grad_norm": 0.690391057181294, "learning_rate": 2.1002869191070825e-07, "loss": 1.5095, "step": 2494 }, { "epoch": 0.5178497301784973, "grad_norm": 0.758964411680427, "learning_rate": 2.099115046766303e-07, "loss": 1.4986, "step": 2495 }, { "epoch": 0.5180572851805728, "grad_norm": 0.6995193011205069, "learning_rate": 2.0979432173191284e-07, "loss": 1.5073, "step": 2496 }, { "epoch": 0.5182648401826484, "grad_norm": 0.6745500359558347, "learning_rate": 2.0967714312637877e-07, "loss": 1.5596, "step": 2497 }, { "epoch": 0.518472395184724, "grad_norm": 8.379139661600988, "learning_rate": 2.0955996890984938e-07, "loss": 1.4595, "step": 2498 }, { "epoch": 0.5186799501867995, "grad_norm": 0.7309652404488921, "learning_rate": 2.0944279913214414e-07, "loss": 1.5826, "step": 2499 }, { "epoch": 0.518887505188875, "grad_norm": 0.7936397903089487, "learning_rate": 2.0932563384308032e-07, "loss": 1.5901, "step": 2500 }, { "epoch": 0.5190950601909506, "grad_norm": 1.2203690484503538, "learning_rate": 2.092084730924736e-07, "loss": 1.4968, "step": 2501 }, { "epoch": 0.5193026151930261, "grad_norm": 0.8466712403175027, "learning_rate": 2.0909131693013772e-07, "loss": 1.5396, "step": 2502 }, { "epoch": 0.5195101701951017, "grad_norm": 0.6963177397067928, "learning_rate": 2.0897416540588418e-07, "loss": 1.4834, "step": 2503 }, { "epoch": 0.5197177251971773, "grad_norm": 0.6624086497063163, "learning_rate": 2.088570185695228e-07, "loss": 1.5081, "step": 2504 }, { "epoch": 0.5199252801992528, "grad_norm": 0.6258416757370212, "learning_rate": 2.087398764708614e-07, "loss": 1.4706, "step": 2505 }, { "epoch": 0.5201328352013284, "grad_norm": 0.9068187223454897, "learning_rate": 2.0862273915970548e-07, "loss": 1.5718, "step": 2506 }, { "epoch": 0.520340390203404, "grad_norm": 0.7166672898104127, "learning_rate": 2.085056066858588e-07, "loss": 1.558, "step": 2507 }, { "epoch": 0.5205479452054794, "grad_norm": 0.9092022844852634, "learning_rate": 2.083884790991231e-07, "loss": 1.588, "step": 2508 }, { "epoch": 0.520755500207555, "grad_norm": 0.7282167983136697, "learning_rate": 2.0827135644929771e-07, "loss": 1.4866, "step": 2509 }, { "epoch": 0.5209630552096306, "grad_norm": 1.4645709270681502, "learning_rate": 2.0815423878618024e-07, "loss": 1.4826, "step": 2510 }, { "epoch": 0.5211706102117061, "grad_norm": 1.1634417026671537, "learning_rate": 2.0803712615956598e-07, "loss": 1.5624, "step": 2511 }, { "epoch": 0.5213781652137817, "grad_norm": 0.5994248350347131, "learning_rate": 2.07920018619248e-07, "loss": 1.4907, "step": 2512 }, { "epoch": 0.5215857202158573, "grad_norm": 0.854130339608346, "learning_rate": 2.0780291621501745e-07, "loss": 1.4797, "step": 2513 }, { "epoch": 0.5217932752179327, "grad_norm": 1.4762152580693124, "learning_rate": 2.0768581899666314e-07, "loss": 1.4978, "step": 2514 }, { "epoch": 0.5220008302200083, "grad_norm": 0.70458294709752, "learning_rate": 2.075687270139716e-07, "loss": 1.5031, "step": 2515 }, { "epoch": 0.5222083852220839, "grad_norm": 0.6863497182521193, "learning_rate": 2.0745164031672734e-07, "loss": 1.5239, "step": 2516 }, { "epoch": 0.5224159402241594, "grad_norm": 1.2072481621083504, "learning_rate": 2.073345589547125e-07, "loss": 1.5273, "step": 2517 }, { "epoch": 0.522623495226235, "grad_norm": 0.778691246550858, "learning_rate": 2.0721748297770691e-07, "loss": 1.6001, "step": 2518 }, { "epoch": 0.5228310502283106, "grad_norm": 1.1136332407815033, "learning_rate": 2.0710041243548818e-07, "loss": 1.5537, "step": 2519 }, { "epoch": 0.523038605230386, "grad_norm": 0.8649465687966319, "learning_rate": 2.0698334737783166e-07, "loss": 1.4557, "step": 2520 }, { "epoch": 0.5232461602324616, "grad_norm": 1.1643855257735907, "learning_rate": 2.0686628785451027e-07, "loss": 1.5895, "step": 2521 }, { "epoch": 0.5234537152345371, "grad_norm": 1.1166779145001688, "learning_rate": 2.0674923391529458e-07, "loss": 1.5445, "step": 2522 }, { "epoch": 0.5236612702366127, "grad_norm": 1.478756010933752, "learning_rate": 2.0663218560995285e-07, "loss": 1.5654, "step": 2523 }, { "epoch": 0.5238688252386883, "grad_norm": 0.6516633123967659, "learning_rate": 2.0651514298825087e-07, "loss": 1.5366, "step": 2524 }, { "epoch": 0.5240763802407638, "grad_norm": 0.7161622602381621, "learning_rate": 2.0639810609995214e-07, "loss": 1.5313, "step": 2525 }, { "epoch": 0.5242839352428393, "grad_norm": 0.7078844579987807, "learning_rate": 2.0628107499481756e-07, "loss": 1.5615, "step": 2526 }, { "epoch": 0.5244914902449149, "grad_norm": 0.6908546622783227, "learning_rate": 2.0616404972260565e-07, "loss": 1.5223, "step": 2527 }, { "epoch": 0.5246990452469904, "grad_norm": 0.9980560645697012, "learning_rate": 2.0604703033307257e-07, "loss": 1.5172, "step": 2528 }, { "epoch": 0.524906600249066, "grad_norm": 0.7512472252202701, "learning_rate": 2.0593001687597167e-07, "loss": 1.5654, "step": 2529 }, { "epoch": 0.5251141552511416, "grad_norm": 1.579620471724761, "learning_rate": 2.0581300940105403e-07, "loss": 1.4265, "step": 2530 }, { "epoch": 0.5253217102532171, "grad_norm": 0.7751885985002903, "learning_rate": 2.056960079580683e-07, "loss": 1.4854, "step": 2531 }, { "epoch": 0.5255292652552926, "grad_norm": 0.655931305698934, "learning_rate": 2.055790125967601e-07, "loss": 1.5715, "step": 2532 }, { "epoch": 0.5257368202573682, "grad_norm": 1.783292058223657, "learning_rate": 2.0546202336687291e-07, "loss": 1.5745, "step": 2533 }, { "epoch": 0.5259443752594437, "grad_norm": 0.787009686242263, "learning_rate": 2.0534504031814746e-07, "loss": 1.52, "step": 2534 }, { "epoch": 0.5261519302615193, "grad_norm": 0.959048562325366, "learning_rate": 2.0522806350032175e-07, "loss": 1.5391, "step": 2535 }, { "epoch": 0.5263594852635949, "grad_norm": 0.8527148164380743, "learning_rate": 2.0511109296313126e-07, "loss": 1.6084, "step": 2536 }, { "epoch": 0.5265670402656704, "grad_norm": 0.7107121194407611, "learning_rate": 2.049941287563089e-07, "loss": 1.5214, "step": 2537 }, { "epoch": 0.526774595267746, "grad_norm": 1.2482997698995304, "learning_rate": 2.0487717092958446e-07, "loss": 1.4906, "step": 2538 }, { "epoch": 0.5269821502698215, "grad_norm": 0.7072247277798667, "learning_rate": 2.0476021953268546e-07, "loss": 1.5156, "step": 2539 }, { "epoch": 0.527189705271897, "grad_norm": 0.7728478784544087, "learning_rate": 2.0464327461533664e-07, "loss": 1.558, "step": 2540 }, { "epoch": 0.5273972602739726, "grad_norm": 0.8284025800637722, "learning_rate": 2.0452633622725964e-07, "loss": 1.4653, "step": 2541 }, { "epoch": 0.5276048152760482, "grad_norm": 0.7864178193578892, "learning_rate": 2.0440940441817368e-07, "loss": 1.5375, "step": 2542 }, { "epoch": 0.5278123702781237, "grad_norm": 0.983310189126021, "learning_rate": 2.0429247923779513e-07, "loss": 1.4949, "step": 2543 }, { "epoch": 0.5280199252801993, "grad_norm": 0.7467909625989806, "learning_rate": 2.0417556073583735e-07, "loss": 1.5651, "step": 2544 }, { "epoch": 0.5282274802822748, "grad_norm": 0.9296866953037852, "learning_rate": 2.0405864896201103e-07, "loss": 1.4697, "step": 2545 }, { "epoch": 0.5284350352843503, "grad_norm": 1.22643417988263, "learning_rate": 2.0394174396602398e-07, "loss": 1.5398, "step": 2546 }, { "epoch": 0.5286425902864259, "grad_norm": 0.8074935583982986, "learning_rate": 2.0382484579758103e-07, "loss": 1.5069, "step": 2547 }, { "epoch": 0.5288501452885015, "grad_norm": 0.7339167135498497, "learning_rate": 2.0370795450638423e-07, "loss": 1.4936, "step": 2548 }, { "epoch": 0.529057700290577, "grad_norm": 0.8308385068673543, "learning_rate": 2.035910701421327e-07, "loss": 1.5502, "step": 2549 }, { "epoch": 0.5292652552926526, "grad_norm": 0.9291711743124204, "learning_rate": 2.0347419275452244e-07, "loss": 1.5193, "step": 2550 }, { "epoch": 0.5294728102947281, "grad_norm": 0.7043226671689163, "learning_rate": 2.0335732239324668e-07, "loss": 1.5815, "step": 2551 }, { "epoch": 0.5296803652968036, "grad_norm": 0.6734361721890403, "learning_rate": 2.032404591079957e-07, "loss": 1.6102, "step": 2552 }, { "epoch": 0.5298879202988792, "grad_norm": 0.6880087259342141, "learning_rate": 2.0312360294845649e-07, "loss": 1.5458, "step": 2553 }, { "epoch": 0.5300954753009548, "grad_norm": 1.2872455794096174, "learning_rate": 2.0300675396431325e-07, "loss": 1.5311, "step": 2554 }, { "epoch": 0.5303030303030303, "grad_norm": 0.7272249077581653, "learning_rate": 2.0288991220524716e-07, "loss": 1.5008, "step": 2555 }, { "epoch": 0.5305105853051059, "grad_norm": 0.7762425445406153, "learning_rate": 2.0277307772093608e-07, "loss": 1.4858, "step": 2556 }, { "epoch": 0.5307181403071815, "grad_norm": 0.713950185719675, "learning_rate": 2.02656250561055e-07, "loss": 1.5028, "step": 2557 }, { "epoch": 0.5309256953092569, "grad_norm": 0.7962183171099543, "learning_rate": 2.0253943077527582e-07, "loss": 1.5346, "step": 2558 }, { "epoch": 0.5311332503113325, "grad_norm": 0.6455917051944241, "learning_rate": 2.02422618413267e-07, "loss": 1.5303, "step": 2559 }, { "epoch": 0.5313408053134081, "grad_norm": 0.7354275031634365, "learning_rate": 2.0230581352469424e-07, "loss": 1.4789, "step": 2560 }, { "epoch": 0.5315483603154836, "grad_norm": 0.6923837539251685, "learning_rate": 2.0218901615921982e-07, "loss": 1.4758, "step": 2561 }, { "epoch": 0.5317559153175592, "grad_norm": 1.069232011110929, "learning_rate": 2.0207222636650286e-07, "loss": 1.4795, "step": 2562 }, { "epoch": 0.5319634703196348, "grad_norm": 0.7616210867512286, "learning_rate": 2.019554441961993e-07, "loss": 1.5087, "step": 2563 }, { "epoch": 0.5321710253217102, "grad_norm": 0.688797360280766, "learning_rate": 2.018386696979618e-07, "loss": 1.4573, "step": 2564 }, { "epoch": 0.5323785803237858, "grad_norm": 0.7302463794444253, "learning_rate": 2.017219029214398e-07, "loss": 1.6092, "step": 2565 }, { "epoch": 0.5325861353258613, "grad_norm": 0.7844837275607282, "learning_rate": 2.0160514391627945e-07, "loss": 1.4471, "step": 2566 }, { "epoch": 0.5327936903279369, "grad_norm": 1.2505709916627947, "learning_rate": 2.014883927321235e-07, "loss": 1.4994, "step": 2567 }, { "epoch": 0.5330012453300125, "grad_norm": 0.8899884888286768, "learning_rate": 2.013716494186115e-07, "loss": 1.5106, "step": 2568 }, { "epoch": 0.533208800332088, "grad_norm": 0.7967945892716943, "learning_rate": 2.0125491402537972e-07, "loss": 1.5715, "step": 2569 }, { "epoch": 0.5334163553341635, "grad_norm": 0.6324024271307198, "learning_rate": 2.0113818660206072e-07, "loss": 1.437, "step": 2570 }, { "epoch": 0.5336239103362391, "grad_norm": 4.3694699066472005, "learning_rate": 2.0102146719828404e-07, "loss": 1.4952, "step": 2571 }, { "epoch": 0.5338314653383146, "grad_norm": 0.8139766291392232, "learning_rate": 2.009047558636757e-07, "loss": 1.5178, "step": 2572 }, { "epoch": 0.5340390203403902, "grad_norm": 0.7086424050980737, "learning_rate": 2.0078805264785822e-07, "loss": 1.4464, "step": 2573 }, { "epoch": 0.5342465753424658, "grad_norm": 1.3878930183771134, "learning_rate": 2.0067135760045065e-07, "loss": 1.505, "step": 2574 }, { "epoch": 0.5344541303445413, "grad_norm": 1.294481870827835, "learning_rate": 2.0055467077106876e-07, "loss": 1.6181, "step": 2575 }, { "epoch": 0.5346616853466168, "grad_norm": 1.7689129760619762, "learning_rate": 2.0043799220932453e-07, "loss": 1.564, "step": 2576 }, { "epoch": 0.5348692403486924, "grad_norm": 1.0651299072503784, "learning_rate": 2.0032132196482668e-07, "loss": 1.5243, "step": 2577 }, { "epoch": 0.5350767953507679, "grad_norm": 1.3711884382001511, "learning_rate": 2.002046600871804e-07, "loss": 1.4992, "step": 2578 }, { "epoch": 0.5352843503528435, "grad_norm": 0.7601701275761795, "learning_rate": 2.00088006625987e-07, "loss": 1.4679, "step": 2579 }, { "epoch": 0.5354919053549191, "grad_norm": 0.7558283542940453, "learning_rate": 1.999713616308446e-07, "loss": 1.4558, "step": 2580 }, { "epoch": 0.5356994603569946, "grad_norm": 0.7442850091165438, "learning_rate": 1.9985472515134752e-07, "loss": 1.5164, "step": 2581 }, { "epoch": 0.5359070153590701, "grad_norm": 0.7766475963883929, "learning_rate": 1.9973809723708642e-07, "loss": 1.4894, "step": 2582 }, { "epoch": 0.5361145703611457, "grad_norm": 0.7115226359159237, "learning_rate": 1.9962147793764847e-07, "loss": 1.5156, "step": 2583 }, { "epoch": 0.5363221253632212, "grad_norm": 0.7948940522129384, "learning_rate": 1.9950486730261714e-07, "loss": 1.5183, "step": 2584 }, { "epoch": 0.5365296803652968, "grad_norm": 0.8025008089632144, "learning_rate": 1.9938826538157208e-07, "loss": 1.4771, "step": 2585 }, { "epoch": 0.5367372353673724, "grad_norm": 0.8724798253088313, "learning_rate": 1.992716722240893e-07, "loss": 1.5546, "step": 2586 }, { "epoch": 0.5369447903694479, "grad_norm": 0.6927123344277129, "learning_rate": 1.9915508787974127e-07, "loss": 1.5162, "step": 2587 }, { "epoch": 0.5371523453715235, "grad_norm": 0.7849352002811072, "learning_rate": 1.9903851239809645e-07, "loss": 1.577, "step": 2588 }, { "epoch": 0.537359900373599, "grad_norm": 1.001484972223679, "learning_rate": 1.9892194582871964e-07, "loss": 1.4782, "step": 2589 }, { "epoch": 0.5375674553756745, "grad_norm": 0.814298982399508, "learning_rate": 1.9880538822117194e-07, "loss": 1.4753, "step": 2590 }, { "epoch": 0.5377750103777501, "grad_norm": 0.7479124766756335, "learning_rate": 1.9868883962501043e-07, "loss": 1.5312, "step": 2591 }, { "epoch": 0.5379825653798257, "grad_norm": 1.5220964061406288, "learning_rate": 1.985723000897885e-07, "loss": 1.5761, "step": 2592 }, { "epoch": 0.5381901203819012, "grad_norm": 0.6334335219464433, "learning_rate": 1.9845576966505578e-07, "loss": 1.5532, "step": 2593 }, { "epoch": 0.5383976753839768, "grad_norm": 0.9226501087900151, "learning_rate": 1.9833924840035773e-07, "loss": 1.5375, "step": 2594 }, { "epoch": 0.5386052303860523, "grad_norm": 0.7677883613361626, "learning_rate": 1.9822273634523627e-07, "loss": 1.458, "step": 2595 }, { "epoch": 0.5388127853881278, "grad_norm": 0.7881348689251744, "learning_rate": 1.9810623354922922e-07, "loss": 1.602, "step": 2596 }, { "epoch": 0.5390203403902034, "grad_norm": 0.9850253695586121, "learning_rate": 1.9798974006187033e-07, "loss": 1.5089, "step": 2597 }, { "epoch": 0.539227895392279, "grad_norm": 1.001149933615508, "learning_rate": 1.9787325593268962e-07, "loss": 1.5188, "step": 2598 }, { "epoch": 0.5394354503943545, "grad_norm": 0.8002193561602818, "learning_rate": 1.9775678121121308e-07, "loss": 1.5664, "step": 2599 }, { "epoch": 0.5396430053964301, "grad_norm": 0.6582182484610896, "learning_rate": 1.9764031594696266e-07, "loss": 1.4626, "step": 2600 }, { "epoch": 0.5398505603985057, "grad_norm": 1.1128223269784405, "learning_rate": 1.9752386018945627e-07, "loss": 1.5202, "step": 2601 }, { "epoch": 0.5400581154005811, "grad_norm": 1.0129186404957065, "learning_rate": 1.9740741398820783e-07, "loss": 1.5198, "step": 2602 }, { "epoch": 0.5402656704026567, "grad_norm": 1.0330207638800408, "learning_rate": 1.9729097739272716e-07, "loss": 1.4579, "step": 2603 }, { "epoch": 0.5404732254047323, "grad_norm": 0.6994365836644076, "learning_rate": 1.9717455045251997e-07, "loss": 1.5533, "step": 2604 }, { "epoch": 0.5406807804068078, "grad_norm": 0.847396599098467, "learning_rate": 1.9705813321708803e-07, "loss": 1.5015, "step": 2605 }, { "epoch": 0.5408883354088834, "grad_norm": 1.0987093683386875, "learning_rate": 1.9694172573592872e-07, "loss": 1.5718, "step": 2606 }, { "epoch": 0.541095890410959, "grad_norm": 1.4286627190603036, "learning_rate": 1.9682532805853542e-07, "loss": 1.4825, "step": 2607 }, { "epoch": 0.5413034454130344, "grad_norm": 0.6840506621435607, "learning_rate": 1.967089402343975e-07, "loss": 1.4244, "step": 2608 }, { "epoch": 0.54151100041511, "grad_norm": 3.760057848458652, "learning_rate": 1.9659256231299976e-07, "loss": 1.5407, "step": 2609 }, { "epoch": 0.5417185554171855, "grad_norm": 0.6711912130269639, "learning_rate": 1.9647619434382317e-07, "loss": 1.5012, "step": 2610 }, { "epoch": 0.5419261104192611, "grad_norm": 0.7361567980378269, "learning_rate": 1.9635983637634413e-07, "loss": 1.5993, "step": 2611 }, { "epoch": 0.5421336654213367, "grad_norm": 0.7413045068438774, "learning_rate": 1.9624348846003507e-07, "loss": 1.5273, "step": 2612 }, { "epoch": 0.5423412204234122, "grad_norm": 0.6876218435027974, "learning_rate": 1.9612715064436402e-07, "loss": 1.4582, "step": 2613 }, { "epoch": 0.5425487754254877, "grad_norm": 0.6831856447717607, "learning_rate": 1.9601082297879473e-07, "loss": 1.5518, "step": 2614 }, { "epoch": 0.5427563304275633, "grad_norm": 0.7701202309594065, "learning_rate": 1.9589450551278665e-07, "loss": 1.5647, "step": 2615 }, { "epoch": 0.5429638854296388, "grad_norm": 0.7506364075714071, "learning_rate": 1.957781982957949e-07, "loss": 1.477, "step": 2616 }, { "epoch": 0.5431714404317144, "grad_norm": 0.8723358264886831, "learning_rate": 1.9566190137727015e-07, "loss": 1.5253, "step": 2617 }, { "epoch": 0.54337899543379, "grad_norm": 0.8627677965500178, "learning_rate": 1.9554561480665872e-07, "loss": 1.5289, "step": 2618 }, { "epoch": 0.5435865504358655, "grad_norm": 0.7130662303568074, "learning_rate": 1.9542933863340277e-07, "loss": 1.5445, "step": 2619 }, { "epoch": 0.543794105437941, "grad_norm": 1.7699211116355538, "learning_rate": 1.9531307290693966e-07, "loss": 1.4854, "step": 2620 }, { "epoch": 0.5440016604400166, "grad_norm": 3.7505287485680316, "learning_rate": 1.9519681767670248e-07, "loss": 1.4785, "step": 2621 }, { "epoch": 0.5442092154420921, "grad_norm": 0.7132708475910746, "learning_rate": 1.9508057299212006e-07, "loss": 1.5088, "step": 2622 }, { "epoch": 0.5444167704441677, "grad_norm": 0.8104327028031877, "learning_rate": 1.9496433890261637e-07, "loss": 1.6185, "step": 2623 }, { "epoch": 0.5446243254462433, "grad_norm": 0.7107408496719876, "learning_rate": 1.948481154576111e-07, "loss": 1.49, "step": 2624 }, { "epoch": 0.5448318804483188, "grad_norm": 0.9847344267353024, "learning_rate": 1.9473190270651946e-07, "loss": 1.5385, "step": 2625 }, { "epoch": 0.5450394354503943, "grad_norm": 0.8365847136395933, "learning_rate": 1.9461570069875189e-07, "loss": 1.5188, "step": 2626 }, { "epoch": 0.5452469904524699, "grad_norm": 0.8676422222843052, "learning_rate": 1.9449950948371452e-07, "loss": 1.4884, "step": 2627 }, { "epoch": 0.5454545454545454, "grad_norm": 0.9724202963012527, "learning_rate": 1.943833291108087e-07, "loss": 1.4944, "step": 2628 }, { "epoch": 0.545662100456621, "grad_norm": 0.938996661882557, "learning_rate": 1.9426715962943124e-07, "loss": 1.5254, "step": 2629 }, { "epoch": 0.5458696554586966, "grad_norm": 0.7555234906413024, "learning_rate": 1.9415100108897433e-07, "loss": 1.535, "step": 2630 }, { "epoch": 0.5460772104607721, "grad_norm": 0.8342994351519754, "learning_rate": 1.9403485353882556e-07, "loss": 1.5575, "step": 2631 }, { "epoch": 0.5462847654628477, "grad_norm": 1.4670984804883267, "learning_rate": 1.9391871702836767e-07, "loss": 1.5168, "step": 2632 }, { "epoch": 0.5464923204649232, "grad_norm": 0.7000872578901604, "learning_rate": 1.938025916069789e-07, "loss": 1.6288, "step": 2633 }, { "epoch": 0.5466998754669987, "grad_norm": 0.7460420704703576, "learning_rate": 1.936864773240327e-07, "loss": 1.4856, "step": 2634 }, { "epoch": 0.5469074304690743, "grad_norm": 0.9654472569021206, "learning_rate": 1.9357037422889775e-07, "loss": 1.5593, "step": 2635 }, { "epoch": 0.5471149854711499, "grad_norm": 0.6153019995606699, "learning_rate": 1.9345428237093796e-07, "loss": 1.489, "step": 2636 }, { "epoch": 0.5473225404732254, "grad_norm": 0.6563171925169946, "learning_rate": 1.9333820179951265e-07, "loss": 1.5439, "step": 2637 }, { "epoch": 0.547530095475301, "grad_norm": 0.7718146302500087, "learning_rate": 1.9322213256397607e-07, "loss": 1.5064, "step": 2638 }, { "epoch": 0.5477376504773765, "grad_norm": 0.6844069954284052, "learning_rate": 1.9310607471367776e-07, "loss": 1.508, "step": 2639 }, { "epoch": 0.547945205479452, "grad_norm": 0.835347122756183, "learning_rate": 1.9299002829796253e-07, "loss": 1.5552, "step": 2640 }, { "epoch": 0.5481527604815276, "grad_norm": 1.1051777780740364, "learning_rate": 1.9287399336617013e-07, "loss": 1.5357, "step": 2641 }, { "epoch": 0.5483603154836032, "grad_norm": 0.9967497623851438, "learning_rate": 1.927579699676357e-07, "loss": 1.5422, "step": 2642 }, { "epoch": 0.5485678704856787, "grad_norm": 0.6450612345188861, "learning_rate": 1.9264195815168917e-07, "loss": 1.5541, "step": 2643 }, { "epoch": 0.5487754254877543, "grad_norm": 0.9532642709277006, "learning_rate": 1.925259579676557e-07, "loss": 1.4528, "step": 2644 }, { "epoch": 0.5489829804898299, "grad_norm": 0.7023915731412769, "learning_rate": 1.924099694648555e-07, "loss": 1.555, "step": 2645 }, { "epoch": 0.5491905354919053, "grad_norm": 0.7285824094094605, "learning_rate": 1.922939926926039e-07, "loss": 1.5087, "step": 2646 }, { "epoch": 0.5493980904939809, "grad_norm": 4.718979803607192, "learning_rate": 1.921780277002109e-07, "loss": 1.4276, "step": 2647 }, { "epoch": 0.5496056454960565, "grad_norm": 1.626087856849418, "learning_rate": 1.9206207453698196e-07, "loss": 1.4988, "step": 2648 }, { "epoch": 0.549813200498132, "grad_norm": 0.7161478862989256, "learning_rate": 1.919461332522173e-07, "loss": 1.5385, "step": 2649 }, { "epoch": 0.5500207555002076, "grad_norm": 1.158163434777051, "learning_rate": 1.918302038952119e-07, "loss": 1.5257, "step": 2650 }, { "epoch": 0.5502283105022832, "grad_norm": 0.8710489994965636, "learning_rate": 1.9171428651525594e-07, "loss": 1.4933, "step": 2651 }, { "epoch": 0.5504358655043586, "grad_norm": 0.7084888585141259, "learning_rate": 1.9159838116163445e-07, "loss": 1.5889, "step": 2652 }, { "epoch": 0.5506434205064342, "grad_norm": 0.7457934447926522, "learning_rate": 1.9148248788362725e-07, "loss": 1.6317, "step": 2653 }, { "epoch": 0.5508509755085098, "grad_norm": 0.7462125314074037, "learning_rate": 1.9136660673050908e-07, "loss": 1.5664, "step": 2654 }, { "epoch": 0.5510585305105853, "grad_norm": 0.8513899894602226, "learning_rate": 1.912507377515496e-07, "loss": 1.5258, "step": 2655 }, { "epoch": 0.5512660855126609, "grad_norm": 1.213717355933964, "learning_rate": 1.9113488099601316e-07, "loss": 1.5186, "step": 2656 }, { "epoch": 0.5514736405147364, "grad_norm": 0.8398296246797831, "learning_rate": 1.9101903651315903e-07, "loss": 1.5114, "step": 2657 }, { "epoch": 0.5516811955168119, "grad_norm": 1.2823849352393197, "learning_rate": 1.909032043522411e-07, "loss": 1.5577, "step": 2658 }, { "epoch": 0.5518887505188875, "grad_norm": 0.9858566423500901, "learning_rate": 1.9078738456250822e-07, "loss": 1.516, "step": 2659 }, { "epoch": 0.552096305520963, "grad_norm": 1.1533939088957754, "learning_rate": 1.9067157719320398e-07, "loss": 1.5529, "step": 2660 }, { "epoch": 0.5523038605230386, "grad_norm": 0.7309259460140325, "learning_rate": 1.9055578229356635e-07, "loss": 1.4912, "step": 2661 }, { "epoch": 0.5525114155251142, "grad_norm": 0.8819161146840586, "learning_rate": 1.9043999991282843e-07, "loss": 1.5093, "step": 2662 }, { "epoch": 0.5527189705271897, "grad_norm": 1.547214659924875, "learning_rate": 1.9032423010021783e-07, "loss": 1.5139, "step": 2663 }, { "epoch": 0.5529265255292652, "grad_norm": 0.9963851536672461, "learning_rate": 1.902084729049567e-07, "loss": 1.5319, "step": 2664 }, { "epoch": 0.5531340805313408, "grad_norm": 0.8097640047372465, "learning_rate": 1.9009272837626193e-07, "loss": 1.5667, "step": 2665 }, { "epoch": 0.5533416355334163, "grad_norm": 0.8382361931016105, "learning_rate": 1.8997699656334514e-07, "loss": 1.5423, "step": 2666 }, { "epoch": 0.5535491905354919, "grad_norm": 1.0250922342427287, "learning_rate": 1.898612775154123e-07, "loss": 1.5439, "step": 2667 }, { "epoch": 0.5537567455375675, "grad_norm": 1.0918296168075359, "learning_rate": 1.8974557128166412e-07, "loss": 1.5251, "step": 2668 }, { "epoch": 0.553964300539643, "grad_norm": 0.9430203632150722, "learning_rate": 1.8962987791129587e-07, "loss": 1.445, "step": 2669 }, { "epoch": 0.5541718555417185, "grad_norm": 0.6625904624933437, "learning_rate": 1.895141974534972e-07, "loss": 1.4622, "step": 2670 }, { "epoch": 0.5543794105437941, "grad_norm": 2.797678347456512, "learning_rate": 1.893985299574524e-07, "loss": 1.4945, "step": 2671 }, { "epoch": 0.5545869655458696, "grad_norm": 1.7556413512574123, "learning_rate": 1.8928287547234034e-07, "loss": 1.5745, "step": 2672 }, { "epoch": 0.5547945205479452, "grad_norm": 0.9866484212728677, "learning_rate": 1.8916723404733404e-07, "loss": 1.4825, "step": 2673 }, { "epoch": 0.5550020755500208, "grad_norm": 1.0127980911074592, "learning_rate": 1.8905160573160127e-07, "loss": 1.5796, "step": 2674 }, { "epoch": 0.5552096305520963, "grad_norm": 0.7065156647173052, "learning_rate": 1.889359905743042e-07, "loss": 1.5202, "step": 2675 }, { "epoch": 0.5554171855541719, "grad_norm": 1.2487302686251152, "learning_rate": 1.8882038862459915e-07, "loss": 1.547, "step": 2676 }, { "epoch": 0.5556247405562474, "grad_norm": 1.478208011918142, "learning_rate": 1.8870479993163704e-07, "loss": 1.484, "step": 2677 }, { "epoch": 0.5558322955583229, "grad_norm": 0.815336877384556, "learning_rate": 1.8858922454456327e-07, "loss": 1.5019, "step": 2678 }, { "epoch": 0.5560398505603985, "grad_norm": 1.1044039929512466, "learning_rate": 1.884736625125172e-07, "loss": 1.5718, "step": 2679 }, { "epoch": 0.5562474055624741, "grad_norm": 0.665007064219597, "learning_rate": 1.88358113884633e-07, "loss": 1.488, "step": 2680 }, { "epoch": 0.5564549605645496, "grad_norm": 0.6790814995066067, "learning_rate": 1.8824257871003866e-07, "loss": 1.5128, "step": 2681 }, { "epoch": 0.5566625155666252, "grad_norm": 1.0506260031468775, "learning_rate": 1.8812705703785673e-07, "loss": 1.419, "step": 2682 }, { "epoch": 0.5568700705687007, "grad_norm": 0.9181952804045278, "learning_rate": 1.8801154891720391e-07, "loss": 1.5431, "step": 2683 }, { "epoch": 0.5570776255707762, "grad_norm": 0.8247199591341412, "learning_rate": 1.8789605439719134e-07, "loss": 1.5133, "step": 2684 }, { "epoch": 0.5572851805728518, "grad_norm": 1.1506388715183375, "learning_rate": 1.877805735269241e-07, "loss": 1.515, "step": 2685 }, { "epoch": 0.5574927355749274, "grad_norm": 0.9291975605696297, "learning_rate": 1.8766510635550157e-07, "loss": 1.4602, "step": 2686 }, { "epoch": 0.5577002905770029, "grad_norm": 0.8013294926749087, "learning_rate": 1.8754965293201747e-07, "loss": 1.5143, "step": 2687 }, { "epoch": 0.5579078455790785, "grad_norm": 0.9362542771381518, "learning_rate": 1.874342133055594e-07, "loss": 1.4484, "step": 2688 }, { "epoch": 0.558115400581154, "grad_norm": 1.190948033288314, "learning_rate": 1.8731878752520922e-07, "loss": 1.5479, "step": 2689 }, { "epoch": 0.5583229555832295, "grad_norm": 0.8587599737060164, "learning_rate": 1.8720337564004303e-07, "loss": 1.4964, "step": 2690 }, { "epoch": 0.5585305105853051, "grad_norm": 0.8190566264464891, "learning_rate": 1.870879776991307e-07, "loss": 1.5108, "step": 2691 }, { "epoch": 0.5587380655873807, "grad_norm": 0.7502089057972806, "learning_rate": 1.8697259375153657e-07, "loss": 1.5149, "step": 2692 }, { "epoch": 0.5589456205894562, "grad_norm": 0.6261930646183931, "learning_rate": 1.8685722384631872e-07, "loss": 1.5177, "step": 2693 }, { "epoch": 0.5591531755915318, "grad_norm": 0.6954909957302217, "learning_rate": 1.8674186803252942e-07, "loss": 1.5427, "step": 2694 }, { "epoch": 0.5593607305936074, "grad_norm": 0.8081707958785462, "learning_rate": 1.8662652635921478e-07, "loss": 1.5188, "step": 2695 }, { "epoch": 0.5595682855956828, "grad_norm": 1.0336840727140324, "learning_rate": 1.865111988754153e-07, "loss": 1.5569, "step": 2696 }, { "epoch": 0.5597758405977584, "grad_norm": 0.9004309527194245, "learning_rate": 1.8639588563016483e-07, "loss": 1.5987, "step": 2697 }, { "epoch": 0.559983395599834, "grad_norm": 0.8600382351053841, "learning_rate": 1.862805866724917e-07, "loss": 1.4599, "step": 2698 }, { "epoch": 0.5601909506019095, "grad_norm": 0.8325831193058758, "learning_rate": 1.8616530205141795e-07, "loss": 1.5351, "step": 2699 }, { "epoch": 0.5603985056039851, "grad_norm": 0.7393574097641633, "learning_rate": 1.8605003181595947e-07, "loss": 1.5565, "step": 2700 }, { "epoch": 0.5606060606060606, "grad_norm": 0.6309010403150376, "learning_rate": 1.8593477601512625e-07, "loss": 1.4587, "step": 2701 }, { "epoch": 0.5608136156081361, "grad_norm": 1.7628409646136418, "learning_rate": 1.858195346979217e-07, "loss": 1.4865, "step": 2702 }, { "epoch": 0.5610211706102117, "grad_norm": 0.6912674024261483, "learning_rate": 1.8570430791334367e-07, "loss": 1.5231, "step": 2703 }, { "epoch": 0.5612287256122872, "grad_norm": 0.6966170370309946, "learning_rate": 1.8558909571038338e-07, "loss": 1.5428, "step": 2704 }, { "epoch": 0.5614362806143628, "grad_norm": 2.328648662710698, "learning_rate": 1.8547389813802607e-07, "loss": 1.5229, "step": 2705 }, { "epoch": 0.5616438356164384, "grad_norm": 0.6779571511322753, "learning_rate": 1.8535871524525062e-07, "loss": 1.5633, "step": 2706 }, { "epoch": 0.5618513906185139, "grad_norm": 0.731410520775825, "learning_rate": 1.852435470810298e-07, "loss": 1.4934, "step": 2707 }, { "epoch": 0.5620589456205894, "grad_norm": 0.7131051144208489, "learning_rate": 1.8512839369432996e-07, "loss": 1.4917, "step": 2708 }, { "epoch": 0.562266500622665, "grad_norm": 0.8023585879855598, "learning_rate": 1.8501325513411138e-07, "loss": 1.474, "step": 2709 }, { "epoch": 0.5624740556247405, "grad_norm": 0.8758225868910381, "learning_rate": 1.8489813144932797e-07, "loss": 1.5054, "step": 2710 }, { "epoch": 0.5626816106268161, "grad_norm": 0.6229606125937508, "learning_rate": 1.8478302268892704e-07, "loss": 1.4871, "step": 2711 }, { "epoch": 0.5628891656288917, "grad_norm": 0.6872097271487397, "learning_rate": 1.8466792890184993e-07, "loss": 1.5273, "step": 2712 }, { "epoch": 0.5630967206309672, "grad_norm": 0.7667222581343445, "learning_rate": 1.8455285013703152e-07, "loss": 1.4982, "step": 2713 }, { "epoch": 0.5633042756330428, "grad_norm": 0.7748817773205877, "learning_rate": 1.844377864434001e-07, "loss": 1.5322, "step": 2714 }, { "epoch": 0.5635118306351183, "grad_norm": 0.7815913221597046, "learning_rate": 1.8432273786987774e-07, "loss": 1.5468, "step": 2715 }, { "epoch": 0.5637193856371938, "grad_norm": 0.722430599278042, "learning_rate": 1.842077044653801e-07, "loss": 1.5262, "step": 2716 }, { "epoch": 0.5639269406392694, "grad_norm": 0.6839973300725115, "learning_rate": 1.8409268627881623e-07, "loss": 1.4895, "step": 2717 }, { "epoch": 0.564134495641345, "grad_norm": 0.9516528917229495, "learning_rate": 1.8397768335908887e-07, "loss": 1.4988, "step": 2718 }, { "epoch": 0.5643420506434205, "grad_norm": 0.6788231883898889, "learning_rate": 1.838626957550943e-07, "loss": 1.4586, "step": 2719 }, { "epoch": 0.564549605645496, "grad_norm": 0.7480678374192195, "learning_rate": 1.83747723515722e-07, "loss": 1.498, "step": 2720 }, { "epoch": 0.5647571606475716, "grad_norm": 1.1427546986061294, "learning_rate": 1.8363276668985525e-07, "loss": 1.5274, "step": 2721 }, { "epoch": 0.5649647156496471, "grad_norm": 1.096417469322258, "learning_rate": 1.8351782532637068e-07, "loss": 1.5141, "step": 2722 }, { "epoch": 0.5651722706517227, "grad_norm": 1.012014928385167, "learning_rate": 1.8340289947413815e-07, "loss": 1.4919, "step": 2723 }, { "epoch": 0.5653798256537983, "grad_norm": 0.7639563419830245, "learning_rate": 1.832879891820212e-07, "loss": 1.4295, "step": 2724 }, { "epoch": 0.5655873806558738, "grad_norm": 1.1925686093072705, "learning_rate": 1.8317309449887662e-07, "loss": 1.4739, "step": 2725 }, { "epoch": 0.5657949356579494, "grad_norm": 1.037202910007226, "learning_rate": 1.8305821547355448e-07, "loss": 1.539, "step": 2726 }, { "epoch": 0.566002490660025, "grad_norm": 0.8563013831550119, "learning_rate": 1.8294335215489843e-07, "loss": 1.5632, "step": 2727 }, { "epoch": 0.5662100456621004, "grad_norm": 0.8427013856278586, "learning_rate": 1.828285045917453e-07, "loss": 1.503, "step": 2728 }, { "epoch": 0.566417600664176, "grad_norm": 0.7298989480893748, "learning_rate": 1.827136728329251e-07, "loss": 1.5318, "step": 2729 }, { "epoch": 0.5666251556662516, "grad_norm": 0.7845165546987427, "learning_rate": 1.825988569272613e-07, "loss": 1.5243, "step": 2730 }, { "epoch": 0.5668327106683271, "grad_norm": 0.6408042649221842, "learning_rate": 1.8248405692357066e-07, "loss": 1.5679, "step": 2731 }, { "epoch": 0.5670402656704027, "grad_norm": 0.6909984077160943, "learning_rate": 1.8236927287066296e-07, "loss": 1.5052, "step": 2732 }, { "epoch": 0.5672478206724783, "grad_norm": 0.6951777283489345, "learning_rate": 1.8225450481734144e-07, "loss": 1.5199, "step": 2733 }, { "epoch": 0.5674553756745537, "grad_norm": 0.7694456173903054, "learning_rate": 1.8213975281240236e-07, "loss": 1.4955, "step": 2734 }, { "epoch": 0.5676629306766293, "grad_norm": 0.9643460694550313, "learning_rate": 1.8202501690463526e-07, "loss": 1.5716, "step": 2735 }, { "epoch": 0.5678704856787049, "grad_norm": 1.4823493750423102, "learning_rate": 1.8191029714282276e-07, "loss": 1.4821, "step": 2736 }, { "epoch": 0.5680780406807804, "grad_norm": 0.9450821946974228, "learning_rate": 1.8179559357574074e-07, "loss": 1.5334, "step": 2737 }, { "epoch": 0.568285595682856, "grad_norm": 0.7361669666385137, "learning_rate": 1.8168090625215803e-07, "loss": 1.4771, "step": 2738 }, { "epoch": 0.5684931506849316, "grad_norm": 0.7497096119643377, "learning_rate": 1.815662352208367e-07, "loss": 1.4959, "step": 2739 }, { "epoch": 0.568700705687007, "grad_norm": 0.7278737759107909, "learning_rate": 1.814515805305318e-07, "loss": 1.5506, "step": 2740 }, { "epoch": 0.5689082606890826, "grad_norm": 0.7946986738801807, "learning_rate": 1.8133694222999142e-07, "loss": 1.6229, "step": 2741 }, { "epoch": 0.5691158156911582, "grad_norm": 0.8215902947665022, "learning_rate": 1.8122232036795678e-07, "loss": 1.546, "step": 2742 }, { "epoch": 0.5693233706932337, "grad_norm": 0.7579123615211772, "learning_rate": 1.8110771499316204e-07, "loss": 1.5423, "step": 2743 }, { "epoch": 0.5695309256953093, "grad_norm": 0.6938686021950268, "learning_rate": 1.8099312615433432e-07, "loss": 1.4626, "step": 2744 }, { "epoch": 0.5697384806973848, "grad_norm": 0.8070564091763462, "learning_rate": 1.8087855390019385e-07, "loss": 1.5212, "step": 2745 }, { "epoch": 0.5699460356994603, "grad_norm": 1.1471650079910272, "learning_rate": 1.8076399827945354e-07, "loss": 1.4707, "step": 2746 }, { "epoch": 0.5701535907015359, "grad_norm": 1.6139513162128873, "learning_rate": 1.8064945934081958e-07, "loss": 1.5841, "step": 2747 }, { "epoch": 0.5703611457036114, "grad_norm": 0.7614890714888461, "learning_rate": 1.8053493713299082e-07, "loss": 1.5646, "step": 2748 }, { "epoch": 0.570568700705687, "grad_norm": 1.023191354272193, "learning_rate": 1.8042043170465902e-07, "loss": 1.5557, "step": 2749 }, { "epoch": 0.5707762557077626, "grad_norm": 1.2261344768306712, "learning_rate": 1.8030594310450886e-07, "loss": 1.5399, "step": 2750 }, { "epoch": 0.5709838107098381, "grad_norm": 0.723985990597229, "learning_rate": 1.8019147138121794e-07, "loss": 1.5442, "step": 2751 }, { "epoch": 0.5711913657119136, "grad_norm": 0.949264869721745, "learning_rate": 1.800770165834565e-07, "loss": 1.5218, "step": 2752 }, { "epoch": 0.5713989207139892, "grad_norm": 0.8808337471237722, "learning_rate": 1.799625787598877e-07, "loss": 1.5304, "step": 2753 }, { "epoch": 0.5716064757160647, "grad_norm": 0.7164568810738093, "learning_rate": 1.7984815795916753e-07, "loss": 1.5109, "step": 2754 }, { "epoch": 0.5718140307181403, "grad_norm": 0.8489271133851851, "learning_rate": 1.7973375422994456e-07, "loss": 1.4667, "step": 2755 }, { "epoch": 0.5720215857202159, "grad_norm": 1.0910126301979972, "learning_rate": 1.796193676208603e-07, "loss": 1.4165, "step": 2756 }, { "epoch": 0.5722291407222914, "grad_norm": 3.135914016248933, "learning_rate": 1.795049981805489e-07, "loss": 1.489, "step": 2757 }, { "epoch": 0.572436695724367, "grad_norm": 0.7124518973801518, "learning_rate": 1.7939064595763714e-07, "loss": 1.5295, "step": 2758 }, { "epoch": 0.5726442507264425, "grad_norm": 0.9999219194852379, "learning_rate": 1.7927631100074466e-07, "loss": 1.4793, "step": 2759 }, { "epoch": 0.572851805728518, "grad_norm": 0.7301464314158208, "learning_rate": 1.791619933584836e-07, "loss": 1.4797, "step": 2760 }, { "epoch": 0.5730593607305936, "grad_norm": 0.7173427879646389, "learning_rate": 1.790476930794587e-07, "loss": 1.5339, "step": 2761 }, { "epoch": 0.5732669157326692, "grad_norm": 0.6558477164196588, "learning_rate": 1.7893341021226753e-07, "loss": 1.5162, "step": 2762 }, { "epoch": 0.5734744707347447, "grad_norm": 0.7544021447305143, "learning_rate": 1.7881914480550014e-07, "loss": 1.563, "step": 2763 }, { "epoch": 0.5736820257368203, "grad_norm": 0.7627394445166236, "learning_rate": 1.7870489690773904e-07, "loss": 1.5377, "step": 2764 }, { "epoch": 0.5738895807388958, "grad_norm": 2.1990430715287927, "learning_rate": 1.785906665675594e-07, "loss": 1.4977, "step": 2765 }, { "epoch": 0.5740971357409713, "grad_norm": 0.8377752614424979, "learning_rate": 1.7847645383352906e-07, "loss": 1.505, "step": 2766 }, { "epoch": 0.5743046907430469, "grad_norm": 0.9162713093019611, "learning_rate": 1.783622587542081e-07, "loss": 1.5722, "step": 2767 }, { "epoch": 0.5745122457451225, "grad_norm": 0.6948015757648495, "learning_rate": 1.7824808137814933e-07, "loss": 1.5344, "step": 2768 }, { "epoch": 0.574719800747198, "grad_norm": 1.0860076720655756, "learning_rate": 1.7813392175389797e-07, "loss": 1.5259, "step": 2769 }, { "epoch": 0.5749273557492736, "grad_norm": 0.6762461867833881, "learning_rate": 1.7801977992999148e-07, "loss": 1.5648, "step": 2770 }, { "epoch": 0.5751349107513491, "grad_norm": 0.6887915169174699, "learning_rate": 1.7790565595496006e-07, "loss": 1.5001, "step": 2771 }, { "epoch": 0.5753424657534246, "grad_norm": 0.8224992999908507, "learning_rate": 1.7779154987732627e-07, "loss": 1.475, "step": 2772 }, { "epoch": 0.5755500207555002, "grad_norm": 0.8216569245861504, "learning_rate": 1.7767746174560482e-07, "loss": 1.5263, "step": 2773 }, { "epoch": 0.5757575757575758, "grad_norm": 0.7748509807248156, "learning_rate": 1.7756339160830307e-07, "loss": 1.5521, "step": 2774 }, { "epoch": 0.5759651307596513, "grad_norm": 0.8065867638140796, "learning_rate": 1.7744933951392062e-07, "loss": 1.5457, "step": 2775 }, { "epoch": 0.5761726857617269, "grad_norm": 0.7945712872010535, "learning_rate": 1.7733530551094932e-07, "loss": 1.4163, "step": 2776 }, { "epoch": 0.5763802407638025, "grad_norm": 0.7227156515783685, "learning_rate": 1.7722128964787338e-07, "loss": 1.5796, "step": 2777 }, { "epoch": 0.5765877957658779, "grad_norm": 1.1558996117114002, "learning_rate": 1.771072919731695e-07, "loss": 1.5096, "step": 2778 }, { "epoch": 0.5767953507679535, "grad_norm": 0.69368801528994, "learning_rate": 1.7699331253530624e-07, "loss": 1.4507, "step": 2779 }, { "epoch": 0.5770029057700291, "grad_norm": 0.7049065143249547, "learning_rate": 1.7687935138274474e-07, "loss": 1.493, "step": 2780 }, { "epoch": 0.5772104607721046, "grad_norm": 0.880759807237636, "learning_rate": 1.767654085639383e-07, "loss": 1.5459, "step": 2781 }, { "epoch": 0.5774180157741802, "grad_norm": 0.7924430722882332, "learning_rate": 1.7665148412733229e-07, "loss": 1.5162, "step": 2782 }, { "epoch": 0.5776255707762558, "grad_norm": 0.735618198447245, "learning_rate": 1.765375781213643e-07, "loss": 1.5227, "step": 2783 }, { "epoch": 0.5778331257783312, "grad_norm": 0.6610780007571779, "learning_rate": 1.7642369059446435e-07, "loss": 1.506, "step": 2784 }, { "epoch": 0.5780406807804068, "grad_norm": 1.3411785743217628, "learning_rate": 1.763098215950542e-07, "loss": 1.5726, "step": 2785 }, { "epoch": 0.5782482357824824, "grad_norm": 0.6685683317640891, "learning_rate": 1.7619597117154807e-07, "loss": 1.4742, "step": 2786 }, { "epoch": 0.5784557907845579, "grad_norm": 0.6239754715574964, "learning_rate": 1.7608213937235203e-07, "loss": 1.4952, "step": 2787 }, { "epoch": 0.5786633457866335, "grad_norm": 0.957657533000173, "learning_rate": 1.7596832624586438e-07, "loss": 1.5211, "step": 2788 }, { "epoch": 0.578870900788709, "grad_norm": 0.7612572146783587, "learning_rate": 1.758545318404755e-07, "loss": 1.4679, "step": 2789 }, { "epoch": 0.5790784557907845, "grad_norm": 1.6899257873814624, "learning_rate": 1.757407562045676e-07, "loss": 1.5534, "step": 2790 }, { "epoch": 0.5792860107928601, "grad_norm": 0.7248436953699403, "learning_rate": 1.756269993865151e-07, "loss": 1.4489, "step": 2791 }, { "epoch": 0.5794935657949356, "grad_norm": 1.2729965354170167, "learning_rate": 1.755132614346846e-07, "loss": 1.5434, "step": 2792 }, { "epoch": 0.5797011207970112, "grad_norm": 1.0516639089080668, "learning_rate": 1.7539954239743416e-07, "loss": 1.5201, "step": 2793 }, { "epoch": 0.5799086757990868, "grad_norm": 0.7181568457073596, "learning_rate": 1.752858423231142e-07, "loss": 1.5394, "step": 2794 }, { "epoch": 0.5801162308011623, "grad_norm": 0.8539085521713808, "learning_rate": 1.7517216126006704e-07, "loss": 1.5428, "step": 2795 }, { "epoch": 0.5803237858032378, "grad_norm": 0.803091835114635, "learning_rate": 1.7505849925662678e-07, "loss": 1.5689, "step": 2796 }, { "epoch": 0.5805313408053134, "grad_norm": 0.6717985094200712, "learning_rate": 1.7494485636111952e-07, "loss": 1.5273, "step": 2797 }, { "epoch": 0.5807388958073889, "grad_norm": 1.0031030593762762, "learning_rate": 1.7483123262186314e-07, "loss": 1.5406, "step": 2798 }, { "epoch": 0.5809464508094645, "grad_norm": 0.7879728523416093, "learning_rate": 1.7471762808716752e-07, "loss": 1.567, "step": 2799 }, { "epoch": 0.5811540058115401, "grad_norm": 0.8509554724376943, "learning_rate": 1.7460404280533422e-07, "loss": 1.3852, "step": 2800 }, { "epoch": 0.5813615608136156, "grad_norm": 0.8771775591357174, "learning_rate": 1.7449047682465685e-07, "loss": 1.6476, "step": 2801 }, { "epoch": 0.5815691158156912, "grad_norm": 0.694974595372767, "learning_rate": 1.743769301934204e-07, "loss": 1.4773, "step": 2802 }, { "epoch": 0.5817766708177667, "grad_norm": 0.6686018603524398, "learning_rate": 1.7426340295990208e-07, "loss": 1.516, "step": 2803 }, { "epoch": 0.5819842258198422, "grad_norm": 0.6899907104155022, "learning_rate": 1.7414989517237054e-07, "loss": 1.4854, "step": 2804 }, { "epoch": 0.5821917808219178, "grad_norm": 0.6745164516360551, "learning_rate": 1.7403640687908637e-07, "loss": 1.5393, "step": 2805 }, { "epoch": 0.5823993358239934, "grad_norm": 0.7095786756685708, "learning_rate": 1.7392293812830164e-07, "loss": 1.5309, "step": 2806 }, { "epoch": 0.5826068908260689, "grad_norm": 0.8871783972376093, "learning_rate": 1.7380948896826048e-07, "loss": 1.5734, "step": 2807 }, { "epoch": 0.5828144458281445, "grad_norm": 0.6666461280326869, "learning_rate": 1.7369605944719822e-07, "loss": 1.6016, "step": 2808 }, { "epoch": 0.58302200083022, "grad_norm": 0.785056674409609, "learning_rate": 1.7358264961334217e-07, "loss": 1.5668, "step": 2809 }, { "epoch": 0.5832295558322955, "grad_norm": 0.8790225430828947, "learning_rate": 1.7346925951491124e-07, "loss": 1.5746, "step": 2810 }, { "epoch": 0.5834371108343711, "grad_norm": 0.8908804070811525, "learning_rate": 1.7335588920011582e-07, "loss": 1.5646, "step": 2811 }, { "epoch": 0.5836446658364467, "grad_norm": 0.6455106310486284, "learning_rate": 1.7324253871715802e-07, "loss": 1.4157, "step": 2812 }, { "epoch": 0.5838522208385222, "grad_norm": 1.2852366321554003, "learning_rate": 1.731292081142314e-07, "loss": 1.5639, "step": 2813 }, { "epoch": 0.5840597758405978, "grad_norm": 0.736461843347046, "learning_rate": 1.7301589743952115e-07, "loss": 1.5274, "step": 2814 }, { "epoch": 0.5842673308426733, "grad_norm": 1.6491995037109397, "learning_rate": 1.7290260674120388e-07, "loss": 1.5498, "step": 2815 }, { "epoch": 0.5844748858447488, "grad_norm": 0.6870443890116192, "learning_rate": 1.7278933606744794e-07, "loss": 1.4876, "step": 2816 }, { "epoch": 0.5846824408468244, "grad_norm": 0.7150788592562356, "learning_rate": 1.7267608546641292e-07, "loss": 1.5086, "step": 2817 }, { "epoch": 0.5848899958489, "grad_norm": 0.8112294611213976, "learning_rate": 1.7256285498624994e-07, "loss": 1.5116, "step": 2818 }, { "epoch": 0.5850975508509755, "grad_norm": 1.056554728121192, "learning_rate": 1.724496446751017e-07, "loss": 1.5391, "step": 2819 }, { "epoch": 0.5853051058530511, "grad_norm": 0.778416612575924, "learning_rate": 1.7233645458110208e-07, "loss": 1.5524, "step": 2820 }, { "epoch": 0.5855126608551267, "grad_norm": 1.7858654314912314, "learning_rate": 1.722232847523766e-07, "loss": 1.4902, "step": 2821 }, { "epoch": 0.5857202158572021, "grad_norm": 0.7264239887411612, "learning_rate": 1.7211013523704213e-07, "loss": 1.5418, "step": 2822 }, { "epoch": 0.5859277708592777, "grad_norm": 0.9124515378803588, "learning_rate": 1.7199700608320664e-07, "loss": 1.4999, "step": 2823 }, { "epoch": 0.5861353258613533, "grad_norm": 0.7789997235064122, "learning_rate": 1.7188389733896975e-07, "loss": 1.4683, "step": 2824 }, { "epoch": 0.5863428808634288, "grad_norm": 0.6846733302363334, "learning_rate": 1.717708090524224e-07, "loss": 1.4486, "step": 2825 }, { "epoch": 0.5865504358655044, "grad_norm": 0.828452113655679, "learning_rate": 1.7165774127164654e-07, "loss": 1.6132, "step": 2826 }, { "epoch": 0.58675799086758, "grad_norm": 1.0659231556445423, "learning_rate": 1.715446940447157e-07, "loss": 1.5378, "step": 2827 }, { "epoch": 0.5869655458696554, "grad_norm": 1.2705205199548841, "learning_rate": 1.714316674196946e-07, "loss": 1.4992, "step": 2828 }, { "epoch": 0.587173100871731, "grad_norm": 0.7733321292250058, "learning_rate": 1.7131866144463905e-07, "loss": 1.5633, "step": 2829 }, { "epoch": 0.5873806558738066, "grad_norm": 0.7222036600015269, "learning_rate": 1.7120567616759618e-07, "loss": 1.5132, "step": 2830 }, { "epoch": 0.5875882108758821, "grad_norm": 0.9664975459056787, "learning_rate": 1.710927116366045e-07, "loss": 1.6256, "step": 2831 }, { "epoch": 0.5877957658779577, "grad_norm": 1.0454087338726974, "learning_rate": 1.7097976789969332e-07, "loss": 1.5205, "step": 2832 }, { "epoch": 0.5880033208800332, "grad_norm": 0.6614692515709273, "learning_rate": 1.7086684500488353e-07, "loss": 1.4166, "step": 2833 }, { "epoch": 0.5882108758821087, "grad_norm": 0.8483585823548292, "learning_rate": 1.7075394300018674e-07, "loss": 1.5012, "step": 2834 }, { "epoch": 0.5884184308841843, "grad_norm": 0.9826675761446627, "learning_rate": 1.7064106193360597e-07, "loss": 1.5342, "step": 2835 }, { "epoch": 0.5886259858862598, "grad_norm": 0.7255508303526077, "learning_rate": 1.7052820185313533e-07, "loss": 1.5023, "step": 2836 }, { "epoch": 0.5888335408883354, "grad_norm": 0.6619937536364876, "learning_rate": 1.7041536280675976e-07, "loss": 1.5608, "step": 2837 }, { "epoch": 0.589041095890411, "grad_norm": 0.8762113991446207, "learning_rate": 1.7030254484245558e-07, "loss": 1.5317, "step": 2838 }, { "epoch": 0.5892486508924865, "grad_norm": 0.6825757964039468, "learning_rate": 1.7018974800819002e-07, "loss": 1.4647, "step": 2839 }, { "epoch": 0.589456205894562, "grad_norm": 1.3496877116127028, "learning_rate": 1.7007697235192115e-07, "loss": 1.5164, "step": 2840 }, { "epoch": 0.5896637608966376, "grad_norm": 0.9429121077855189, "learning_rate": 1.6996421792159818e-07, "loss": 1.5441, "step": 2841 }, { "epoch": 0.5898713158987131, "grad_norm": 0.7896162106585859, "learning_rate": 1.6985148476516148e-07, "loss": 1.4951, "step": 2842 }, { "epoch": 0.5900788709007887, "grad_norm": 0.7501720645012195, "learning_rate": 1.6973877293054209e-07, "loss": 1.5096, "step": 2843 }, { "epoch": 0.5902864259028643, "grad_norm": 0.7885291660381466, "learning_rate": 1.6962608246566205e-07, "loss": 1.5437, "step": 2844 }, { "epoch": 0.5904939809049398, "grad_norm": 1.1839814403887827, "learning_rate": 1.6951341341843444e-07, "loss": 1.53, "step": 2845 }, { "epoch": 0.5907015359070154, "grad_norm": 6.715340988043474, "learning_rate": 1.69400765836763e-07, "loss": 1.4748, "step": 2846 }, { "epoch": 0.5909090909090909, "grad_norm": 0.9135874048048324, "learning_rate": 1.6928813976854267e-07, "loss": 1.5122, "step": 2847 }, { "epoch": 0.5911166459111664, "grad_norm": 1.0383435094613938, "learning_rate": 1.6917553526165897e-07, "loss": 1.4801, "step": 2848 }, { "epoch": 0.591324200913242, "grad_norm": 1.087779549046269, "learning_rate": 1.6906295236398837e-07, "loss": 1.4537, "step": 2849 }, { "epoch": 0.5915317559153176, "grad_norm": 0.8004657366491701, "learning_rate": 1.6895039112339812e-07, "loss": 1.6085, "step": 2850 }, { "epoch": 0.5917393109173931, "grad_norm": 0.818620190195881, "learning_rate": 1.688378515877463e-07, "loss": 1.4738, "step": 2851 }, { "epoch": 0.5919468659194687, "grad_norm": 0.9754305122536338, "learning_rate": 1.6872533380488166e-07, "loss": 1.472, "step": 2852 }, { "epoch": 0.5921544209215442, "grad_norm": 0.8282078067858415, "learning_rate": 1.6861283782264382e-07, "loss": 1.4543, "step": 2853 }, { "epoch": 0.5923619759236197, "grad_norm": 0.7777232443399855, "learning_rate": 1.6850036368886315e-07, "loss": 1.5037, "step": 2854 }, { "epoch": 0.5925695309256953, "grad_norm": 1.0011051100462725, "learning_rate": 1.6838791145136054e-07, "loss": 1.5132, "step": 2855 }, { "epoch": 0.5927770859277709, "grad_norm": 0.7061602178315153, "learning_rate": 1.6827548115794773e-07, "loss": 1.4822, "step": 2856 }, { "epoch": 0.5929846409298464, "grad_norm": 0.8615442269748376, "learning_rate": 1.6816307285642725e-07, "loss": 1.4596, "step": 2857 }, { "epoch": 0.593192195931922, "grad_norm": 0.7410582504633436, "learning_rate": 1.6805068659459188e-07, "loss": 1.4865, "step": 2858 }, { "epoch": 0.5933997509339975, "grad_norm": 0.6648874348989493, "learning_rate": 1.6793832242022544e-07, "loss": 1.5381, "step": 2859 }, { "epoch": 0.593607305936073, "grad_norm": 0.6694704310744576, "learning_rate": 1.678259803811022e-07, "loss": 1.4673, "step": 2860 }, { "epoch": 0.5938148609381486, "grad_norm": 0.6576054240198026, "learning_rate": 1.6771366052498686e-07, "loss": 1.5323, "step": 2861 }, { "epoch": 0.5940224159402242, "grad_norm": 0.9291845406302139, "learning_rate": 1.6760136289963497e-07, "loss": 1.5228, "step": 2862 }, { "epoch": 0.5942299709422997, "grad_norm": 1.0831472246572866, "learning_rate": 1.6748908755279252e-07, "loss": 1.5669, "step": 2863 }, { "epoch": 0.5944375259443753, "grad_norm": 0.7082129139775857, "learning_rate": 1.673768345321959e-07, "loss": 1.6155, "step": 2864 }, { "epoch": 0.5946450809464509, "grad_norm": 0.7382426941565197, "learning_rate": 1.672646038855722e-07, "loss": 1.5399, "step": 2865 }, { "epoch": 0.5948526359485263, "grad_norm": 2.157114757113836, "learning_rate": 1.671523956606389e-07, "loss": 1.5971, "step": 2866 }, { "epoch": 0.5950601909506019, "grad_norm": 0.7760571163502992, "learning_rate": 1.670402099051039e-07, "loss": 1.5534, "step": 2867 }, { "epoch": 0.5952677459526775, "grad_norm": 0.6557502436454352, "learning_rate": 1.6692804666666565e-07, "loss": 1.48, "step": 2868 }, { "epoch": 0.595475300954753, "grad_norm": 0.9619804404913119, "learning_rate": 1.6681590599301302e-07, "loss": 1.4867, "step": 2869 }, { "epoch": 0.5956828559568286, "grad_norm": 0.8467657154933467, "learning_rate": 1.6670378793182516e-07, "loss": 1.5047, "step": 2870 }, { "epoch": 0.5958904109589042, "grad_norm": 0.9265342656401355, "learning_rate": 1.665916925307717e-07, "loss": 1.5953, "step": 2871 }, { "epoch": 0.5960979659609796, "grad_norm": 0.6655733530141834, "learning_rate": 1.664796198375128e-07, "loss": 1.4923, "step": 2872 }, { "epoch": 0.5963055209630552, "grad_norm": 0.7456955983870813, "learning_rate": 1.6636756989969857e-07, "loss": 1.4876, "step": 2873 }, { "epoch": 0.5965130759651308, "grad_norm": 0.7998449652257649, "learning_rate": 1.6625554276496976e-07, "loss": 1.4516, "step": 2874 }, { "epoch": 0.5967206309672063, "grad_norm": 0.6742621585339764, "learning_rate": 1.6614353848095738e-07, "loss": 1.5065, "step": 2875 }, { "epoch": 0.5969281859692819, "grad_norm": 0.6570486109804717, "learning_rate": 1.6603155709528257e-07, "loss": 1.4571, "step": 2876 }, { "epoch": 0.5971357409713575, "grad_norm": 2.4041789544860936, "learning_rate": 1.6591959865555688e-07, "loss": 1.5265, "step": 2877 }, { "epoch": 0.5973432959734329, "grad_norm": 0.6247341425283687, "learning_rate": 1.6580766320938214e-07, "loss": 1.4432, "step": 2878 }, { "epoch": 0.5975508509755085, "grad_norm": 0.668155820819984, "learning_rate": 1.6569575080435027e-07, "loss": 1.4596, "step": 2879 }, { "epoch": 0.597758405977584, "grad_norm": 0.7758182196691806, "learning_rate": 1.655838614880435e-07, "loss": 1.6065, "step": 2880 }, { "epoch": 0.5979659609796596, "grad_norm": 1.8880447355405923, "learning_rate": 1.6547199530803414e-07, "loss": 1.4297, "step": 2881 }, { "epoch": 0.5981735159817352, "grad_norm": 0.7129015100545596, "learning_rate": 1.6536015231188464e-07, "loss": 1.5703, "step": 2882 }, { "epoch": 0.5983810709838107, "grad_norm": 0.6491171660152925, "learning_rate": 1.652483325471479e-07, "loss": 1.4965, "step": 2883 }, { "epoch": 0.5985886259858862, "grad_norm": 0.6944868828223861, "learning_rate": 1.6513653606136652e-07, "loss": 1.4748, "step": 2884 }, { "epoch": 0.5987961809879618, "grad_norm": 0.7715275444741108, "learning_rate": 1.6502476290207349e-07, "loss": 1.522, "step": 2885 }, { "epoch": 0.5990037359900373, "grad_norm": 0.6678937476334154, "learning_rate": 1.6491301311679177e-07, "loss": 1.5166, "step": 2886 }, { "epoch": 0.5992112909921129, "grad_norm": 1.3858146481318514, "learning_rate": 1.648012867530344e-07, "loss": 1.5477, "step": 2887 }, { "epoch": 0.5994188459941885, "grad_norm": 1.0642207640173227, "learning_rate": 1.646895838583044e-07, "loss": 1.5675, "step": 2888 }, { "epoch": 0.599626400996264, "grad_norm": 1.135044182499458, "learning_rate": 1.6457790448009502e-07, "loss": 1.4908, "step": 2889 }, { "epoch": 0.5998339559983396, "grad_norm": 1.1370257180738672, "learning_rate": 1.6446624866588922e-07, "loss": 1.5351, "step": 2890 }, { "epoch": 0.6000415110004151, "grad_norm": 0.6809034803841085, "learning_rate": 1.6435461646316013e-07, "loss": 1.4821, "step": 2891 }, { "epoch": 0.6002490660024906, "grad_norm": 0.7834256155977669, "learning_rate": 1.6424300791937088e-07, "loss": 1.5516, "step": 2892 }, { "epoch": 0.6004566210045662, "grad_norm": 1.408629801948699, "learning_rate": 1.641314230819744e-07, "loss": 1.5239, "step": 2893 }, { "epoch": 0.6006641760066418, "grad_norm": 1.137349098778385, "learning_rate": 1.6401986199841354e-07, "loss": 1.4822, "step": 2894 }, { "epoch": 0.6008717310087173, "grad_norm": 0.7530387197886126, "learning_rate": 1.6390832471612125e-07, "loss": 1.5264, "step": 2895 }, { "epoch": 0.6010792860107929, "grad_norm": 0.7325961648805188, "learning_rate": 1.637968112825201e-07, "loss": 1.555, "step": 2896 }, { "epoch": 0.6012868410128684, "grad_norm": 0.7195609503537667, "learning_rate": 1.636853217450227e-07, "loss": 1.6114, "step": 2897 }, { "epoch": 0.6014943960149439, "grad_norm": 0.6988482173041103, "learning_rate": 1.6357385615103141e-07, "loss": 1.5373, "step": 2898 }, { "epoch": 0.6017019510170195, "grad_norm": 0.8099116133934874, "learning_rate": 1.6346241454793844e-07, "loss": 1.5275, "step": 2899 }, { "epoch": 0.6019095060190951, "grad_norm": 0.7748074177026885, "learning_rate": 1.633509969831258e-07, "loss": 1.5355, "step": 2900 }, { "epoch": 0.6021170610211706, "grad_norm": 1.6981014660023268, "learning_rate": 1.6323960350396532e-07, "loss": 1.5169, "step": 2901 }, { "epoch": 0.6023246160232462, "grad_norm": 0.808798216846251, "learning_rate": 1.6312823415781858e-07, "loss": 1.4549, "step": 2902 }, { "epoch": 0.6025321710253217, "grad_norm": 0.6800704743294834, "learning_rate": 1.6301688899203673e-07, "loss": 1.5851, "step": 2903 }, { "epoch": 0.6027397260273972, "grad_norm": 0.9308861948639792, "learning_rate": 1.629055680539609e-07, "loss": 1.5198, "step": 2904 }, { "epoch": 0.6029472810294728, "grad_norm": 0.9177301727407988, "learning_rate": 1.627942713909218e-07, "loss": 1.5081, "step": 2905 }, { "epoch": 0.6031548360315484, "grad_norm": 0.8365186296662164, "learning_rate": 1.6268299905023967e-07, "loss": 1.4431, "step": 2906 }, { "epoch": 0.6033623910336239, "grad_norm": 0.8972887384886354, "learning_rate": 1.6257175107922482e-07, "loss": 1.5495, "step": 2907 }, { "epoch": 0.6035699460356995, "grad_norm": 0.7223700493904724, "learning_rate": 1.624605275251767e-07, "loss": 1.5725, "step": 2908 }, { "epoch": 0.603777501037775, "grad_norm": 0.7406390612337366, "learning_rate": 1.6234932843538464e-07, "loss": 1.537, "step": 2909 }, { "epoch": 0.6039850560398505, "grad_norm": 0.6683050447709474, "learning_rate": 1.6223815385712773e-07, "loss": 1.4619, "step": 2910 }, { "epoch": 0.6041926110419261, "grad_norm": 0.9611396145243545, "learning_rate": 1.6212700383767418e-07, "loss": 1.4987, "step": 2911 }, { "epoch": 0.6044001660440017, "grad_norm": 0.9775699785624652, "learning_rate": 1.6201587842428216e-07, "loss": 1.5025, "step": 2912 }, { "epoch": 0.6046077210460772, "grad_norm": 0.8942348272111529, "learning_rate": 1.6190477766419935e-07, "loss": 1.4386, "step": 2913 }, { "epoch": 0.6048152760481528, "grad_norm": 0.6786349585185361, "learning_rate": 1.6179370160466262e-07, "loss": 1.4386, "step": 2914 }, { "epoch": 0.6050228310502284, "grad_norm": 0.7774037627875939, "learning_rate": 1.6168265029289868e-07, "loss": 1.5356, "step": 2915 }, { "epoch": 0.6052303860523038, "grad_norm": 1.2456115676074435, "learning_rate": 1.6157162377612368e-07, "loss": 1.5196, "step": 2916 }, { "epoch": 0.6054379410543794, "grad_norm": 0.8000454177858142, "learning_rate": 1.6146062210154302e-07, "loss": 1.4826, "step": 2917 }, { "epoch": 0.605645496056455, "grad_norm": 0.8326334827030125, "learning_rate": 1.6134964531635173e-07, "loss": 1.5512, "step": 2918 }, { "epoch": 0.6058530510585305, "grad_norm": 1.2482680568309497, "learning_rate": 1.6123869346773416e-07, "loss": 1.4681, "step": 2919 }, { "epoch": 0.6060606060606061, "grad_norm": 0.9322448888291446, "learning_rate": 1.611277666028641e-07, "loss": 1.4428, "step": 2920 }, { "epoch": 0.6062681610626817, "grad_norm": 0.9798057768306832, "learning_rate": 1.6101686476890467e-07, "loss": 1.4984, "step": 2921 }, { "epoch": 0.6064757160647571, "grad_norm": 0.6722285410165774, "learning_rate": 1.6090598801300855e-07, "loss": 1.4506, "step": 2922 }, { "epoch": 0.6066832710668327, "grad_norm": 3.149224533660752, "learning_rate": 1.6079513638231737e-07, "loss": 1.5731, "step": 2923 }, { "epoch": 0.6068908260689082, "grad_norm": 0.672411064248572, "learning_rate": 1.6068430992396249e-07, "loss": 1.5029, "step": 2924 }, { "epoch": 0.6070983810709838, "grad_norm": 0.8382347554066111, "learning_rate": 1.605735086850642e-07, "loss": 1.528, "step": 2925 }, { "epoch": 0.6073059360730594, "grad_norm": 0.6505680097557937, "learning_rate": 1.604627327127323e-07, "loss": 1.5694, "step": 2926 }, { "epoch": 0.6075134910751349, "grad_norm": 0.6451655225633504, "learning_rate": 1.6035198205406592e-07, "loss": 1.502, "step": 2927 }, { "epoch": 0.6077210460772104, "grad_norm": 2.1242372854079092, "learning_rate": 1.6024125675615316e-07, "loss": 1.509, "step": 2928 }, { "epoch": 0.607928601079286, "grad_norm": 0.7388999339118123, "learning_rate": 1.6013055686607152e-07, "loss": 1.5902, "step": 2929 }, { "epoch": 0.6081361560813615, "grad_norm": 2.2249876806638897, "learning_rate": 1.600198824308877e-07, "loss": 1.5577, "step": 2930 }, { "epoch": 0.6083437110834371, "grad_norm": 0.7387184105390527, "learning_rate": 1.599092334976574e-07, "loss": 1.52, "step": 2931 }, { "epoch": 0.6085512660855127, "grad_norm": 0.8797744193713711, "learning_rate": 1.5979861011342573e-07, "loss": 1.4782, "step": 2932 }, { "epoch": 0.6087588210875882, "grad_norm": 1.466926016085479, "learning_rate": 1.596880123252269e-07, "loss": 1.4952, "step": 2933 }, { "epoch": 0.6089663760896638, "grad_norm": 0.7110906563008449, "learning_rate": 1.5957744018008392e-07, "loss": 1.3958, "step": 2934 }, { "epoch": 0.6091739310917393, "grad_norm": 6.768917304805736, "learning_rate": 1.594668937250092e-07, "loss": 1.5072, "step": 2935 }, { "epoch": 0.6093814860938148, "grad_norm": 1.4932472007636786, "learning_rate": 1.5935637300700434e-07, "loss": 1.3895, "step": 2936 }, { "epoch": 0.6095890410958904, "grad_norm": 0.7327965232562699, "learning_rate": 1.592458780730596e-07, "loss": 1.5003, "step": 2937 }, { "epoch": 0.609796596097966, "grad_norm": 1.2306496974469834, "learning_rate": 1.591354089701546e-07, "loss": 1.5329, "step": 2938 }, { "epoch": 0.6100041511000415, "grad_norm": 0.7948189873014087, "learning_rate": 1.590249657452579e-07, "loss": 1.5602, "step": 2939 }, { "epoch": 0.6102117061021171, "grad_norm": 0.9783436064365502, "learning_rate": 1.5891454844532688e-07, "loss": 1.5619, "step": 2940 }, { "epoch": 0.6104192611041926, "grad_norm": 1.688177212488804, "learning_rate": 1.5880415711730812e-07, "loss": 1.4981, "step": 2941 }, { "epoch": 0.6106268161062681, "grad_norm": 1.2624233649307406, "learning_rate": 1.5869379180813716e-07, "loss": 1.4939, "step": 2942 }, { "epoch": 0.6108343711083437, "grad_norm": 1.8929786269348523, "learning_rate": 1.5858345256473832e-07, "loss": 1.5054, "step": 2943 }, { "epoch": 0.6110419261104193, "grad_norm": 1.215104235857268, "learning_rate": 1.584731394340249e-07, "loss": 1.5444, "step": 2944 }, { "epoch": 0.6112494811124948, "grad_norm": 1.0269210213539417, "learning_rate": 1.5836285246289918e-07, "loss": 1.4612, "step": 2945 }, { "epoch": 0.6114570361145704, "grad_norm": 1.4481906241643254, "learning_rate": 1.5825259169825223e-07, "loss": 1.6106, "step": 2946 }, { "epoch": 0.611664591116646, "grad_norm": 4.674749038767822, "learning_rate": 1.5814235718696393e-07, "loss": 1.5739, "step": 2947 }, { "epoch": 0.6118721461187214, "grad_norm": 0.8921273018258443, "learning_rate": 1.580321489759032e-07, "loss": 1.5583, "step": 2948 }, { "epoch": 0.612079701120797, "grad_norm": 1.9281278345913548, "learning_rate": 1.5792196711192753e-07, "loss": 1.5313, "step": 2949 }, { "epoch": 0.6122872561228726, "grad_norm": 0.6416354612437689, "learning_rate": 1.5781181164188335e-07, "loss": 1.4934, "step": 2950 }, { "epoch": 0.6124948111249481, "grad_norm": 1.4831932699606183, "learning_rate": 1.5770168261260594e-07, "loss": 1.5226, "step": 2951 }, { "epoch": 0.6127023661270237, "grad_norm": 0.743602973028861, "learning_rate": 1.5759158007091906e-07, "loss": 1.5191, "step": 2952 }, { "epoch": 0.6129099211290993, "grad_norm": 1.0871439948594217, "learning_rate": 1.5748150406363553e-07, "loss": 1.36, "step": 2953 }, { "epoch": 0.6131174761311747, "grad_norm": 1.053034430069096, "learning_rate": 1.573714546375567e-07, "loss": 1.4987, "step": 2954 }, { "epoch": 0.6133250311332503, "grad_norm": 0.7272434227605858, "learning_rate": 1.5726143183947267e-07, "loss": 1.4755, "step": 2955 }, { "epoch": 0.6135325861353259, "grad_norm": 0.6521455351751879, "learning_rate": 1.5715143571616217e-07, "loss": 1.4769, "step": 2956 }, { "epoch": 0.6137401411374014, "grad_norm": 1.838780836723481, "learning_rate": 1.5704146631439272e-07, "loss": 1.4806, "step": 2957 }, { "epoch": 0.613947696139477, "grad_norm": 0.662858179250531, "learning_rate": 1.569315236809203e-07, "loss": 1.4577, "step": 2958 }, { "epoch": 0.6141552511415526, "grad_norm": 1.1895464943054594, "learning_rate": 1.5682160786248963e-07, "loss": 1.5234, "step": 2959 }, { "epoch": 0.614362806143628, "grad_norm": 0.7644677955325846, "learning_rate": 1.5671171890583404e-07, "loss": 1.4675, "step": 2960 }, { "epoch": 0.6145703611457036, "grad_norm": 1.0595897828837544, "learning_rate": 1.5660185685767538e-07, "loss": 1.5153, "step": 2961 }, { "epoch": 0.6147779161477792, "grad_norm": 0.7390601781416092, "learning_rate": 1.56492021764724e-07, "loss": 1.5099, "step": 2962 }, { "epoch": 0.6149854711498547, "grad_norm": 0.6700285618159091, "learning_rate": 1.5638221367367898e-07, "loss": 1.5058, "step": 2963 }, { "epoch": 0.6151930261519303, "grad_norm": 0.958994073180166, "learning_rate": 1.5627243263122774e-07, "loss": 1.4962, "step": 2964 }, { "epoch": 0.6154005811540059, "grad_norm": 0.8059355602514806, "learning_rate": 1.5616267868404625e-07, "loss": 1.5718, "step": 2965 }, { "epoch": 0.6156081361560813, "grad_norm": 2.709452211139489, "learning_rate": 1.5605295187879905e-07, "loss": 1.5221, "step": 2966 }, { "epoch": 0.6158156911581569, "grad_norm": 0.7243686287520321, "learning_rate": 1.5594325226213893e-07, "loss": 1.4802, "step": 2967 }, { "epoch": 0.6160232461602324, "grad_norm": 0.9907973640748218, "learning_rate": 1.5583357988070743e-07, "loss": 1.527, "step": 2968 }, { "epoch": 0.616230801162308, "grad_norm": 1.7532901521611337, "learning_rate": 1.5572393478113415e-07, "loss": 1.5197, "step": 2969 }, { "epoch": 0.6164383561643836, "grad_norm": 1.0510418178863643, "learning_rate": 1.5561431701003738e-07, "loss": 1.577, "step": 2970 }, { "epoch": 0.6166459111664591, "grad_norm": 0.8542329308784178, "learning_rate": 1.555047266140238e-07, "loss": 1.4479, "step": 2971 }, { "epoch": 0.6168534661685346, "grad_norm": 0.6714615169200676, "learning_rate": 1.553951636396881e-07, "loss": 1.4762, "step": 2972 }, { "epoch": 0.6170610211706102, "grad_norm": 1.0805127152235228, "learning_rate": 1.5528562813361363e-07, "loss": 1.4916, "step": 2973 }, { "epoch": 0.6172685761726857, "grad_norm": 0.7570564020218621, "learning_rate": 1.551761201423721e-07, "loss": 1.4371, "step": 2974 }, { "epoch": 0.6174761311747613, "grad_norm": 0.954274125501317, "learning_rate": 1.5506663971252328e-07, "loss": 1.4952, "step": 2975 }, { "epoch": 0.6176836861768369, "grad_norm": 0.8922934207568118, "learning_rate": 1.549571868906153e-07, "loss": 1.5672, "step": 2976 }, { "epoch": 0.6178912411789124, "grad_norm": 0.7830545483179003, "learning_rate": 1.5484776172318478e-07, "loss": 1.5184, "step": 2977 }, { "epoch": 0.618098796180988, "grad_norm": 0.8595535477644503, "learning_rate": 1.5473836425675622e-07, "loss": 1.4959, "step": 2978 }, { "epoch": 0.6183063511830635, "grad_norm": 0.7796394384734499, "learning_rate": 1.5462899453784255e-07, "loss": 1.5353, "step": 2979 }, { "epoch": 0.618513906185139, "grad_norm": 0.6474049726223324, "learning_rate": 1.5451965261294495e-07, "loss": 1.5632, "step": 2980 }, { "epoch": 0.6187214611872146, "grad_norm": 1.1461043069125916, "learning_rate": 1.544103385285527e-07, "loss": 1.5501, "step": 2981 }, { "epoch": 0.6189290161892902, "grad_norm": 0.7409050555421258, "learning_rate": 1.543010523311431e-07, "loss": 1.6144, "step": 2982 }, { "epoch": 0.6191365711913657, "grad_norm": 0.7185504687166825, "learning_rate": 1.541917940671819e-07, "loss": 1.5007, "step": 2983 }, { "epoch": 0.6193441261934413, "grad_norm": 0.7694250443273994, "learning_rate": 1.5408256378312266e-07, "loss": 1.5075, "step": 2984 }, { "epoch": 0.6195516811955168, "grad_norm": 0.7959331632363079, "learning_rate": 1.5397336152540737e-07, "loss": 1.5204, "step": 2985 }, { "epoch": 0.6197592361975923, "grad_norm": 0.9809280978612099, "learning_rate": 1.5386418734046592e-07, "loss": 1.4766, "step": 2986 }, { "epoch": 0.6199667911996679, "grad_norm": 0.907104566372969, "learning_rate": 1.5375504127471614e-07, "loss": 1.5986, "step": 2987 }, { "epoch": 0.6201743462017435, "grad_norm": 1.1534459656647642, "learning_rate": 1.5364592337456404e-07, "loss": 1.5532, "step": 2988 }, { "epoch": 0.620381901203819, "grad_norm": 1.0827506933797901, "learning_rate": 1.5353683368640385e-07, "loss": 1.5647, "step": 2989 }, { "epoch": 0.6205894562058946, "grad_norm": 1.0170255044891223, "learning_rate": 1.5342777225661743e-07, "loss": 1.4406, "step": 2990 }, { "epoch": 0.6207970112079702, "grad_norm": 3.197352124616111, "learning_rate": 1.533187391315748e-07, "loss": 1.5702, "step": 2991 }, { "epoch": 0.6210045662100456, "grad_norm": 0.7667294217408318, "learning_rate": 1.532097343576341e-07, "loss": 1.536, "step": 2992 }, { "epoch": 0.6212121212121212, "grad_norm": 1.037351167495545, "learning_rate": 1.5310075798114106e-07, "loss": 1.5387, "step": 2993 }, { "epoch": 0.6214196762141968, "grad_norm": 2.85295211655403, "learning_rate": 1.5299181004842966e-07, "loss": 1.4951, "step": 2994 }, { "epoch": 0.6216272312162723, "grad_norm": 0.9479156853728501, "learning_rate": 1.5288289060582176e-07, "loss": 1.505, "step": 2995 }, { "epoch": 0.6218347862183479, "grad_norm": 0.8104359866162321, "learning_rate": 1.527739996996268e-07, "loss": 1.5047, "step": 2996 }, { "epoch": 0.6220423412204235, "grad_norm": 1.058396992585166, "learning_rate": 1.5266513737614255e-07, "loss": 1.5996, "step": 2997 }, { "epoch": 0.6222498962224989, "grad_norm": 1.1725620408714186, "learning_rate": 1.5255630368165418e-07, "loss": 1.4962, "step": 2998 }, { "epoch": 0.6224574512245745, "grad_norm": 0.7782880880570476, "learning_rate": 1.5244749866243495e-07, "loss": 1.4797, "step": 2999 }, { "epoch": 0.6226650062266501, "grad_norm": 0.6842865401473562, "learning_rate": 1.5233872236474583e-07, "loss": 1.4745, "step": 3000 }, { "epoch": 0.6228725612287256, "grad_norm": 0.8813136241253349, "learning_rate": 1.5222997483483577e-07, "loss": 1.4815, "step": 3001 }, { "epoch": 0.6230801162308012, "grad_norm": 0.6537692674582971, "learning_rate": 1.521212561189411e-07, "loss": 1.5509, "step": 3002 }, { "epoch": 0.6232876712328768, "grad_norm": 0.633355514782118, "learning_rate": 1.5201256626328628e-07, "loss": 1.4821, "step": 3003 }, { "epoch": 0.6234952262349522, "grad_norm": 0.9616770146352793, "learning_rate": 1.5190390531408342e-07, "loss": 1.5782, "step": 3004 }, { "epoch": 0.6237027812370278, "grad_norm": 0.9021480042707577, "learning_rate": 1.517952733175321e-07, "loss": 1.4727, "step": 3005 }, { "epoch": 0.6239103362391034, "grad_norm": 1.9363805692078735, "learning_rate": 1.516866703198198e-07, "loss": 1.5533, "step": 3006 }, { "epoch": 0.6241178912411789, "grad_norm": 0.7041329497262746, "learning_rate": 1.515780963671217e-07, "loss": 1.5163, "step": 3007 }, { "epoch": 0.6243254462432545, "grad_norm": 0.7197669810289279, "learning_rate": 1.5146955150560054e-07, "loss": 1.5024, "step": 3008 }, { "epoch": 0.6245330012453301, "grad_norm": 0.7821402677509988, "learning_rate": 1.5136103578140666e-07, "loss": 1.5532, "step": 3009 }, { "epoch": 0.6247405562474055, "grad_norm": 0.783264355980897, "learning_rate": 1.5125254924067813e-07, "loss": 1.5301, "step": 3010 }, { "epoch": 0.6249481112494811, "grad_norm": 0.7170160991048493, "learning_rate": 1.511440919295405e-07, "loss": 1.5392, "step": 3011 }, { "epoch": 0.6251556662515566, "grad_norm": 1.3743728472889096, "learning_rate": 1.5103566389410701e-07, "loss": 1.4351, "step": 3012 }, { "epoch": 0.6253632212536322, "grad_norm": 0.689914167923522, "learning_rate": 1.509272651804783e-07, "loss": 1.4229, "step": 3013 }, { "epoch": 0.6255707762557078, "grad_norm": 0.8364211499808111, "learning_rate": 1.5081889583474264e-07, "loss": 1.5598, "step": 3014 }, { "epoch": 0.6257783312577833, "grad_norm": 1.762970507255435, "learning_rate": 1.5071055590297585e-07, "loss": 1.54, "step": 3015 }, { "epoch": 0.6259858862598588, "grad_norm": 0.7169625524201716, "learning_rate": 1.5060224543124113e-07, "loss": 1.534, "step": 3016 }, { "epoch": 0.6261934412619344, "grad_norm": 0.6420430959541432, "learning_rate": 1.504939644655893e-07, "loss": 1.5144, "step": 3017 }, { "epoch": 0.6264009962640099, "grad_norm": 0.6569486396456943, "learning_rate": 1.5038571305205846e-07, "loss": 1.4862, "step": 3018 }, { "epoch": 0.6266085512660855, "grad_norm": 0.7611164666440157, "learning_rate": 1.502774912366743e-07, "loss": 1.5575, "step": 3019 }, { "epoch": 0.6268161062681611, "grad_norm": 0.7973821180491858, "learning_rate": 1.5016929906544978e-07, "loss": 1.5356, "step": 3020 }, { "epoch": 0.6270236612702366, "grad_norm": 0.7342341745643031, "learning_rate": 1.5006113658438545e-07, "loss": 1.4536, "step": 3021 }, { "epoch": 0.6272312162723122, "grad_norm": 1.6485937584072101, "learning_rate": 1.49953003839469e-07, "loss": 1.54, "step": 3022 }, { "epoch": 0.6274387712743877, "grad_norm": 0.7086689226127876, "learning_rate": 1.4984490087667575e-07, "loss": 1.482, "step": 3023 }, { "epoch": 0.6276463262764632, "grad_norm": 1.1438750820212285, "learning_rate": 1.4973682774196817e-07, "loss": 1.5799, "step": 3024 }, { "epoch": 0.6278538812785388, "grad_norm": 0.8542738698924801, "learning_rate": 1.4962878448129596e-07, "loss": 1.4845, "step": 3025 }, { "epoch": 0.6280614362806144, "grad_norm": 0.7921797464277974, "learning_rate": 1.4952077114059635e-07, "loss": 1.5561, "step": 3026 }, { "epoch": 0.6282689912826899, "grad_norm": 0.9463312455449134, "learning_rate": 1.4941278776579384e-07, "loss": 1.5487, "step": 3027 }, { "epoch": 0.6284765462847655, "grad_norm": 0.7336887354061277, "learning_rate": 1.4930483440279988e-07, "loss": 1.4212, "step": 3028 }, { "epoch": 0.628684101286841, "grad_norm": 0.6651588954425106, "learning_rate": 1.4919691109751348e-07, "loss": 1.4534, "step": 3029 }, { "epoch": 0.6288916562889165, "grad_norm": 0.6058886376895825, "learning_rate": 1.4908901789582086e-07, "loss": 1.5682, "step": 3030 }, { "epoch": 0.6290992112909921, "grad_norm": 0.7547659377690231, "learning_rate": 1.4898115484359516e-07, "loss": 1.5295, "step": 3031 }, { "epoch": 0.6293067662930677, "grad_norm": 1.6267872869079583, "learning_rate": 1.4887332198669702e-07, "loss": 1.4697, "step": 3032 }, { "epoch": 0.6295143212951432, "grad_norm": 0.9037490609311014, "learning_rate": 1.487655193709741e-07, "loss": 1.5508, "step": 3033 }, { "epoch": 0.6297218762972188, "grad_norm": 1.4427007845775373, "learning_rate": 1.486577470422611e-07, "loss": 1.4478, "step": 3034 }, { "epoch": 0.6299294312992944, "grad_norm": 1.4216909229705654, "learning_rate": 1.4855000504638e-07, "loss": 1.5714, "step": 3035 }, { "epoch": 0.6301369863013698, "grad_norm": 0.7270077252043348, "learning_rate": 1.4844229342913996e-07, "loss": 1.5215, "step": 3036 }, { "epoch": 0.6303445413034454, "grad_norm": 0.7634573109246932, "learning_rate": 1.4833461223633697e-07, "loss": 1.6044, "step": 3037 }, { "epoch": 0.630552096305521, "grad_norm": 0.7004038343649741, "learning_rate": 1.482269615137542e-07, "loss": 1.4663, "step": 3038 }, { "epoch": 0.6307596513075965, "grad_norm": 7.898637847027786, "learning_rate": 1.4811934130716202e-07, "loss": 1.5076, "step": 3039 }, { "epoch": 0.6309672063096721, "grad_norm": 0.7015718235904259, "learning_rate": 1.4801175166231752e-07, "loss": 1.4908, "step": 3040 }, { "epoch": 0.6311747613117477, "grad_norm": 0.6355451229397927, "learning_rate": 1.4790419262496508e-07, "loss": 1.5284, "step": 3041 }, { "epoch": 0.6313823163138231, "grad_norm": 0.7914333897141325, "learning_rate": 1.4779666424083593e-07, "loss": 1.428, "step": 3042 }, { "epoch": 0.6315898713158987, "grad_norm": 1.0819444153853983, "learning_rate": 1.4768916655564815e-07, "loss": 1.5306, "step": 3043 }, { "epoch": 0.6317974263179743, "grad_norm": 1.0141038537035725, "learning_rate": 1.475816996151071e-07, "loss": 1.5528, "step": 3044 }, { "epoch": 0.6320049813200498, "grad_norm": 2.5570138409444287, "learning_rate": 1.4747426346490474e-07, "loss": 1.5156, "step": 3045 }, { "epoch": 0.6322125363221254, "grad_norm": 0.758829756857622, "learning_rate": 1.473668581507201e-07, "loss": 1.5179, "step": 3046 }, { "epoch": 0.632420091324201, "grad_norm": 0.7781761515774269, "learning_rate": 1.4725948371821905e-07, "loss": 1.4828, "step": 3047 }, { "epoch": 0.6326276463262764, "grad_norm": 0.8349775520107943, "learning_rate": 1.4715214021305443e-07, "loss": 1.5643, "step": 3048 }, { "epoch": 0.632835201328352, "grad_norm": 0.807621338388221, "learning_rate": 1.4704482768086574e-07, "loss": 1.5033, "step": 3049 }, { "epoch": 0.6330427563304276, "grad_norm": 0.7113634675541607, "learning_rate": 1.4693754616727954e-07, "loss": 1.4864, "step": 3050 }, { "epoch": 0.6332503113325031, "grad_norm": 0.7703297269058391, "learning_rate": 1.4683029571790898e-07, "loss": 1.5122, "step": 3051 }, { "epoch": 0.6334578663345787, "grad_norm": 1.126413866694997, "learning_rate": 1.467230763783541e-07, "loss": 1.4837, "step": 3052 }, { "epoch": 0.6336654213366543, "grad_norm": 0.663847763203691, "learning_rate": 1.4661588819420172e-07, "loss": 1.5925, "step": 3053 }, { "epoch": 0.6338729763387297, "grad_norm": 0.796988765377266, "learning_rate": 1.465087312110256e-07, "loss": 1.5467, "step": 3054 }, { "epoch": 0.6340805313408053, "grad_norm": 0.7037528947052286, "learning_rate": 1.4640160547438574e-07, "loss": 1.5069, "step": 3055 }, { "epoch": 0.6342880863428808, "grad_norm": 0.7113551970199539, "learning_rate": 1.462945110298294e-07, "loss": 1.5677, "step": 3056 }, { "epoch": 0.6344956413449564, "grad_norm": 0.7389917451791563, "learning_rate": 1.4618744792289014e-07, "loss": 1.5606, "step": 3057 }, { "epoch": 0.634703196347032, "grad_norm": 1.2961475558266493, "learning_rate": 1.4608041619908837e-07, "loss": 1.5365, "step": 3058 }, { "epoch": 0.6349107513491075, "grad_norm": 0.6944269981154831, "learning_rate": 1.4597341590393125e-07, "loss": 1.529, "step": 3059 }, { "epoch": 0.635118306351183, "grad_norm": 0.6791333898669759, "learning_rate": 1.4586644708291235e-07, "loss": 1.4323, "step": 3060 }, { "epoch": 0.6353258613532586, "grad_norm": 0.8779002714428632, "learning_rate": 1.4575950978151209e-07, "loss": 1.5115, "step": 3061 }, { "epoch": 0.6355334163553341, "grad_norm": 0.7422274739584223, "learning_rate": 1.4565260404519726e-07, "loss": 1.5069, "step": 3062 }, { "epoch": 0.6357409713574097, "grad_norm": 1.0041976754773174, "learning_rate": 1.4554572991942133e-07, "loss": 1.546, "step": 3063 }, { "epoch": 0.6359485263594853, "grad_norm": 0.8688180392046078, "learning_rate": 1.454388874496244e-07, "loss": 1.505, "step": 3064 }, { "epoch": 0.6361560813615608, "grad_norm": 0.7305615683718213, "learning_rate": 1.453320766812331e-07, "loss": 1.5375, "step": 3065 }, { "epoch": 0.6363636363636364, "grad_norm": 0.746878551233616, "learning_rate": 1.4522529765966044e-07, "loss": 1.5304, "step": 3066 }, { "epoch": 0.6365711913657119, "grad_norm": 0.7294817896273393, "learning_rate": 1.4511855043030608e-07, "loss": 1.47, "step": 3067 }, { "epoch": 0.6367787463677874, "grad_norm": 0.7024606832797239, "learning_rate": 1.4501183503855608e-07, "loss": 1.5076, "step": 3068 }, { "epoch": 0.636986301369863, "grad_norm": 2.7014978284920983, "learning_rate": 1.44905151529783e-07, "loss": 1.4818, "step": 3069 }, { "epoch": 0.6371938563719386, "grad_norm": 0.7405074455054956, "learning_rate": 1.4479849994934577e-07, "loss": 1.5395, "step": 3070 }, { "epoch": 0.6374014113740141, "grad_norm": 2.35265288440181, "learning_rate": 1.4469188034259002e-07, "loss": 1.53, "step": 3071 }, { "epoch": 0.6376089663760897, "grad_norm": 1.3202621993779977, "learning_rate": 1.4458529275484725e-07, "loss": 1.548, "step": 3072 }, { "epoch": 0.6378165213781652, "grad_norm": 1.5101447094630214, "learning_rate": 1.44478737231436e-07, "loss": 1.5202, "step": 3073 }, { "epoch": 0.6380240763802407, "grad_norm": 0.9474628256486164, "learning_rate": 1.4437221381766062e-07, "loss": 1.5159, "step": 3074 }, { "epoch": 0.6382316313823163, "grad_norm": 1.4882513096778087, "learning_rate": 1.442657225588122e-07, "loss": 1.4976, "step": 3075 }, { "epoch": 0.6384391863843919, "grad_norm": 1.3156103895085105, "learning_rate": 1.4415926350016778e-07, "loss": 1.4427, "step": 3076 }, { "epoch": 0.6386467413864674, "grad_norm": 0.8089426620932525, "learning_rate": 1.4405283668699127e-07, "loss": 1.5182, "step": 3077 }, { "epoch": 0.638854296388543, "grad_norm": 0.7426577416740147, "learning_rate": 1.4394644216453213e-07, "loss": 1.5289, "step": 3078 }, { "epoch": 0.6390618513906186, "grad_norm": 0.8748941427895027, "learning_rate": 1.4384007997802674e-07, "loss": 1.4783, "step": 3079 }, { "epoch": 0.639269406392694, "grad_norm": 0.6665043031229163, "learning_rate": 1.4373375017269745e-07, "loss": 1.5631, "step": 3080 }, { "epoch": 0.6394769613947696, "grad_norm": 0.9841742310367197, "learning_rate": 1.4362745279375274e-07, "loss": 1.5146, "step": 3081 }, { "epoch": 0.6396845163968452, "grad_norm": 0.6311251374540982, "learning_rate": 1.435211878863876e-07, "loss": 1.5031, "step": 3082 }, { "epoch": 0.6398920713989207, "grad_norm": 0.8117753554171052, "learning_rate": 1.4341495549578296e-07, "loss": 1.5217, "step": 3083 }, { "epoch": 0.6400996264009963, "grad_norm": 0.9764619582407291, "learning_rate": 1.4330875566710606e-07, "loss": 1.534, "step": 3084 }, { "epoch": 0.6403071814030719, "grad_norm": 0.9553728325129893, "learning_rate": 1.432025884455101e-07, "loss": 1.5403, "step": 3085 }, { "epoch": 0.6405147364051473, "grad_norm": 0.7000982075225894, "learning_rate": 1.4309645387613488e-07, "loss": 1.4807, "step": 3086 }, { "epoch": 0.6407222914072229, "grad_norm": 1.6149960427753751, "learning_rate": 1.429903520041056e-07, "loss": 1.5269, "step": 3087 }, { "epoch": 0.6409298464092985, "grad_norm": 0.7146758156091039, "learning_rate": 1.4288428287453424e-07, "loss": 1.473, "step": 3088 }, { "epoch": 0.641137401411374, "grad_norm": 0.6970781604322636, "learning_rate": 1.427782465325185e-07, "loss": 1.5905, "step": 3089 }, { "epoch": 0.6413449564134496, "grad_norm": 0.7296286315195898, "learning_rate": 1.4267224302314221e-07, "loss": 1.4691, "step": 3090 }, { "epoch": 0.6415525114155252, "grad_norm": 1.2317732571851574, "learning_rate": 1.4256627239147522e-07, "loss": 1.4712, "step": 3091 }, { "epoch": 0.6417600664176006, "grad_norm": 0.856851020931563, "learning_rate": 1.4246033468257355e-07, "loss": 1.432, "step": 3092 }, { "epoch": 0.6419676214196762, "grad_norm": 0.8347545856184144, "learning_rate": 1.4235442994147887e-07, "loss": 1.4874, "step": 3093 }, { "epoch": 0.6421751764217518, "grad_norm": 2.410892699957657, "learning_rate": 1.422485582132193e-07, "loss": 1.5744, "step": 3094 }, { "epoch": 0.6423827314238273, "grad_norm": 0.7254458479768149, "learning_rate": 1.4214271954280856e-07, "loss": 1.5756, "step": 3095 }, { "epoch": 0.6425902864259029, "grad_norm": 0.7842689685340327, "learning_rate": 1.4203691397524646e-07, "loss": 1.5175, "step": 3096 }, { "epoch": 0.6427978414279785, "grad_norm": 0.8292723896025624, "learning_rate": 1.419311415555188e-07, "loss": 1.5536, "step": 3097 }, { "epoch": 0.6430053964300539, "grad_norm": 1.1061180140130233, "learning_rate": 1.4182540232859713e-07, "loss": 1.4814, "step": 3098 }, { "epoch": 0.6432129514321295, "grad_norm": 0.7542346971289389, "learning_rate": 1.4171969633943896e-07, "loss": 1.5254, "step": 3099 }, { "epoch": 0.6434205064342051, "grad_norm": 0.6603891545657087, "learning_rate": 1.416140236329876e-07, "loss": 1.4529, "step": 3100 }, { "epoch": 0.6436280614362806, "grad_norm": 0.6625168165180937, "learning_rate": 1.4150838425417253e-07, "loss": 1.439, "step": 3101 }, { "epoch": 0.6438356164383562, "grad_norm": 0.8494791070221424, "learning_rate": 1.414027782479085e-07, "loss": 1.5402, "step": 3102 }, { "epoch": 0.6440431714404317, "grad_norm": 0.8003417404459681, "learning_rate": 1.4129720565909664e-07, "loss": 1.6088, "step": 3103 }, { "epoch": 0.6442507264425072, "grad_norm": 1.8149013968738987, "learning_rate": 1.411916665326234e-07, "loss": 1.5488, "step": 3104 }, { "epoch": 0.6444582814445828, "grad_norm": 0.7907987128264112, "learning_rate": 1.4108616091336134e-07, "loss": 1.5051, "step": 3105 }, { "epoch": 0.6446658364466583, "grad_norm": 0.8082073328103767, "learning_rate": 1.409806888461686e-07, "loss": 1.512, "step": 3106 }, { "epoch": 0.6448733914487339, "grad_norm": 0.9518176879682078, "learning_rate": 1.4087525037588908e-07, "loss": 1.4735, "step": 3107 }, { "epoch": 0.6450809464508095, "grad_norm": 0.80544846305097, "learning_rate": 1.407698455473525e-07, "loss": 1.5938, "step": 3108 }, { "epoch": 0.645288501452885, "grad_norm": 0.8651528823161663, "learning_rate": 1.4066447440537416e-07, "loss": 1.5049, "step": 3109 }, { "epoch": 0.6454960564549606, "grad_norm": 0.6726388140363898, "learning_rate": 1.405591369947551e-07, "loss": 1.4747, "step": 3110 }, { "epoch": 0.6457036114570361, "grad_norm": 0.9257356011821511, "learning_rate": 1.4045383336028184e-07, "loss": 1.5203, "step": 3111 }, { "epoch": 0.6459111664591116, "grad_norm": 0.7458646426687328, "learning_rate": 1.4034856354672698e-07, "loss": 1.5565, "step": 3112 }, { "epoch": 0.6461187214611872, "grad_norm": 1.211695814577115, "learning_rate": 1.4024332759884807e-07, "loss": 1.4715, "step": 3113 }, { "epoch": 0.6463262764632628, "grad_norm": 0.7086060899643317, "learning_rate": 1.4013812556138896e-07, "loss": 1.4939, "step": 3114 }, { "epoch": 0.6465338314653383, "grad_norm": 1.5452079901734623, "learning_rate": 1.4003295747907866e-07, "loss": 1.4349, "step": 3115 }, { "epoch": 0.6467413864674139, "grad_norm": 0.8409333296750025, "learning_rate": 1.3992782339663186e-07, "loss": 1.5196, "step": 3116 }, { "epoch": 0.6469489414694894, "grad_norm": 0.7396965118502287, "learning_rate": 1.3982272335874865e-07, "loss": 1.4621, "step": 3117 }, { "epoch": 0.6471564964715649, "grad_norm": 0.7334179538865436, "learning_rate": 1.3971765741011496e-07, "loss": 1.5172, "step": 3118 }, { "epoch": 0.6473640514736405, "grad_norm": 0.9689462051889443, "learning_rate": 1.3961262559540194e-07, "loss": 1.5153, "step": 3119 }, { "epoch": 0.6475716064757161, "grad_norm": 1.2226145756928242, "learning_rate": 1.395076279592664e-07, "loss": 1.4683, "step": 3120 }, { "epoch": 0.6477791614777916, "grad_norm": 0.9055456898843436, "learning_rate": 1.3940266454635046e-07, "loss": 1.4859, "step": 3121 }, { "epoch": 0.6479867164798672, "grad_norm": 0.7520262766254164, "learning_rate": 1.3929773540128178e-07, "loss": 1.4896, "step": 3122 }, { "epoch": 0.6481942714819428, "grad_norm": 0.8497141338664903, "learning_rate": 1.3919284056867354e-07, "loss": 1.5648, "step": 3123 }, { "epoch": 0.6484018264840182, "grad_norm": 0.9689861309046328, "learning_rate": 1.3908798009312417e-07, "loss": 1.5345, "step": 3124 }, { "epoch": 0.6486093814860938, "grad_norm": 0.7166738617384836, "learning_rate": 1.3898315401921753e-07, "loss": 1.4839, "step": 3125 }, { "epoch": 0.6488169364881694, "grad_norm": 0.7047849979525533, "learning_rate": 1.3887836239152295e-07, "loss": 1.544, "step": 3126 }, { "epoch": 0.6490244914902449, "grad_norm": 0.9955286228651691, "learning_rate": 1.3877360525459512e-07, "loss": 1.5037, "step": 3127 }, { "epoch": 0.6492320464923205, "grad_norm": 0.6827715321456042, "learning_rate": 1.3866888265297373e-07, "loss": 1.4684, "step": 3128 }, { "epoch": 0.6494396014943961, "grad_norm": 1.1616458049172922, "learning_rate": 1.3856419463118435e-07, "loss": 1.5075, "step": 3129 }, { "epoch": 0.6496471564964715, "grad_norm": 0.6312473488223415, "learning_rate": 1.3845954123373735e-07, "loss": 1.5471, "step": 3130 }, { "epoch": 0.6498547114985471, "grad_norm": 1.354544474937245, "learning_rate": 1.383549225051287e-07, "loss": 1.4566, "step": 3131 }, { "epoch": 0.6500622665006227, "grad_norm": 0.907029290232451, "learning_rate": 1.3825033848983933e-07, "loss": 1.548, "step": 3132 }, { "epoch": 0.6502698215026982, "grad_norm": 0.8544662443232702, "learning_rate": 1.381457892323358e-07, "loss": 1.5051, "step": 3133 }, { "epoch": 0.6504773765047738, "grad_norm": 0.8201666520277637, "learning_rate": 1.3804127477706956e-07, "loss": 1.5534, "step": 3134 }, { "epoch": 0.6506849315068494, "grad_norm": 0.6775086341052935, "learning_rate": 1.3793679516847744e-07, "loss": 1.4579, "step": 3135 }, { "epoch": 0.6508924865089248, "grad_norm": 2.9902079356614486, "learning_rate": 1.3783235045098134e-07, "loss": 1.4855, "step": 3136 }, { "epoch": 0.6511000415110004, "grad_norm": 0.8620242069011522, "learning_rate": 1.377279406689883e-07, "loss": 1.4511, "step": 3137 }, { "epoch": 0.651307596513076, "grad_norm": 0.683327038854693, "learning_rate": 1.376235658668908e-07, "loss": 1.5387, "step": 3138 }, { "epoch": 0.6515151515151515, "grad_norm": 1.1432071442735263, "learning_rate": 1.3751922608906612e-07, "loss": 1.5425, "step": 3139 }, { "epoch": 0.6517227065172271, "grad_norm": 0.7113947008645579, "learning_rate": 1.3741492137987675e-07, "loss": 1.491, "step": 3140 }, { "epoch": 0.6519302615193027, "grad_norm": 0.6147124440706151, "learning_rate": 1.3731065178367026e-07, "loss": 1.5743, "step": 3141 }, { "epoch": 0.6521378165213781, "grad_norm": 0.6824026715321913, "learning_rate": 1.3720641734477946e-07, "loss": 1.5709, "step": 3142 }, { "epoch": 0.6523453715234537, "grad_norm": 0.6956716778191295, "learning_rate": 1.3710221810752186e-07, "loss": 1.6263, "step": 3143 }, { "epoch": 0.6525529265255293, "grad_norm": 0.8923387330246622, "learning_rate": 1.3699805411620035e-07, "loss": 1.5206, "step": 3144 }, { "epoch": 0.6527604815276048, "grad_norm": 0.9645212346817976, "learning_rate": 1.3689392541510266e-07, "loss": 1.5674, "step": 3145 }, { "epoch": 0.6529680365296804, "grad_norm": 0.635361822058857, "learning_rate": 1.3678983204850153e-07, "loss": 1.5305, "step": 3146 }, { "epoch": 0.6531755915317559, "grad_norm": 0.7179233590868765, "learning_rate": 1.3668577406065472e-07, "loss": 1.4897, "step": 3147 }, { "epoch": 0.6533831465338314, "grad_norm": 1.3084336324783978, "learning_rate": 1.3658175149580485e-07, "loss": 1.5444, "step": 3148 }, { "epoch": 0.653590701535907, "grad_norm": 1.1654637217615773, "learning_rate": 1.3647776439817968e-07, "loss": 1.5048, "step": 3149 }, { "epoch": 0.6537982565379825, "grad_norm": 4.559807394204516, "learning_rate": 1.363738128119917e-07, "loss": 1.4855, "step": 3150 }, { "epoch": 0.6540058115400581, "grad_norm": 0.8977383915080426, "learning_rate": 1.3626989678143835e-07, "loss": 1.5687, "step": 3151 }, { "epoch": 0.6542133665421337, "grad_norm": 0.7481379331951814, "learning_rate": 1.3616601635070196e-07, "loss": 1.4991, "step": 3152 }, { "epoch": 0.6544209215442092, "grad_norm": 0.7777807813575783, "learning_rate": 1.3606217156394983e-07, "loss": 1.5649, "step": 3153 }, { "epoch": 0.6546284765462848, "grad_norm": 0.7495343957749625, "learning_rate": 1.359583624653338e-07, "loss": 1.5372, "step": 3154 }, { "epoch": 0.6548360315483603, "grad_norm": 0.7378338866584575, "learning_rate": 1.3585458909899095e-07, "loss": 1.4468, "step": 3155 }, { "epoch": 0.6550435865504358, "grad_norm": 0.7977122563890158, "learning_rate": 1.357508515090429e-07, "loss": 1.4674, "step": 3156 }, { "epoch": 0.6552511415525114, "grad_norm": 2.380994998102106, "learning_rate": 1.356471497395961e-07, "loss": 1.4269, "step": 3157 }, { "epoch": 0.655458696554587, "grad_norm": 0.7261063638137861, "learning_rate": 1.3554348383474167e-07, "loss": 1.4712, "step": 3158 }, { "epoch": 0.6556662515566625, "grad_norm": 0.926607805842262, "learning_rate": 1.3543985383855584e-07, "loss": 1.5323, "step": 3159 }, { "epoch": 0.6558738065587381, "grad_norm": 0.6499893228636157, "learning_rate": 1.3533625979509916e-07, "loss": 1.4999, "step": 3160 }, { "epoch": 0.6560813615608136, "grad_norm": 0.980387142846143, "learning_rate": 1.3523270174841711e-07, "loss": 1.5735, "step": 3161 }, { "epoch": 0.6562889165628891, "grad_norm": 0.7930593259630854, "learning_rate": 1.3512917974253998e-07, "loss": 1.4966, "step": 3162 }, { "epoch": 0.6564964715649647, "grad_norm": 0.6897423656993112, "learning_rate": 1.3502569382148223e-07, "loss": 1.487, "step": 3163 }, { "epoch": 0.6567040265670403, "grad_norm": 0.8164414402342477, "learning_rate": 1.3492224402924363e-07, "loss": 1.5089, "step": 3164 }, { "epoch": 0.6569115815691158, "grad_norm": 0.6630990873187498, "learning_rate": 1.3481883040980817e-07, "loss": 1.5283, "step": 3165 }, { "epoch": 0.6571191365711914, "grad_norm": 0.7270126661653342, "learning_rate": 1.3471545300714464e-07, "loss": 1.5902, "step": 3166 }, { "epoch": 0.657326691573267, "grad_norm": 0.6601295245188631, "learning_rate": 1.346121118652062e-07, "loss": 1.5154, "step": 3167 }, { "epoch": 0.6575342465753424, "grad_norm": 1.046206148608879, "learning_rate": 1.3450880702793098e-07, "loss": 1.4581, "step": 3168 }, { "epoch": 0.657741801577418, "grad_norm": 0.7879608412016764, "learning_rate": 1.3440553853924128e-07, "loss": 1.4875, "step": 3169 }, { "epoch": 0.6579493565794936, "grad_norm": 0.7294064551758634, "learning_rate": 1.3430230644304424e-07, "loss": 1.6067, "step": 3170 }, { "epoch": 0.6581569115815691, "grad_norm": 0.6965681069968713, "learning_rate": 1.3419911078323133e-07, "loss": 1.5046, "step": 3171 }, { "epoch": 0.6583644665836447, "grad_norm": 0.7455234469985699, "learning_rate": 1.3409595160367865e-07, "loss": 1.5454, "step": 3172 }, { "epoch": 0.6585720215857203, "grad_norm": 0.810065925372056, "learning_rate": 1.3399282894824667e-07, "loss": 1.5366, "step": 3173 }, { "epoch": 0.6587795765877957, "grad_norm": 0.7022851758788176, "learning_rate": 1.3388974286078048e-07, "loss": 1.4455, "step": 3174 }, { "epoch": 0.6589871315898713, "grad_norm": 0.7307043044194036, "learning_rate": 1.337866933851096e-07, "loss": 1.511, "step": 3175 }, { "epoch": 0.6591946865919469, "grad_norm": 0.9138245044277977, "learning_rate": 1.3368368056504774e-07, "loss": 1.4929, "step": 3176 }, { "epoch": 0.6594022415940224, "grad_norm": 0.9479502615623114, "learning_rate": 1.3358070444439348e-07, "loss": 1.5206, "step": 3177 }, { "epoch": 0.659609796596098, "grad_norm": 0.9285880312061874, "learning_rate": 1.3347776506692925e-07, "loss": 1.5595, "step": 3178 }, { "epoch": 0.6598173515981736, "grad_norm": 0.7780464977009807, "learning_rate": 1.3337486247642235e-07, "loss": 1.5104, "step": 3179 }, { "epoch": 0.660024906600249, "grad_norm": 0.7112987218549393, "learning_rate": 1.3327199671662417e-07, "loss": 1.5283, "step": 3180 }, { "epoch": 0.6602324616023246, "grad_norm": 0.6875460376572219, "learning_rate": 1.331691678312705e-07, "loss": 1.5727, "step": 3181 }, { "epoch": 0.6604400166044002, "grad_norm": 0.7420930274666655, "learning_rate": 1.3306637586408133e-07, "loss": 1.544, "step": 3182 }, { "epoch": 0.6606475716064757, "grad_norm": 0.6554675119831339, "learning_rate": 1.3296362085876136e-07, "loss": 1.5081, "step": 3183 }, { "epoch": 0.6608551266085513, "grad_norm": 9.337618882141618, "learning_rate": 1.3286090285899896e-07, "loss": 1.4557, "step": 3184 }, { "epoch": 0.6610626816106269, "grad_norm": 1.4940346493915295, "learning_rate": 1.3275822190846733e-07, "loss": 1.5046, "step": 3185 }, { "epoch": 0.6612702366127023, "grad_norm": 0.7160809580436963, "learning_rate": 1.3265557805082362e-07, "loss": 1.5568, "step": 3186 }, { "epoch": 0.6614777916147779, "grad_norm": 0.6435534078430243, "learning_rate": 1.3255297132970915e-07, "loss": 1.5377, "step": 3187 }, { "epoch": 0.6616853466168535, "grad_norm": 0.7875195225061454, "learning_rate": 1.3245040178874977e-07, "loss": 1.4831, "step": 3188 }, { "epoch": 0.661892901618929, "grad_norm": 0.664765418963022, "learning_rate": 1.3234786947155528e-07, "loss": 1.4932, "step": 3189 }, { "epoch": 0.6621004566210046, "grad_norm": 0.8601205547924736, "learning_rate": 1.322453744217196e-07, "loss": 1.5301, "step": 3190 }, { "epoch": 0.6623080116230801, "grad_norm": 0.7227998101842118, "learning_rate": 1.32142916682821e-07, "loss": 1.5086, "step": 3191 }, { "epoch": 0.6625155666251556, "grad_norm": 0.6778281131795043, "learning_rate": 1.3204049629842173e-07, "loss": 1.5216, "step": 3192 }, { "epoch": 0.6627231216272312, "grad_norm": 0.7300134584675743, "learning_rate": 1.3193811331206818e-07, "loss": 1.5261, "step": 3193 }, { "epoch": 0.6629306766293067, "grad_norm": 0.7680107298221206, "learning_rate": 1.318357677672911e-07, "loss": 1.5422, "step": 3194 }, { "epoch": 0.6631382316313823, "grad_norm": 8.887202603643503, "learning_rate": 1.317334597076048e-07, "loss": 1.5642, "step": 3195 }, { "epoch": 0.6633457866334579, "grad_norm": 0.6408879200284127, "learning_rate": 1.3163118917650813e-07, "loss": 1.5604, "step": 3196 }, { "epoch": 0.6635533416355334, "grad_norm": 1.4135421621674917, "learning_rate": 1.3152895621748377e-07, "loss": 1.4877, "step": 3197 }, { "epoch": 0.663760896637609, "grad_norm": 0.6786000783019871, "learning_rate": 1.3142676087399846e-07, "loss": 1.5575, "step": 3198 }, { "epoch": 0.6639684516396845, "grad_norm": 0.9303123804865102, "learning_rate": 1.3132460318950288e-07, "loss": 1.5206, "step": 3199 }, { "epoch": 0.66417600664176, "grad_norm": 1.4928343968334328, "learning_rate": 1.3122248320743187e-07, "loss": 1.4816, "step": 3200 }, { "epoch": 0.6643835616438356, "grad_norm": 0.824543269765013, "learning_rate": 1.3112040097120408e-07, "loss": 1.4694, "step": 3201 }, { "epoch": 0.6645911166459112, "grad_norm": 0.6939174251708157, "learning_rate": 1.3101835652422216e-07, "loss": 1.5584, "step": 3202 }, { "epoch": 0.6647986716479867, "grad_norm": 0.9524793869028257, "learning_rate": 1.3091634990987284e-07, "loss": 1.4583, "step": 3203 }, { "epoch": 0.6650062266500623, "grad_norm": 0.6432380631008792, "learning_rate": 1.3081438117152637e-07, "loss": 1.3888, "step": 3204 }, { "epoch": 0.6652137816521378, "grad_norm": 0.6818048942278007, "learning_rate": 1.3071245035253734e-07, "loss": 1.4895, "step": 3205 }, { "epoch": 0.6654213366542133, "grad_norm": 0.7490778227804438, "learning_rate": 1.3061055749624395e-07, "loss": 1.4659, "step": 3206 }, { "epoch": 0.6656288916562889, "grad_norm": 1.311766790008918, "learning_rate": 1.305087026459684e-07, "loss": 1.5891, "step": 3207 }, { "epoch": 0.6658364466583645, "grad_norm": 1.3812133017818926, "learning_rate": 1.3040688584501652e-07, "loss": 1.4967, "step": 3208 }, { "epoch": 0.66604400166044, "grad_norm": 0.9923122562684247, "learning_rate": 1.3030510713667837e-07, "loss": 1.4708, "step": 3209 }, { "epoch": 0.6662515566625156, "grad_norm": 0.7990055602099573, "learning_rate": 1.3020336656422725e-07, "loss": 1.4704, "step": 3210 }, { "epoch": 0.6664591116645912, "grad_norm": 0.9175499547845192, "learning_rate": 1.3010166417092077e-07, "loss": 1.4834, "step": 3211 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7487437399919964, "learning_rate": 1.3000000000000005e-07, "loss": 1.5701, "step": 3212 }, { "epoch": 0.6668742216687422, "grad_norm": 0.6955458046657547, "learning_rate": 1.2989837409468986e-07, "loss": 1.5305, "step": 3213 }, { "epoch": 0.6670817766708178, "grad_norm": 0.7623711556317108, "learning_rate": 1.2979678649819902e-07, "loss": 1.4678, "step": 3214 }, { "epoch": 0.6672893316728933, "grad_norm": 1.5874400225238208, "learning_rate": 1.2969523725371984e-07, "loss": 1.5153, "step": 3215 }, { "epoch": 0.6674968866749689, "grad_norm": 0.9161302428319901, "learning_rate": 1.2959372640442827e-07, "loss": 1.585, "step": 3216 }, { "epoch": 0.6677044416770445, "grad_norm": 0.7816286979283945, "learning_rate": 1.2949225399348404e-07, "loss": 1.4338, "step": 3217 }, { "epoch": 0.6679119966791199, "grad_norm": 4.376219216468093, "learning_rate": 1.293908200640307e-07, "loss": 1.4981, "step": 3218 }, { "epoch": 0.6681195516811955, "grad_norm": 0.7901166312069904, "learning_rate": 1.29289424659195e-07, "loss": 1.5413, "step": 3219 }, { "epoch": 0.6683271066832711, "grad_norm": 0.6159446063658047, "learning_rate": 1.2918806782208778e-07, "loss": 1.5397, "step": 3220 }, { "epoch": 0.6685346616853466, "grad_norm": 0.7342524373657747, "learning_rate": 1.290867495958032e-07, "loss": 1.517, "step": 3221 }, { "epoch": 0.6687422166874222, "grad_norm": 0.783300181565538, "learning_rate": 1.2898547002341906e-07, "loss": 1.5466, "step": 3222 }, { "epoch": 0.6689497716894978, "grad_norm": 0.6550201276590841, "learning_rate": 1.2888422914799676e-07, "loss": 1.5131, "step": 3223 }, { "epoch": 0.6691573266915732, "grad_norm": 0.9689901735166166, "learning_rate": 1.2878302701258123e-07, "loss": 1.5053, "step": 3224 }, { "epoch": 0.6693648816936488, "grad_norm": 0.8456313707287444, "learning_rate": 1.28681863660201e-07, "loss": 1.4502, "step": 3225 }, { "epoch": 0.6695724366957244, "grad_norm": 0.6863489930367463, "learning_rate": 1.2858073913386793e-07, "loss": 1.4779, "step": 3226 }, { "epoch": 0.6697799916977999, "grad_norm": 0.7022832351060266, "learning_rate": 1.2847965347657756e-07, "loss": 1.4645, "step": 3227 }, { "epoch": 0.6699875466998755, "grad_norm": 0.7552765311622475, "learning_rate": 1.2837860673130878e-07, "loss": 1.5234, "step": 3228 }, { "epoch": 0.6701951017019511, "grad_norm": 0.8719040512638067, "learning_rate": 1.2827759894102404e-07, "loss": 1.4879, "step": 3229 }, { "epoch": 0.6704026567040265, "grad_norm": 0.7289068680645504, "learning_rate": 1.2817663014866914e-07, "loss": 1.5704, "step": 3230 }, { "epoch": 0.6706102117061021, "grad_norm": 1.6309012453266398, "learning_rate": 1.280757003971733e-07, "loss": 1.4617, "step": 3231 }, { "epoch": 0.6708177667081777, "grad_norm": 0.6820231108481851, "learning_rate": 1.2797480972944916e-07, "loss": 1.571, "step": 3232 }, { "epoch": 0.6710253217102532, "grad_norm": 0.7473218877225032, "learning_rate": 1.2787395818839292e-07, "loss": 1.4453, "step": 3233 }, { "epoch": 0.6712328767123288, "grad_norm": 0.9916478133260138, "learning_rate": 1.2777314581688375e-07, "loss": 1.5689, "step": 3234 }, { "epoch": 0.6714404317144043, "grad_norm": 1.0789071126265648, "learning_rate": 1.2767237265778462e-07, "loss": 1.5831, "step": 3235 }, { "epoch": 0.6716479867164799, "grad_norm": 1.0820537657310563, "learning_rate": 1.275716387539413e-07, "loss": 1.5257, "step": 3236 }, { "epoch": 0.6718555417185554, "grad_norm": 0.6828000204932493, "learning_rate": 1.2747094414818344e-07, "loss": 1.4904, "step": 3237 }, { "epoch": 0.6720630967206309, "grad_norm": 1.0937862147446618, "learning_rate": 1.2737028888332364e-07, "loss": 1.5616, "step": 3238 }, { "epoch": 0.6722706517227065, "grad_norm": 0.7931219142100963, "learning_rate": 1.2726967300215774e-07, "loss": 1.5742, "step": 3239 }, { "epoch": 0.6724782067247821, "grad_norm": 0.979294804966168, "learning_rate": 1.271690965474651e-07, "loss": 1.5872, "step": 3240 }, { "epoch": 0.6726857617268576, "grad_norm": 1.1234927210135381, "learning_rate": 1.2706855956200808e-07, "loss": 1.4989, "step": 3241 }, { "epoch": 0.6728933167289332, "grad_norm": 0.9893755042487972, "learning_rate": 1.2696806208853238e-07, "loss": 1.5166, "step": 3242 }, { "epoch": 0.6731008717310087, "grad_norm": 0.9406993242009124, "learning_rate": 1.2686760416976673e-07, "loss": 1.5746, "step": 3243 }, { "epoch": 0.6733084267330842, "grad_norm": 0.8901696763855884, "learning_rate": 1.2676718584842337e-07, "loss": 1.5241, "step": 3244 }, { "epoch": 0.6735159817351598, "grad_norm": 0.804322396485139, "learning_rate": 1.2666680716719728e-07, "loss": 1.4835, "step": 3245 }, { "epoch": 0.6737235367372354, "grad_norm": 0.7513484901499353, "learning_rate": 1.2656646816876703e-07, "loss": 1.4895, "step": 3246 }, { "epoch": 0.6739310917393109, "grad_norm": 0.6183502696902251, "learning_rate": 1.2646616889579392e-07, "loss": 1.5037, "step": 3247 }, { "epoch": 0.6741386467413865, "grad_norm": 0.8730405953354398, "learning_rate": 1.263659093909227e-07, "loss": 1.5379, "step": 3248 }, { "epoch": 0.674346201743462, "grad_norm": 0.7194889363693627, "learning_rate": 1.2626568969678085e-07, "loss": 1.6299, "step": 3249 }, { "epoch": 0.6745537567455375, "grad_norm": 1.1877046457534175, "learning_rate": 1.2616550985597932e-07, "loss": 1.5008, "step": 3250 }, { "epoch": 0.6747613117476131, "grad_norm": 1.5876821453458347, "learning_rate": 1.2606536991111178e-07, "loss": 1.5053, "step": 3251 }, { "epoch": 0.6749688667496887, "grad_norm": 0.7438657559550432, "learning_rate": 1.2596526990475522e-07, "loss": 1.547, "step": 3252 }, { "epoch": 0.6751764217517642, "grad_norm": 0.671664921082297, "learning_rate": 1.2586520987946935e-07, "loss": 1.5518, "step": 3253 }, { "epoch": 0.6753839767538398, "grad_norm": 6.126359824906304, "learning_rate": 1.2576518987779706e-07, "loss": 1.4918, "step": 3254 }, { "epoch": 0.6755915317559154, "grad_norm": 0.7836723018809235, "learning_rate": 1.256652099422643e-07, "loss": 1.5205, "step": 3255 }, { "epoch": 0.6757990867579908, "grad_norm": 0.69736765216155, "learning_rate": 1.2556527011537986e-07, "loss": 1.4826, "step": 3256 }, { "epoch": 0.6760066417600664, "grad_norm": 0.7649587894359001, "learning_rate": 1.2546537043963544e-07, "loss": 1.5124, "step": 3257 }, { "epoch": 0.676214196762142, "grad_norm": 0.945064961123417, "learning_rate": 1.2536551095750567e-07, "loss": 1.5389, "step": 3258 }, { "epoch": 0.6764217517642175, "grad_norm": 0.8416403467807277, "learning_rate": 1.2526569171144838e-07, "loss": 1.5379, "step": 3259 }, { "epoch": 0.6766293067662931, "grad_norm": 0.7401161134509093, "learning_rate": 1.251659127439038e-07, "loss": 1.5646, "step": 3260 }, { "epoch": 0.6768368617683687, "grad_norm": 0.6790074303755887, "learning_rate": 1.2506617409729548e-07, "loss": 1.4979, "step": 3261 }, { "epoch": 0.6770444167704441, "grad_norm": 1.0133838218497426, "learning_rate": 1.2496647581402964e-07, "loss": 1.5388, "step": 3262 }, { "epoch": 0.6772519717725197, "grad_norm": 0.7243290771177783, "learning_rate": 1.2486681793649522e-07, "loss": 1.5235, "step": 3263 }, { "epoch": 0.6774595267745953, "grad_norm": 1.3742502898355515, "learning_rate": 1.2476720050706413e-07, "loss": 1.5499, "step": 3264 }, { "epoch": 0.6776670817766708, "grad_norm": 0.7587012980669989, "learning_rate": 1.2466762356809115e-07, "loss": 1.5038, "step": 3265 }, { "epoch": 0.6778746367787464, "grad_norm": 0.6989098005015468, "learning_rate": 1.2456808716191371e-07, "loss": 1.4801, "step": 3266 }, { "epoch": 0.678082191780822, "grad_norm": 1.2411159625764818, "learning_rate": 1.2446859133085194e-07, "loss": 1.5044, "step": 3267 }, { "epoch": 0.6782897467828974, "grad_norm": 0.9122984882729779, "learning_rate": 1.243691361172091e-07, "loss": 1.4728, "step": 3268 }, { "epoch": 0.678497301784973, "grad_norm": 6.165429613297105, "learning_rate": 1.242697215632706e-07, "loss": 1.4999, "step": 3269 }, { "epoch": 0.6787048567870486, "grad_norm": 0.8134231438380896, "learning_rate": 1.2417034771130502e-07, "loss": 1.4891, "step": 3270 }, { "epoch": 0.6789124117891241, "grad_norm": 0.829237266367444, "learning_rate": 1.2407101460356346e-07, "loss": 1.5122, "step": 3271 }, { "epoch": 0.6791199667911997, "grad_norm": 0.744338325002923, "learning_rate": 1.2397172228227973e-07, "loss": 1.4311, "step": 3272 }, { "epoch": 0.6793275217932753, "grad_norm": 1.2027530577161256, "learning_rate": 1.2387247078967023e-07, "loss": 1.5459, "step": 3273 }, { "epoch": 0.6795350767953507, "grad_norm": 0.6697714662395956, "learning_rate": 1.237732601679342e-07, "loss": 1.5082, "step": 3274 }, { "epoch": 0.6797426317974263, "grad_norm": 0.6558015891459825, "learning_rate": 1.236740904592532e-07, "loss": 1.5443, "step": 3275 }, { "epoch": 0.6799501867995019, "grad_norm": 0.8529060953687726, "learning_rate": 1.2357496170579167e-07, "loss": 1.4878, "step": 3276 }, { "epoch": 0.6801577418015774, "grad_norm": 0.9428887057006291, "learning_rate": 1.2347587394969647e-07, "loss": 1.5471, "step": 3277 }, { "epoch": 0.680365296803653, "grad_norm": 0.694225720923425, "learning_rate": 1.2337682723309716e-07, "loss": 1.4984, "step": 3278 }, { "epoch": 0.6805728518057285, "grad_norm": 0.7773502221000645, "learning_rate": 1.2327782159810562e-07, "loss": 1.5351, "step": 3279 }, { "epoch": 0.680780406807804, "grad_norm": 0.6685616537262326, "learning_rate": 1.231788570868165e-07, "loss": 1.6079, "step": 3280 }, { "epoch": 0.6809879618098796, "grad_norm": 0.9936856335858999, "learning_rate": 1.2307993374130694e-07, "loss": 1.5254, "step": 3281 }, { "epoch": 0.6811955168119551, "grad_norm": 0.8182567346760663, "learning_rate": 1.2298105160363648e-07, "loss": 1.5026, "step": 3282 }, { "epoch": 0.6814030718140307, "grad_norm": 0.814675612880644, "learning_rate": 1.2288221071584715e-07, "loss": 1.5434, "step": 3283 }, { "epoch": 0.6816106268161063, "grad_norm": 0.9567834636478196, "learning_rate": 1.2278341111996345e-07, "loss": 1.5131, "step": 3284 }, { "epoch": 0.6818181818181818, "grad_norm": 0.7737183380493936, "learning_rate": 1.226846528579925e-07, "loss": 1.5125, "step": 3285 }, { "epoch": 0.6820257368202574, "grad_norm": 1.0249993811725902, "learning_rate": 1.2258593597192347e-07, "loss": 1.4963, "step": 3286 }, { "epoch": 0.6822332918223329, "grad_norm": 0.7429199143125298, "learning_rate": 1.2248726050372832e-07, "loss": 1.507, "step": 3287 }, { "epoch": 0.6824408468244084, "grad_norm": 0.8296113467026645, "learning_rate": 1.2238862649536116e-07, "loss": 1.4945, "step": 3288 }, { "epoch": 0.682648401826484, "grad_norm": 0.7774667067970688, "learning_rate": 1.222900339887586e-07, "loss": 1.4813, "step": 3289 }, { "epoch": 0.6828559568285596, "grad_norm": 0.81425265512606, "learning_rate": 1.221914830258395e-07, "loss": 1.5438, "step": 3290 }, { "epoch": 0.6830635118306351, "grad_norm": 1.328499975100338, "learning_rate": 1.220929736485052e-07, "loss": 1.4945, "step": 3291 }, { "epoch": 0.6832710668327107, "grad_norm": 0.7367649751691349, "learning_rate": 1.2199450589863923e-07, "loss": 1.4778, "step": 3292 }, { "epoch": 0.6834786218347862, "grad_norm": 0.6885923992190716, "learning_rate": 1.2189607981810734e-07, "loss": 1.5139, "step": 3293 }, { "epoch": 0.6836861768368617, "grad_norm": 1.0397848070298938, "learning_rate": 1.2179769544875797e-07, "loss": 1.5217, "step": 3294 }, { "epoch": 0.6838937318389373, "grad_norm": 0.7619277465693344, "learning_rate": 1.2169935283242122e-07, "loss": 1.4922, "step": 3295 }, { "epoch": 0.6841012868410129, "grad_norm": 0.6614874402393118, "learning_rate": 1.2160105201091e-07, "loss": 1.5332, "step": 3296 }, { "epoch": 0.6843088418430884, "grad_norm": 0.9768741763711837, "learning_rate": 1.2150279302601915e-07, "loss": 1.5489, "step": 3297 }, { "epoch": 0.684516396845164, "grad_norm": 1.4033432752226545, "learning_rate": 1.214045759195257e-07, "loss": 1.5793, "step": 3298 }, { "epoch": 0.6847239518472396, "grad_norm": 0.7967113361811957, "learning_rate": 1.2130640073318895e-07, "loss": 1.4329, "step": 3299 }, { "epoch": 0.684931506849315, "grad_norm": 0.6020155266308058, "learning_rate": 1.212082675087506e-07, "loss": 1.4975, "step": 3300 }, { "epoch": 0.6851390618513906, "grad_norm": 1.7521579011489161, "learning_rate": 1.2111017628793398e-07, "loss": 1.495, "step": 3301 }, { "epoch": 0.6853466168534662, "grad_norm": 0.9102278581022684, "learning_rate": 1.210121271124451e-07, "loss": 1.5043, "step": 3302 }, { "epoch": 0.6855541718555417, "grad_norm": 0.8018568702051945, "learning_rate": 1.2091412002397178e-07, "loss": 1.6131, "step": 3303 }, { "epoch": 0.6857617268576173, "grad_norm": 0.68363912999568, "learning_rate": 1.2081615506418407e-07, "loss": 1.432, "step": 3304 }, { "epoch": 0.6859692818596929, "grad_norm": 0.7161736225818417, "learning_rate": 1.2071823227473398e-07, "loss": 1.4266, "step": 3305 }, { "epoch": 0.6861768368617683, "grad_norm": 0.8980527616086746, "learning_rate": 1.206203516972558e-07, "loss": 1.5259, "step": 3306 }, { "epoch": 0.6863843918638439, "grad_norm": 0.8532044066108526, "learning_rate": 1.205225133733657e-07, "loss": 1.5253, "step": 3307 }, { "epoch": 0.6865919468659195, "grad_norm": 1.1209907349542791, "learning_rate": 1.2042471734466186e-07, "loss": 1.5127, "step": 3308 }, { "epoch": 0.686799501867995, "grad_norm": 0.8125529387242744, "learning_rate": 1.2032696365272477e-07, "loss": 1.5256, "step": 3309 }, { "epoch": 0.6870070568700706, "grad_norm": 0.7587835731626091, "learning_rate": 1.2022925233911644e-07, "loss": 1.4907, "step": 3310 }, { "epoch": 0.6872146118721462, "grad_norm": 0.8429248957642861, "learning_rate": 1.201315834453813e-07, "loss": 1.5301, "step": 3311 }, { "epoch": 0.6874221668742216, "grad_norm": 0.8750320770033471, "learning_rate": 1.2003395701304553e-07, "loss": 1.5367, "step": 3312 }, { "epoch": 0.6876297218762972, "grad_norm": 0.7535733375252992, "learning_rate": 1.1993637308361732e-07, "loss": 1.5162, "step": 3313 }, { "epoch": 0.6878372768783728, "grad_norm": 0.7918701842023438, "learning_rate": 1.1983883169858665e-07, "loss": 1.4967, "step": 3314 }, { "epoch": 0.6880448318804483, "grad_norm": 1.5252905589841776, "learning_rate": 1.1974133289942575e-07, "loss": 1.5304, "step": 3315 }, { "epoch": 0.6882523868825239, "grad_norm": 12.902680839597537, "learning_rate": 1.1964387672758823e-07, "loss": 1.5442, "step": 3316 }, { "epoch": 0.6884599418845995, "grad_norm": 0.6834085475986064, "learning_rate": 1.195464632245101e-07, "loss": 1.4972, "step": 3317 }, { "epoch": 0.688667496886675, "grad_norm": 2.3087423028078775, "learning_rate": 1.1944909243160894e-07, "loss": 1.4164, "step": 3318 }, { "epoch": 0.6888750518887505, "grad_norm": 0.925745961665308, "learning_rate": 1.1935176439028413e-07, "loss": 1.5181, "step": 3319 }, { "epoch": 0.6890826068908261, "grad_norm": 0.8419661533384216, "learning_rate": 1.192544791419171e-07, "loss": 1.5236, "step": 3320 }, { "epoch": 0.6892901618929016, "grad_norm": 0.768152891059504, "learning_rate": 1.191572367278709e-07, "loss": 1.5419, "step": 3321 }, { "epoch": 0.6894977168949772, "grad_norm": 0.6787632415645909, "learning_rate": 1.190600371894904e-07, "loss": 1.4516, "step": 3322 }, { "epoch": 0.6897052718970528, "grad_norm": 0.8858615369776436, "learning_rate": 1.1896288056810224e-07, "loss": 1.5259, "step": 3323 }, { "epoch": 0.6899128268991283, "grad_norm": 0.8590260454942608, "learning_rate": 1.18865766905015e-07, "loss": 1.5474, "step": 3324 }, { "epoch": 0.6901203819012038, "grad_norm": 0.759344397446281, "learning_rate": 1.187686962415186e-07, "loss": 1.5552, "step": 3325 }, { "epoch": 0.6903279369032793, "grad_norm": 0.7050968244115102, "learning_rate": 1.1867166861888512e-07, "loss": 1.5404, "step": 3326 }, { "epoch": 0.6905354919053549, "grad_norm": 0.783512390219713, "learning_rate": 1.185746840783679e-07, "loss": 1.562, "step": 3327 }, { "epoch": 0.6907430469074305, "grad_norm": 1.2303305702469816, "learning_rate": 1.1847774266120243e-07, "loss": 1.5431, "step": 3328 }, { "epoch": 0.690950601909506, "grad_norm": 0.7077933651680663, "learning_rate": 1.1838084440860547e-07, "loss": 1.5424, "step": 3329 }, { "epoch": 0.6911581569115816, "grad_norm": 0.6707646089645365, "learning_rate": 1.1828398936177557e-07, "loss": 1.553, "step": 3330 }, { "epoch": 0.6913657119136571, "grad_norm": 0.6945010132617898, "learning_rate": 1.1818717756189309e-07, "loss": 1.5995, "step": 3331 }, { "epoch": 0.6915732669157326, "grad_norm": 0.7113711241103959, "learning_rate": 1.1809040905011972e-07, "loss": 1.5542, "step": 3332 }, { "epoch": 0.6917808219178082, "grad_norm": 0.741018194954576, "learning_rate": 1.1799368386759885e-07, "loss": 1.5128, "step": 3333 }, { "epoch": 0.6919883769198838, "grad_norm": 0.7578699710656742, "learning_rate": 1.1789700205545543e-07, "loss": 1.5334, "step": 3334 }, { "epoch": 0.6921959319219593, "grad_norm": 0.6931746645977676, "learning_rate": 1.1780036365479622e-07, "loss": 1.4796, "step": 3335 }, { "epoch": 0.6924034869240349, "grad_norm": 0.8950187295375504, "learning_rate": 1.1770376870670897e-07, "loss": 1.4896, "step": 3336 }, { "epoch": 0.6926110419261104, "grad_norm": 0.9078363976774274, "learning_rate": 1.1760721725226352e-07, "loss": 1.5204, "step": 3337 }, { "epoch": 0.6928185969281859, "grad_norm": 0.9472658491168165, "learning_rate": 1.1751070933251095e-07, "loss": 1.4614, "step": 3338 }, { "epoch": 0.6930261519302615, "grad_norm": 0.9016770323418097, "learning_rate": 1.174142449884838e-07, "loss": 1.5571, "step": 3339 }, { "epoch": 0.6932337069323371, "grad_norm": 0.7569346722844374, "learning_rate": 1.1731782426119614e-07, "loss": 1.5435, "step": 3340 }, { "epoch": 0.6934412619344126, "grad_norm": 0.8050539807043565, "learning_rate": 1.1722144719164369e-07, "loss": 1.5405, "step": 3341 }, { "epoch": 0.6936488169364882, "grad_norm": 0.7095210306297901, "learning_rate": 1.1712511382080313e-07, "loss": 1.4438, "step": 3342 }, { "epoch": 0.6938563719385638, "grad_norm": 0.7265093608447354, "learning_rate": 1.1702882418963304e-07, "loss": 1.5281, "step": 3343 }, { "epoch": 0.6940639269406392, "grad_norm": 0.9488333394268295, "learning_rate": 1.169325783390732e-07, "loss": 1.5596, "step": 3344 }, { "epoch": 0.6942714819427148, "grad_norm": 0.7796546526634055, "learning_rate": 1.168363763100447e-07, "loss": 1.4799, "step": 3345 }, { "epoch": 0.6944790369447904, "grad_norm": 0.6962087211105117, "learning_rate": 1.1674021814345025e-07, "loss": 1.4983, "step": 3346 }, { "epoch": 0.6946865919468659, "grad_norm": 0.7024505788531472, "learning_rate": 1.1664410388017363e-07, "loss": 1.4934, "step": 3347 }, { "epoch": 0.6948941469489415, "grad_norm": 0.8371346310222159, "learning_rate": 1.165480335610801e-07, "loss": 1.537, "step": 3348 }, { "epoch": 0.6951017019510171, "grad_norm": 5.766918684986479, "learning_rate": 1.1645200722701611e-07, "loss": 1.4984, "step": 3349 }, { "epoch": 0.6953092569530925, "grad_norm": 0.6324679222504724, "learning_rate": 1.1635602491880979e-07, "loss": 1.5612, "step": 3350 }, { "epoch": 0.6955168119551681, "grad_norm": 0.6374458169376153, "learning_rate": 1.1626008667726992e-07, "loss": 1.4523, "step": 3351 }, { "epoch": 0.6957243669572437, "grad_norm": 0.6733894863582579, "learning_rate": 1.1616419254318712e-07, "loss": 1.51, "step": 3352 }, { "epoch": 0.6959319219593192, "grad_norm": 0.9030733028122518, "learning_rate": 1.1606834255733298e-07, "loss": 1.4622, "step": 3353 }, { "epoch": 0.6961394769613948, "grad_norm": 0.7261027838379019, "learning_rate": 1.1597253676046034e-07, "loss": 1.5268, "step": 3354 }, { "epoch": 0.6963470319634704, "grad_norm": 0.820286833260227, "learning_rate": 1.1587677519330323e-07, "loss": 1.5128, "step": 3355 }, { "epoch": 0.6965545869655458, "grad_norm": 0.6825507510364497, "learning_rate": 1.1578105789657709e-07, "loss": 1.5899, "step": 3356 }, { "epoch": 0.6967621419676214, "grad_norm": 0.6792810922204955, "learning_rate": 1.1568538491097824e-07, "loss": 1.4961, "step": 3357 }, { "epoch": 0.696969696969697, "grad_norm": 1.0137170584353286, "learning_rate": 1.1558975627718435e-07, "loss": 1.5192, "step": 3358 }, { "epoch": 0.6971772519717725, "grad_norm": 0.7233152589245139, "learning_rate": 1.1549417203585417e-07, "loss": 1.5873, "step": 3359 }, { "epoch": 0.6973848069738481, "grad_norm": 0.7756375127051249, "learning_rate": 1.1539863222762747e-07, "loss": 1.5078, "step": 3360 }, { "epoch": 0.6975923619759237, "grad_norm": 1.080794483931704, "learning_rate": 1.1530313689312542e-07, "loss": 1.4975, "step": 3361 }, { "epoch": 0.6977999169779991, "grad_norm": 0.8838499449263395, "learning_rate": 1.1520768607295002e-07, "loss": 1.6161, "step": 3362 }, { "epoch": 0.6980074719800747, "grad_norm": 0.792520258563017, "learning_rate": 1.1511227980768441e-07, "loss": 1.5134, "step": 3363 }, { "epoch": 0.6982150269821503, "grad_norm": 0.7875389931040292, "learning_rate": 1.1501691813789278e-07, "loss": 1.4809, "step": 3364 }, { "epoch": 0.6984225819842258, "grad_norm": 0.7777011386628931, "learning_rate": 1.1492160110412053e-07, "loss": 1.5742, "step": 3365 }, { "epoch": 0.6986301369863014, "grad_norm": 1.0701278615616723, "learning_rate": 1.1482632874689367e-07, "loss": 1.5065, "step": 3366 }, { "epoch": 0.698837691988377, "grad_norm": 1.1254047769170288, "learning_rate": 1.1473110110671971e-07, "loss": 1.5128, "step": 3367 }, { "epoch": 0.6990452469904525, "grad_norm": 0.6609901295752498, "learning_rate": 1.1463591822408683e-07, "loss": 1.5167, "step": 3368 }, { "epoch": 0.699252801992528, "grad_norm": 0.835998566213407, "learning_rate": 1.1454078013946424e-07, "loss": 1.5469, "step": 3369 }, { "epoch": 0.6994603569946035, "grad_norm": 1.156215741766386, "learning_rate": 1.1444568689330218e-07, "loss": 1.5647, "step": 3370 }, { "epoch": 0.6996679119966791, "grad_norm": 0.7445806399656314, "learning_rate": 1.1435063852603168e-07, "loss": 1.513, "step": 3371 }, { "epoch": 0.6998754669987547, "grad_norm": 1.1131369662749901, "learning_rate": 1.1425563507806489e-07, "loss": 1.5949, "step": 3372 }, { "epoch": 0.7000830220008302, "grad_norm": 0.8551760962085827, "learning_rate": 1.1416067658979473e-07, "loss": 1.4022, "step": 3373 }, { "epoch": 0.7002905770029058, "grad_norm": 0.781042201990767, "learning_rate": 1.14065763101595e-07, "loss": 1.518, "step": 3374 }, { "epoch": 0.7004981320049813, "grad_norm": 0.7313206751075126, "learning_rate": 1.1397089465382035e-07, "loss": 1.4871, "step": 3375 }, { "epoch": 0.7007056870070568, "grad_norm": 0.8077026460850166, "learning_rate": 1.138760712868065e-07, "loss": 1.4924, "step": 3376 }, { "epoch": 0.7009132420091324, "grad_norm": 1.0750350606176107, "learning_rate": 1.1378129304086959e-07, "loss": 1.5342, "step": 3377 }, { "epoch": 0.701120797011208, "grad_norm": 5.540574661631686, "learning_rate": 1.1368655995630698e-07, "loss": 1.5273, "step": 3378 }, { "epoch": 0.7013283520132835, "grad_norm": 0.894835760034304, "learning_rate": 1.1359187207339665e-07, "loss": 1.4689, "step": 3379 }, { "epoch": 0.7015359070153591, "grad_norm": 0.7055104736417149, "learning_rate": 1.1349722943239731e-07, "loss": 1.566, "step": 3380 }, { "epoch": 0.7017434620174346, "grad_norm": 0.910771424105918, "learning_rate": 1.1340263207354845e-07, "loss": 1.5529, "step": 3381 }, { "epoch": 0.7019510170195101, "grad_norm": 0.8460787939571708, "learning_rate": 1.1330808003707053e-07, "loss": 1.4856, "step": 3382 }, { "epoch": 0.7021585720215857, "grad_norm": 0.6502972224687663, "learning_rate": 1.1321357336316443e-07, "loss": 1.5609, "step": 3383 }, { "epoch": 0.7023661270236613, "grad_norm": 0.797498816732242, "learning_rate": 1.1311911209201195e-07, "loss": 1.5528, "step": 3384 }, { "epoch": 0.7025736820257368, "grad_norm": 0.6798084192841621, "learning_rate": 1.1302469626377549e-07, "loss": 1.5647, "step": 3385 }, { "epoch": 0.7027812370278124, "grad_norm": 0.9007867850587731, "learning_rate": 1.1293032591859808e-07, "loss": 1.5564, "step": 3386 }, { "epoch": 0.702988792029888, "grad_norm": 0.8667662349900563, "learning_rate": 1.1283600109660359e-07, "loss": 1.495, "step": 3387 }, { "epoch": 0.7031963470319634, "grad_norm": 1.8222121137183087, "learning_rate": 1.1274172183789641e-07, "loss": 1.5639, "step": 3388 }, { "epoch": 0.703403902034039, "grad_norm": 0.8155154265528134, "learning_rate": 1.1264748818256155e-07, "loss": 1.5187, "step": 3389 }, { "epoch": 0.7036114570361146, "grad_norm": 1.0159828103052049, "learning_rate": 1.1255330017066458e-07, "loss": 1.4738, "step": 3390 }, { "epoch": 0.7038190120381901, "grad_norm": 0.9553159692328533, "learning_rate": 1.1245915784225202e-07, "loss": 1.5887, "step": 3391 }, { "epoch": 0.7040265670402657, "grad_norm": 0.8003320912955327, "learning_rate": 1.123650612373503e-07, "loss": 1.5768, "step": 3392 }, { "epoch": 0.7042341220423413, "grad_norm": 1.4264326332110921, "learning_rate": 1.1227101039596712e-07, "loss": 1.4663, "step": 3393 }, { "epoch": 0.7044416770444167, "grad_norm": 0.7296481548596724, "learning_rate": 1.1217700535809025e-07, "loss": 1.5041, "step": 3394 }, { "epoch": 0.7046492320464923, "grad_norm": 0.7214947984550868, "learning_rate": 1.1208304616368816e-07, "loss": 1.5116, "step": 3395 }, { "epoch": 0.7048567870485679, "grad_norm": 3.6798468607206267, "learning_rate": 1.1198913285270975e-07, "loss": 1.5258, "step": 3396 }, { "epoch": 0.7050643420506434, "grad_norm": 0.6495861512302876, "learning_rate": 1.1189526546508458e-07, "loss": 1.4752, "step": 3397 }, { "epoch": 0.705271897052719, "grad_norm": 0.8641430139462923, "learning_rate": 1.1180144404072253e-07, "loss": 1.5091, "step": 3398 }, { "epoch": 0.7054794520547946, "grad_norm": 1.2306453664608021, "learning_rate": 1.1170766861951389e-07, "loss": 1.4493, "step": 3399 }, { "epoch": 0.70568700705687, "grad_norm": 0.7239116576551664, "learning_rate": 1.1161393924132972e-07, "loss": 1.4877, "step": 3400 }, { "epoch": 0.7058945620589456, "grad_norm": 1.2269671774796294, "learning_rate": 1.1152025594602093e-07, "loss": 1.5184, "step": 3401 }, { "epoch": 0.7061021170610212, "grad_norm": 0.6758102647485595, "learning_rate": 1.1142661877341942e-07, "loss": 1.5247, "step": 3402 }, { "epoch": 0.7063096720630967, "grad_norm": 0.8147318006256409, "learning_rate": 1.1133302776333713e-07, "loss": 1.5336, "step": 3403 }, { "epoch": 0.7065172270651723, "grad_norm": 1.178490597600748, "learning_rate": 1.112394829555665e-07, "loss": 1.4254, "step": 3404 }, { "epoch": 0.7067247820672479, "grad_norm": 0.8676264160748454, "learning_rate": 1.1114598438988023e-07, "loss": 1.4749, "step": 3405 }, { "epoch": 0.7069323370693233, "grad_norm": 0.8391086821680002, "learning_rate": 1.1105253210603163e-07, "loss": 1.5734, "step": 3406 }, { "epoch": 0.7071398920713989, "grad_norm": 0.7113711131535023, "learning_rate": 1.1095912614375384e-07, "loss": 1.5361, "step": 3407 }, { "epoch": 0.7073474470734745, "grad_norm": 0.6918596428886382, "learning_rate": 1.1086576654276081e-07, "loss": 1.5083, "step": 3408 }, { "epoch": 0.70755500207555, "grad_norm": 0.7689142284314142, "learning_rate": 1.1077245334274649e-07, "loss": 1.557, "step": 3409 }, { "epoch": 0.7077625570776256, "grad_norm": 0.7011428368836177, "learning_rate": 1.106791865833852e-07, "loss": 1.5308, "step": 3410 }, { "epoch": 0.7079701120797012, "grad_norm": 1.5249158426303928, "learning_rate": 1.1058596630433141e-07, "loss": 1.4557, "step": 3411 }, { "epoch": 0.7081776670817767, "grad_norm": 0.6853799161808818, "learning_rate": 1.1049279254522002e-07, "loss": 1.4946, "step": 3412 }, { "epoch": 0.7083852220838522, "grad_norm": 0.6944368727896674, "learning_rate": 1.1039966534566598e-07, "loss": 1.4359, "step": 3413 }, { "epoch": 0.7085927770859277, "grad_norm": 0.8965955726509883, "learning_rate": 1.1030658474526453e-07, "loss": 1.5478, "step": 3414 }, { "epoch": 0.7088003320880033, "grad_norm": 1.8059073891547446, "learning_rate": 1.1021355078359105e-07, "loss": 1.5069, "step": 3415 }, { "epoch": 0.7090078870900789, "grad_norm": 0.9499686968573035, "learning_rate": 1.1012056350020103e-07, "loss": 1.5489, "step": 3416 }, { "epoch": 0.7092154420921544, "grad_norm": 1.6790197019262487, "learning_rate": 1.1002762293463041e-07, "loss": 1.4396, "step": 3417 }, { "epoch": 0.70942299709423, "grad_norm": 1.6726372354046732, "learning_rate": 1.0993472912639483e-07, "loss": 1.5242, "step": 3418 }, { "epoch": 0.7096305520963055, "grad_norm": 0.6870671077589678, "learning_rate": 1.0984188211499037e-07, "loss": 1.4819, "step": 3419 }, { "epoch": 0.709838107098381, "grad_norm": 0.7986800560958398, "learning_rate": 1.0974908193989314e-07, "loss": 1.4839, "step": 3420 }, { "epoch": 0.7100456621004566, "grad_norm": 0.701225474841331, "learning_rate": 1.0965632864055923e-07, "loss": 1.5095, "step": 3421 }, { "epoch": 0.7102532171025322, "grad_norm": 0.9000052779641537, "learning_rate": 1.095636222564249e-07, "loss": 1.4381, "step": 3422 }, { "epoch": 0.7104607721046077, "grad_norm": 0.9760847601231115, "learning_rate": 1.0947096282690651e-07, "loss": 1.5768, "step": 3423 }, { "epoch": 0.7106683271066833, "grad_norm": 0.7177336090841828, "learning_rate": 1.0937835039140036e-07, "loss": 1.4964, "step": 3424 }, { "epoch": 0.7108758821087588, "grad_norm": 1.1542762072593606, "learning_rate": 1.0928578498928271e-07, "loss": 1.6029, "step": 3425 }, { "epoch": 0.7110834371108343, "grad_norm": 2.087455477893475, "learning_rate": 1.0919326665991014e-07, "loss": 1.5359, "step": 3426 }, { "epoch": 0.7112909921129099, "grad_norm": 0.7889164796682878, "learning_rate": 1.0910079544261867e-07, "loss": 1.588, "step": 3427 }, { "epoch": 0.7114985471149855, "grad_norm": 0.7167140538992997, "learning_rate": 1.0900837137672487e-07, "loss": 1.5114, "step": 3428 }, { "epoch": 0.711706102117061, "grad_norm": 0.7436177106588999, "learning_rate": 1.0891599450152488e-07, "loss": 1.5227, "step": 3429 }, { "epoch": 0.7119136571191366, "grad_norm": 0.7285124348259286, "learning_rate": 1.0882366485629493e-07, "loss": 1.5462, "step": 3430 }, { "epoch": 0.7121212121212122, "grad_norm": 1.6351554078354409, "learning_rate": 1.0873138248029103e-07, "loss": 1.5069, "step": 3431 }, { "epoch": 0.7123287671232876, "grad_norm": 1.0571591945968657, "learning_rate": 1.0863914741274944e-07, "loss": 1.6116, "step": 3432 }, { "epoch": 0.7125363221253632, "grad_norm": 0.7444166242792438, "learning_rate": 1.0854695969288576e-07, "loss": 1.4893, "step": 3433 }, { "epoch": 0.7127438771274388, "grad_norm": 0.7116820816687842, "learning_rate": 1.0845481935989598e-07, "loss": 1.5422, "step": 3434 }, { "epoch": 0.7129514321295143, "grad_norm": 1.1038395448310774, "learning_rate": 1.0836272645295567e-07, "loss": 1.5807, "step": 3435 }, { "epoch": 0.7131589871315899, "grad_norm": 1.0974574100948222, "learning_rate": 1.082706810112202e-07, "loss": 1.4822, "step": 3436 }, { "epoch": 0.7133665421336655, "grad_norm": 0.7683395681093691, "learning_rate": 1.0817868307382499e-07, "loss": 1.5411, "step": 3437 }, { "epoch": 0.7135740971357409, "grad_norm": 0.7559514818352393, "learning_rate": 1.0808673267988501e-07, "loss": 1.5509, "step": 3438 }, { "epoch": 0.7137816521378165, "grad_norm": 1.1196120224305883, "learning_rate": 1.0799482986849517e-07, "loss": 1.5054, "step": 3439 }, { "epoch": 0.7139892071398921, "grad_norm": 0.6249036262364656, "learning_rate": 1.0790297467873006e-07, "loss": 1.4911, "step": 3440 }, { "epoch": 0.7141967621419676, "grad_norm": 0.67064140653349, "learning_rate": 1.0781116714964425e-07, "loss": 1.5888, "step": 3441 }, { "epoch": 0.7144043171440432, "grad_norm": 0.7229316229756434, "learning_rate": 1.0771940732027158e-07, "loss": 1.5085, "step": 3442 }, { "epoch": 0.7146118721461188, "grad_norm": 1.0515234357392989, "learning_rate": 1.0762769522962609e-07, "loss": 1.5273, "step": 3443 }, { "epoch": 0.7148194271481942, "grad_norm": 0.8279528336345802, "learning_rate": 1.075360309167013e-07, "loss": 1.5189, "step": 3444 }, { "epoch": 0.7150269821502698, "grad_norm": 0.6841549818736408, "learning_rate": 1.0744441442047038e-07, "loss": 1.5353, "step": 3445 }, { "epoch": 0.7152345371523454, "grad_norm": 1.0487289234961572, "learning_rate": 1.0735284577988624e-07, "loss": 1.6083, "step": 3446 }, { "epoch": 0.7154420921544209, "grad_norm": 0.9674360105650714, "learning_rate": 1.0726132503388157e-07, "loss": 1.5686, "step": 3447 }, { "epoch": 0.7156496471564965, "grad_norm": 0.765445488702532, "learning_rate": 1.0716985222136833e-07, "loss": 1.5377, "step": 3448 }, { "epoch": 0.7158572021585721, "grad_norm": 0.7100025050864576, "learning_rate": 1.0707842738123853e-07, "loss": 1.5377, "step": 3449 }, { "epoch": 0.7160647571606475, "grad_norm": 0.7078256190335965, "learning_rate": 1.0698705055236347e-07, "loss": 1.491, "step": 3450 }, { "epoch": 0.7162723121627231, "grad_norm": 0.8198441173871723, "learning_rate": 1.0689572177359419e-07, "loss": 1.5363, "step": 3451 }, { "epoch": 0.7164798671647987, "grad_norm": 2.647231521430732, "learning_rate": 1.0680444108376128e-07, "loss": 1.5115, "step": 3452 }, { "epoch": 0.7166874221668742, "grad_norm": 0.6459301821916063, "learning_rate": 1.0671320852167487e-07, "loss": 1.5117, "step": 3453 }, { "epoch": 0.7168949771689498, "grad_norm": 1.2455895148294254, "learning_rate": 1.066220241261246e-07, "loss": 1.6328, "step": 3454 }, { "epoch": 0.7171025321710254, "grad_norm": 1.0920924776240006, "learning_rate": 1.0653088793587959e-07, "loss": 1.6182, "step": 3455 }, { "epoch": 0.7173100871731009, "grad_norm": 0.6943053006762021, "learning_rate": 1.0643979998968874e-07, "loss": 1.5335, "step": 3456 }, { "epoch": 0.7175176421751764, "grad_norm": 0.8267328563694016, "learning_rate": 1.0634876032627995e-07, "loss": 1.4867, "step": 3457 }, { "epoch": 0.7177251971772519, "grad_norm": 0.6788692500886616, "learning_rate": 1.0625776898436109e-07, "loss": 1.4977, "step": 3458 }, { "epoch": 0.7179327521793275, "grad_norm": 1.2025370458462237, "learning_rate": 1.0616682600261908e-07, "loss": 1.5043, "step": 3459 }, { "epoch": 0.7181403071814031, "grad_norm": 0.9235647677862718, "learning_rate": 1.0607593141972065e-07, "loss": 1.4992, "step": 3460 }, { "epoch": 0.7183478621834786, "grad_norm": 0.7197334293487501, "learning_rate": 1.059850852743116e-07, "loss": 1.4912, "step": 3461 }, { "epoch": 0.7185554171855542, "grad_norm": 0.7836750158583452, "learning_rate": 1.0589428760501735e-07, "loss": 1.5512, "step": 3462 }, { "epoch": 0.7187629721876297, "grad_norm": 0.9897917405017157, "learning_rate": 1.0580353845044272e-07, "loss": 1.4891, "step": 3463 }, { "epoch": 0.7189705271897052, "grad_norm": 0.7329757449722253, "learning_rate": 1.057128378491718e-07, "loss": 1.5052, "step": 3464 }, { "epoch": 0.7191780821917808, "grad_norm": 1.3481118957459333, "learning_rate": 1.0562218583976808e-07, "loss": 1.467, "step": 3465 }, { "epoch": 0.7193856371938564, "grad_norm": 0.8198451687075072, "learning_rate": 1.0553158246077432e-07, "loss": 1.5052, "step": 3466 }, { "epoch": 0.7195931921959319, "grad_norm": 0.7314624809062473, "learning_rate": 1.0544102775071286e-07, "loss": 1.5383, "step": 3467 }, { "epoch": 0.7198007471980075, "grad_norm": 1.2523254175250966, "learning_rate": 1.053505217480849e-07, "loss": 1.5512, "step": 3468 }, { "epoch": 0.720008302200083, "grad_norm": 0.7628103383523822, "learning_rate": 1.052600644913714e-07, "loss": 1.5053, "step": 3469 }, { "epoch": 0.7202158572021585, "grad_norm": 0.7908765517854673, "learning_rate": 1.0516965601903229e-07, "loss": 1.5232, "step": 3470 }, { "epoch": 0.7204234122042341, "grad_norm": 0.7443546871985859, "learning_rate": 1.0507929636950686e-07, "loss": 1.5312, "step": 3471 }, { "epoch": 0.7206309672063097, "grad_norm": 5.823675098534744, "learning_rate": 1.0498898558121361e-07, "loss": 1.5589, "step": 3472 }, { "epoch": 0.7208385222083852, "grad_norm": 0.9633263239700761, "learning_rate": 1.0489872369255044e-07, "loss": 1.4893, "step": 3473 }, { "epoch": 0.7210460772104608, "grad_norm": 0.9445556149735829, "learning_rate": 1.0480851074189405e-07, "loss": 1.5584, "step": 3474 }, { "epoch": 0.7212536322125364, "grad_norm": 0.7408658572192602, "learning_rate": 1.047183467676008e-07, "loss": 1.5427, "step": 3475 }, { "epoch": 0.7214611872146118, "grad_norm": 0.7182503479605851, "learning_rate": 1.0462823180800592e-07, "loss": 1.5319, "step": 3476 }, { "epoch": 0.7216687422166874, "grad_norm": 0.6768545648627804, "learning_rate": 1.045381659014239e-07, "loss": 1.4635, "step": 3477 }, { "epoch": 0.721876297218763, "grad_norm": 0.9073556237571551, "learning_rate": 1.0444814908614847e-07, "loss": 1.5581, "step": 3478 }, { "epoch": 0.7220838522208385, "grad_norm": 0.7015626924716445, "learning_rate": 1.0435818140045232e-07, "loss": 1.4892, "step": 3479 }, { "epoch": 0.7222914072229141, "grad_norm": 0.927316628980158, "learning_rate": 1.0426826288258732e-07, "loss": 1.5168, "step": 3480 }, { "epoch": 0.7224989622249897, "grad_norm": 0.6851525828446593, "learning_rate": 1.0417839357078438e-07, "loss": 1.498, "step": 3481 }, { "epoch": 0.7227065172270651, "grad_norm": 1.3886246675256522, "learning_rate": 1.0408857350325373e-07, "loss": 1.4486, "step": 3482 }, { "epoch": 0.7229140722291407, "grad_norm": 0.9593852785656315, "learning_rate": 1.039988027181842e-07, "loss": 1.5566, "step": 3483 }, { "epoch": 0.7231216272312163, "grad_norm": 0.7535439580518706, "learning_rate": 1.039090812537442e-07, "loss": 1.5244, "step": 3484 }, { "epoch": 0.7233291822332918, "grad_norm": 1.0377846126660604, "learning_rate": 1.0381940914808079e-07, "loss": 1.5072, "step": 3485 }, { "epoch": 0.7235367372353674, "grad_norm": 0.6944576851705965, "learning_rate": 1.0372978643932017e-07, "loss": 1.4807, "step": 3486 }, { "epoch": 0.723744292237443, "grad_norm": 1.365358618798681, "learning_rate": 1.0364021316556753e-07, "loss": 1.5216, "step": 3487 }, { "epoch": 0.7239518472395184, "grad_norm": 1.837688085476273, "learning_rate": 1.0355068936490714e-07, "loss": 1.6805, "step": 3488 }, { "epoch": 0.724159402241594, "grad_norm": 1.0460044808421052, "learning_rate": 1.0346121507540207e-07, "loss": 1.5613, "step": 3489 }, { "epoch": 0.7243669572436696, "grad_norm": 1.3950140663041959, "learning_rate": 1.033717903350944e-07, "loss": 1.5042, "step": 3490 }, { "epoch": 0.7245745122457451, "grad_norm": 0.7513321151996887, "learning_rate": 1.032824151820052e-07, "loss": 1.4694, "step": 3491 }, { "epoch": 0.7247820672478207, "grad_norm": 0.7629524847712228, "learning_rate": 1.0319308965413432e-07, "loss": 1.4591, "step": 3492 }, { "epoch": 0.7249896222498963, "grad_norm": 0.9518255905102921, "learning_rate": 1.0310381378946073e-07, "loss": 1.5944, "step": 3493 }, { "epoch": 0.7251971772519717, "grad_norm": 0.6162696343292873, "learning_rate": 1.0301458762594211e-07, "loss": 1.5008, "step": 3494 }, { "epoch": 0.7254047322540473, "grad_norm": 0.8022839055270045, "learning_rate": 1.0292541120151504e-07, "loss": 1.511, "step": 3495 }, { "epoch": 0.7256122872561229, "grad_norm": 1.0940071065748838, "learning_rate": 1.0283628455409492e-07, "loss": 1.5909, "step": 3496 }, { "epoch": 0.7258198422581984, "grad_norm": 1.1709218608870133, "learning_rate": 1.0274720772157621e-07, "loss": 1.5549, "step": 3497 }, { "epoch": 0.726027397260274, "grad_norm": 0.9878535934356821, "learning_rate": 1.0265818074183173e-07, "loss": 1.461, "step": 3498 }, { "epoch": 0.7262349522623496, "grad_norm": 0.9975102925754061, "learning_rate": 1.0256920365271364e-07, "loss": 1.5375, "step": 3499 }, { "epoch": 0.726442507264425, "grad_norm": 0.8032812596782868, "learning_rate": 1.0248027649205258e-07, "loss": 1.5211, "step": 3500 }, { "epoch": 0.7266500622665006, "grad_norm": 1.060000298412728, "learning_rate": 1.0239139929765794e-07, "loss": 1.5059, "step": 3501 }, { "epoch": 0.7268576172685761, "grad_norm": 1.0244438171236696, "learning_rate": 1.0230257210731798e-07, "loss": 1.5762, "step": 3502 }, { "epoch": 0.7270651722706517, "grad_norm": 0.9245184548745388, "learning_rate": 1.0221379495879964e-07, "loss": 1.5557, "step": 3503 }, { "epoch": 0.7272727272727273, "grad_norm": 0.7810132847151354, "learning_rate": 1.0212506788984869e-07, "loss": 1.5716, "step": 3504 }, { "epoch": 0.7274802822748028, "grad_norm": 0.7138421150836209, "learning_rate": 1.0203639093818948e-07, "loss": 1.5515, "step": 3505 }, { "epoch": 0.7276878372768784, "grad_norm": 1.1715676253441438, "learning_rate": 1.0194776414152508e-07, "loss": 1.5042, "step": 3506 }, { "epoch": 0.727895392278954, "grad_norm": 0.9403212754332279, "learning_rate": 1.018591875375372e-07, "loss": 1.4847, "step": 3507 }, { "epoch": 0.7281029472810294, "grad_norm": 0.8960275640053457, "learning_rate": 1.0177066116388644e-07, "loss": 1.5453, "step": 3508 }, { "epoch": 0.728310502283105, "grad_norm": 0.6534933979268344, "learning_rate": 1.016821850582116e-07, "loss": 1.4915, "step": 3509 }, { "epoch": 0.7285180572851806, "grad_norm": 0.7161478778129308, "learning_rate": 1.0159375925813058e-07, "loss": 1.4892, "step": 3510 }, { "epoch": 0.7287256122872561, "grad_norm": 0.7409351012650822, "learning_rate": 1.0150538380123963e-07, "loss": 1.5268, "step": 3511 }, { "epoch": 0.7289331672893317, "grad_norm": 0.6564319732829017, "learning_rate": 1.0141705872511366e-07, "loss": 1.5091, "step": 3512 }, { "epoch": 0.7291407222914073, "grad_norm": 0.8775484123968684, "learning_rate": 1.0132878406730601e-07, "loss": 1.5611, "step": 3513 }, { "epoch": 0.7293482772934827, "grad_norm": 0.7175917073983599, "learning_rate": 1.0124055986534894e-07, "loss": 1.494, "step": 3514 }, { "epoch": 0.7295558322955583, "grad_norm": 0.8688244746594186, "learning_rate": 1.0115238615675287e-07, "loss": 1.411, "step": 3515 }, { "epoch": 0.7297633872976339, "grad_norm": 0.9891404415327963, "learning_rate": 1.0106426297900705e-07, "loss": 1.4801, "step": 3516 }, { "epoch": 0.7299709422997094, "grad_norm": 0.8288369506330701, "learning_rate": 1.0097619036957903e-07, "loss": 1.5297, "step": 3517 }, { "epoch": 0.730178497301785, "grad_norm": 0.6572737414427684, "learning_rate": 1.008881683659149e-07, "loss": 1.564, "step": 3518 }, { "epoch": 0.7303860523038606, "grad_norm": 1.139785157304229, "learning_rate": 1.0080019700543938e-07, "loss": 1.6262, "step": 3519 }, { "epoch": 0.730593607305936, "grad_norm": 0.7304149356373166, "learning_rate": 1.007122763255555e-07, "loss": 1.5035, "step": 3520 }, { "epoch": 0.7308011623080116, "grad_norm": 0.7315328606683559, "learning_rate": 1.0062440636364487e-07, "loss": 1.4751, "step": 3521 }, { "epoch": 0.7310087173100872, "grad_norm": 0.6663624177177068, "learning_rate": 1.0053658715706732e-07, "loss": 1.5196, "step": 3522 }, { "epoch": 0.7312162723121627, "grad_norm": 0.8689010473345146, "learning_rate": 1.0044881874316147e-07, "loss": 1.4941, "step": 3523 }, { "epoch": 0.7314238273142383, "grad_norm": 0.8804783563845247, "learning_rate": 1.0036110115924388e-07, "loss": 1.5124, "step": 3524 }, { "epoch": 0.7316313823163139, "grad_norm": 0.7191593442233523, "learning_rate": 1.0027343444260989e-07, "loss": 1.5318, "step": 3525 }, { "epoch": 0.7318389373183893, "grad_norm": 0.8240079618409041, "learning_rate": 1.0018581863053302e-07, "loss": 1.5371, "step": 3526 }, { "epoch": 0.7320464923204649, "grad_norm": 0.8446927467521336, "learning_rate": 1.0009825376026517e-07, "loss": 1.5352, "step": 3527 }, { "epoch": 0.7322540473225405, "grad_norm": 0.7817214014588983, "learning_rate": 1.0001073986903655e-07, "loss": 1.4841, "step": 3528 }, { "epoch": 0.732461602324616, "grad_norm": 0.7302446948987851, "learning_rate": 9.992327699405587e-08, "loss": 1.5353, "step": 3529 }, { "epoch": 0.7326691573266916, "grad_norm": 0.8350525115135822, "learning_rate": 9.983586517250995e-08, "loss": 1.552, "step": 3530 }, { "epoch": 0.7328767123287672, "grad_norm": 0.6675648454335639, "learning_rate": 9.974850444156393e-08, "loss": 1.5087, "step": 3531 }, { "epoch": 0.7330842673308426, "grad_norm": 0.7316053247659076, "learning_rate": 9.966119483836144e-08, "loss": 1.5407, "step": 3532 }, { "epoch": 0.7332918223329182, "grad_norm": 0.781601385051515, "learning_rate": 9.957393640002398e-08, "loss": 1.4458, "step": 3533 }, { "epoch": 0.7334993773349938, "grad_norm": 0.7570949871519662, "learning_rate": 9.948672916365172e-08, "loss": 1.514, "step": 3534 }, { "epoch": 0.7337069323370693, "grad_norm": 0.7145193620209511, "learning_rate": 9.939957316632273e-08, "loss": 1.5104, "step": 3535 }, { "epoch": 0.7339144873391449, "grad_norm": 0.7024855382939132, "learning_rate": 9.931246844509349e-08, "loss": 1.6003, "step": 3536 }, { "epoch": 0.7341220423412205, "grad_norm": 0.7871894616169899, "learning_rate": 9.922541503699854e-08, "loss": 1.571, "step": 3537 }, { "epoch": 0.734329597343296, "grad_norm": 1.0107601811326696, "learning_rate": 9.913841297905084e-08, "loss": 1.5042, "step": 3538 }, { "epoch": 0.7345371523453715, "grad_norm": 0.6877909911620147, "learning_rate": 9.905146230824111e-08, "loss": 1.4833, "step": 3539 }, { "epoch": 0.7347447073474471, "grad_norm": 0.6940409940112109, "learning_rate": 9.89645630615387e-08, "loss": 1.4972, "step": 3540 }, { "epoch": 0.7349522623495226, "grad_norm": 0.9914106869704771, "learning_rate": 9.887771527589076e-08, "loss": 1.5252, "step": 3541 }, { "epoch": 0.7351598173515982, "grad_norm": 0.7530933879533854, "learning_rate": 9.879091898822259e-08, "loss": 1.5458, "step": 3542 }, { "epoch": 0.7353673723536738, "grad_norm": 0.7820889112215141, "learning_rate": 9.870417423543783e-08, "loss": 1.5081, "step": 3543 }, { "epoch": 0.7355749273557493, "grad_norm": 7.343112071419662, "learning_rate": 9.861748105441796e-08, "loss": 1.5927, "step": 3544 }, { "epoch": 0.7357824823578248, "grad_norm": 1.1255535248113777, "learning_rate": 9.853083948202262e-08, "loss": 1.557, "step": 3545 }, { "epoch": 0.7359900373599004, "grad_norm": 0.7464328021477331, "learning_rate": 9.844424955508946e-08, "loss": 1.5543, "step": 3546 }, { "epoch": 0.7361975923619759, "grad_norm": 1.1609289607772046, "learning_rate": 9.835771131043437e-08, "loss": 1.5586, "step": 3547 }, { "epoch": 0.7364051473640515, "grad_norm": 0.745131720083381, "learning_rate": 9.827122478485091e-08, "loss": 1.4728, "step": 3548 }, { "epoch": 0.736612702366127, "grad_norm": 1.1781646569331006, "learning_rate": 9.818479001511108e-08, "loss": 1.5383, "step": 3549 }, { "epoch": 0.7368202573682026, "grad_norm": 0.916623850817447, "learning_rate": 9.809840703796436e-08, "loss": 1.5477, "step": 3550 }, { "epoch": 0.7370278123702781, "grad_norm": 1.1912368837048912, "learning_rate": 9.801207589013877e-08, "loss": 1.4877, "step": 3551 }, { "epoch": 0.7372353673723536, "grad_norm": 0.6688069036968155, "learning_rate": 9.79257966083399e-08, "loss": 1.5085, "step": 3552 }, { "epoch": 0.7374429223744292, "grad_norm": 0.8349703665188563, "learning_rate": 9.783956922925143e-08, "loss": 1.4892, "step": 3553 }, { "epoch": 0.7376504773765048, "grad_norm": 0.7333367595107295, "learning_rate": 9.775339378953489e-08, "loss": 1.583, "step": 3554 }, { "epoch": 0.7378580323785803, "grad_norm": 1.0229688919066944, "learning_rate": 9.766727032582991e-08, "loss": 1.5252, "step": 3555 }, { "epoch": 0.7380655873806559, "grad_norm": 0.9033812777674429, "learning_rate": 9.758119887475383e-08, "loss": 1.4958, "step": 3556 }, { "epoch": 0.7382731423827315, "grad_norm": 1.0052481750564066, "learning_rate": 9.74951794729019e-08, "loss": 1.4912, "step": 3557 }, { "epoch": 0.7384806973848069, "grad_norm": 0.6936452152040502, "learning_rate": 9.740921215684746e-08, "loss": 1.4876, "step": 3558 }, { "epoch": 0.7386882523868825, "grad_norm": 0.6390009641008648, "learning_rate": 9.732329696314128e-08, "loss": 1.5676, "step": 3559 }, { "epoch": 0.7388958073889581, "grad_norm": 0.7461845916104995, "learning_rate": 9.723743392831242e-08, "loss": 1.5522, "step": 3560 }, { "epoch": 0.7391033623910336, "grad_norm": 1.1391861126335914, "learning_rate": 9.715162308886748e-08, "loss": 1.4372, "step": 3561 }, { "epoch": 0.7393109173931092, "grad_norm": 1.0590953767184115, "learning_rate": 9.706586448129098e-08, "loss": 1.5627, "step": 3562 }, { "epoch": 0.7395184723951848, "grad_norm": 1.0695954128895118, "learning_rate": 9.698015814204513e-08, "loss": 1.4766, "step": 3563 }, { "epoch": 0.7397260273972602, "grad_norm": 0.730742232955605, "learning_rate": 9.689450410757015e-08, "loss": 1.4653, "step": 3564 }, { "epoch": 0.7399335823993358, "grad_norm": 1.2244434674195188, "learning_rate": 9.680890241428366e-08, "loss": 1.5253, "step": 3565 }, { "epoch": 0.7401411374014114, "grad_norm": 0.6902041798656504, "learning_rate": 9.672335309858136e-08, "loss": 1.5975, "step": 3566 }, { "epoch": 0.7403486924034869, "grad_norm": 0.9870079348127876, "learning_rate": 9.663785619683654e-08, "loss": 1.5879, "step": 3567 }, { "epoch": 0.7405562474055625, "grad_norm": 0.8590093097119726, "learning_rate": 9.65524117454001e-08, "loss": 1.5186, "step": 3568 }, { "epoch": 0.7407638024076381, "grad_norm": 0.9991589484120666, "learning_rate": 9.646701978060092e-08, "loss": 1.5371, "step": 3569 }, { "epoch": 0.7409713574097135, "grad_norm": 0.8045653839935082, "learning_rate": 9.63816803387453e-08, "loss": 1.5223, "step": 3570 }, { "epoch": 0.7411789124117891, "grad_norm": 1.055883259237904, "learning_rate": 9.629639345611733e-08, "loss": 1.5017, "step": 3571 }, { "epoch": 0.7413864674138647, "grad_norm": 0.6599155835299202, "learning_rate": 9.621115916897863e-08, "loss": 1.5045, "step": 3572 }, { "epoch": 0.7415940224159402, "grad_norm": 0.7489401737430197, "learning_rate": 9.612597751356881e-08, "loss": 1.5073, "step": 3573 }, { "epoch": 0.7418015774180158, "grad_norm": 1.0100408628632684, "learning_rate": 9.604084852610455e-08, "loss": 1.4853, "step": 3574 }, { "epoch": 0.7420091324200914, "grad_norm": 1.3088106546240301, "learning_rate": 9.59557722427806e-08, "loss": 1.5599, "step": 3575 }, { "epoch": 0.7422166874221668, "grad_norm": 1.0287162147288085, "learning_rate": 9.587074869976913e-08, "loss": 1.4606, "step": 3576 }, { "epoch": 0.7424242424242424, "grad_norm": 1.8692419602648973, "learning_rate": 9.578577793321987e-08, "loss": 1.5265, "step": 3577 }, { "epoch": 0.742631797426318, "grad_norm": 0.7643858945820965, "learning_rate": 9.570085997926007e-08, "loss": 1.6107, "step": 3578 }, { "epoch": 0.7428393524283935, "grad_norm": 1.2367791797192762, "learning_rate": 9.561599487399477e-08, "loss": 1.5758, "step": 3579 }, { "epoch": 0.7430469074304691, "grad_norm": 1.1170930395952943, "learning_rate": 9.553118265350612e-08, "loss": 1.5485, "step": 3580 }, { "epoch": 0.7432544624325447, "grad_norm": 0.8440723816098336, "learning_rate": 9.54464233538542e-08, "loss": 1.562, "step": 3581 }, { "epoch": 0.7434620174346201, "grad_norm": 0.7638712575843175, "learning_rate": 9.536171701107638e-08, "loss": 1.47, "step": 3582 }, { "epoch": 0.7436695724366957, "grad_norm": 0.805192030151996, "learning_rate": 9.527706366118746e-08, "loss": 1.5099, "step": 3583 }, { "epoch": 0.7438771274387713, "grad_norm": 1.01454666943058, "learning_rate": 9.519246334017993e-08, "loss": 1.6119, "step": 3584 }, { "epoch": 0.7440846824408468, "grad_norm": 0.7878363968586458, "learning_rate": 9.510791608402352e-08, "loss": 1.5805, "step": 3585 }, { "epoch": 0.7442922374429224, "grad_norm": 0.793042311915429, "learning_rate": 9.50234219286655e-08, "loss": 1.5931, "step": 3586 }, { "epoch": 0.744499792444998, "grad_norm": 0.6847976949602224, "learning_rate": 9.493898091003047e-08, "loss": 1.4501, "step": 3587 }, { "epoch": 0.7447073474470735, "grad_norm": 0.6412388438618505, "learning_rate": 9.485459306402071e-08, "loss": 1.5356, "step": 3588 }, { "epoch": 0.744914902449149, "grad_norm": 0.7173748124955761, "learning_rate": 9.477025842651545e-08, "loss": 1.4932, "step": 3589 }, { "epoch": 0.7451224574512246, "grad_norm": 1.001471632627153, "learning_rate": 9.468597703337168e-08, "loss": 1.554, "step": 3590 }, { "epoch": 0.7453300124533001, "grad_norm": 0.6349163338583317, "learning_rate": 9.460174892042359e-08, "loss": 1.5331, "step": 3591 }, { "epoch": 0.7455375674553757, "grad_norm": 0.7513834034170537, "learning_rate": 9.451757412348272e-08, "loss": 1.5469, "step": 3592 }, { "epoch": 0.7457451224574512, "grad_norm": 0.8243141395715089, "learning_rate": 9.443345267833797e-08, "loss": 1.5075, "step": 3593 }, { "epoch": 0.7459526774595268, "grad_norm": 0.7663924863171154, "learning_rate": 9.434938462075545e-08, "loss": 1.4336, "step": 3594 }, { "epoch": 0.7461602324616023, "grad_norm": 0.7371219286925582, "learning_rate": 9.426536998647886e-08, "loss": 1.5257, "step": 3595 }, { "epoch": 0.7463677874636778, "grad_norm": 0.6943179125038027, "learning_rate": 9.418140881122886e-08, "loss": 1.5305, "step": 3596 }, { "epoch": 0.7465753424657534, "grad_norm": 1.5726619247420668, "learning_rate": 9.409750113070357e-08, "loss": 1.4289, "step": 3597 }, { "epoch": 0.746782897467829, "grad_norm": 0.7556701318492306, "learning_rate": 9.401364698057818e-08, "loss": 1.4868, "step": 3598 }, { "epoch": 0.7469904524699045, "grad_norm": 1.1105610790603844, "learning_rate": 9.392984639650546e-08, "loss": 1.5944, "step": 3599 }, { "epoch": 0.7471980074719801, "grad_norm": 0.7832087145495985, "learning_rate": 9.3846099414115e-08, "loss": 1.5109, "step": 3600 }, { "epoch": 0.7474055624740557, "grad_norm": 0.6840864812485508, "learning_rate": 9.37624060690139e-08, "loss": 1.4812, "step": 3601 }, { "epoch": 0.7476131174761311, "grad_norm": 0.8426543652504688, "learning_rate": 9.367876639678635e-08, "loss": 1.4512, "step": 3602 }, { "epoch": 0.7478206724782067, "grad_norm": 1.7193620263692206, "learning_rate": 9.359518043299368e-08, "loss": 1.5339, "step": 3603 }, { "epoch": 0.7480282274802823, "grad_norm": 1.0330931124169942, "learning_rate": 9.351164821317436e-08, "loss": 1.5384, "step": 3604 }, { "epoch": 0.7482357824823578, "grad_norm": 0.8152406280891313, "learning_rate": 9.342816977284423e-08, "loss": 1.526, "step": 3605 }, { "epoch": 0.7484433374844334, "grad_norm": 0.6346991091453212, "learning_rate": 9.3344745147496e-08, "loss": 1.5611, "step": 3606 }, { "epoch": 0.748650892486509, "grad_norm": 0.7762644819740941, "learning_rate": 9.326137437259963e-08, "loss": 1.5273, "step": 3607 }, { "epoch": 0.7488584474885844, "grad_norm": 0.7579529789462294, "learning_rate": 9.317805748360216e-08, "loss": 1.4517, "step": 3608 }, { "epoch": 0.74906600249066, "grad_norm": 0.8055395691430277, "learning_rate": 9.309479451592766e-08, "loss": 1.6014, "step": 3609 }, { "epoch": 0.7492735574927356, "grad_norm": 0.6772453086448496, "learning_rate": 9.301158550497743e-08, "loss": 1.4379, "step": 3610 }, { "epoch": 0.7494811124948111, "grad_norm": 1.1633975361715683, "learning_rate": 9.292843048612973e-08, "loss": 1.4662, "step": 3611 }, { "epoch": 0.7496886674968867, "grad_norm": 0.7628559056287824, "learning_rate": 9.284532949473979e-08, "loss": 1.5149, "step": 3612 }, { "epoch": 0.7498962224989623, "grad_norm": 0.7102182469626219, "learning_rate": 9.276228256613996e-08, "loss": 1.4979, "step": 3613 }, { "epoch": 0.7501037775010377, "grad_norm": 0.7879041883352768, "learning_rate": 9.26792897356397e-08, "loss": 1.5298, "step": 3614 }, { "epoch": 0.7503113325031133, "grad_norm": 0.7259513279103693, "learning_rate": 9.259635103852517e-08, "loss": 1.5131, "step": 3615 }, { "epoch": 0.7505188875051889, "grad_norm": 0.7684661547124279, "learning_rate": 9.251346651005985e-08, "loss": 1.4974, "step": 3616 }, { "epoch": 0.7507264425072644, "grad_norm": 0.9703851415539535, "learning_rate": 9.243063618548402e-08, "loss": 1.5228, "step": 3617 }, { "epoch": 0.75093399750934, "grad_norm": 0.9346028824908276, "learning_rate": 9.23478601000149e-08, "loss": 1.54, "step": 3618 }, { "epoch": 0.7511415525114156, "grad_norm": 0.7843313760590046, "learning_rate": 9.226513828884662e-08, "loss": 1.5666, "step": 3619 }, { "epoch": 0.751349107513491, "grad_norm": 0.6549349648424905, "learning_rate": 9.218247078715045e-08, "loss": 1.6034, "step": 3620 }, { "epoch": 0.7515566625155666, "grad_norm": 1.294863237446233, "learning_rate": 9.209985763007435e-08, "loss": 1.4559, "step": 3621 }, { "epoch": 0.7517642175176422, "grad_norm": 0.6597904381426672, "learning_rate": 9.20172988527432e-08, "loss": 1.4973, "step": 3622 }, { "epoch": 0.7519717725197177, "grad_norm": 0.7819001676580447, "learning_rate": 9.193479449025885e-08, "loss": 1.5146, "step": 3623 }, { "epoch": 0.7521793275217933, "grad_norm": 0.9259703191820416, "learning_rate": 9.185234457769988e-08, "loss": 1.4776, "step": 3624 }, { "epoch": 0.7523868825238689, "grad_norm": 2.3432664924602506, "learning_rate": 9.176994915012192e-08, "loss": 1.5577, "step": 3625 }, { "epoch": 0.7525944375259443, "grad_norm": 0.9119218311963444, "learning_rate": 9.168760824255727e-08, "loss": 1.4676, "step": 3626 }, { "epoch": 0.7528019925280199, "grad_norm": 0.7482764664301839, "learning_rate": 9.160532189001508e-08, "loss": 1.5307, "step": 3627 }, { "epoch": 0.7530095475300955, "grad_norm": 1.1246070961082144, "learning_rate": 9.15230901274813e-08, "loss": 1.4735, "step": 3628 }, { "epoch": 0.753217102532171, "grad_norm": 1.0573383598738648, "learning_rate": 9.144091298991885e-08, "loss": 1.5765, "step": 3629 }, { "epoch": 0.7534246575342466, "grad_norm": 0.7302090915243562, "learning_rate": 9.135879051226703e-08, "loss": 1.555, "step": 3630 }, { "epoch": 0.7536322125363222, "grad_norm": 0.698951332245233, "learning_rate": 9.12767227294423e-08, "loss": 1.4702, "step": 3631 }, { "epoch": 0.7538397675383977, "grad_norm": 0.9210042144448171, "learning_rate": 9.119470967633767e-08, "loss": 1.5207, "step": 3632 }, { "epoch": 0.7540473225404732, "grad_norm": 0.7837425659381553, "learning_rate": 9.111275138782288e-08, "loss": 1.4977, "step": 3633 }, { "epoch": 0.7542548775425488, "grad_norm": 0.8845803645583383, "learning_rate": 9.103084789874439e-08, "loss": 1.5369, "step": 3634 }, { "epoch": 0.7544624325446243, "grad_norm": 0.7447126493612226, "learning_rate": 9.09489992439255e-08, "loss": 1.5491, "step": 3635 }, { "epoch": 0.7546699875466999, "grad_norm": 0.7301421253401057, "learning_rate": 9.086720545816603e-08, "loss": 1.3985, "step": 3636 }, { "epoch": 0.7548775425487754, "grad_norm": 0.8743236769031292, "learning_rate": 9.078546657624251e-08, "loss": 1.5653, "step": 3637 }, { "epoch": 0.755085097550851, "grad_norm": 0.9437770186127424, "learning_rate": 9.070378263290813e-08, "loss": 1.5162, "step": 3638 }, { "epoch": 0.7552926525529265, "grad_norm": 1.0029874017489346, "learning_rate": 9.062215366289272e-08, "loss": 1.4785, "step": 3639 }, { "epoch": 0.755500207555002, "grad_norm": 1.0329217893835239, "learning_rate": 9.054057970090291e-08, "loss": 1.4984, "step": 3640 }, { "epoch": 0.7557077625570776, "grad_norm": 0.9337780088755829, "learning_rate": 9.045906078162154e-08, "loss": 1.4553, "step": 3641 }, { "epoch": 0.7559153175591532, "grad_norm": 3.12311257996803, "learning_rate": 9.037759693970843e-08, "loss": 1.5756, "step": 3642 }, { "epoch": 0.7561228725612287, "grad_norm": 0.6561975771699147, "learning_rate": 9.029618820979987e-08, "loss": 1.5128, "step": 3643 }, { "epoch": 0.7563304275633043, "grad_norm": 1.265134904146197, "learning_rate": 9.02148346265086e-08, "loss": 1.547, "step": 3644 }, { "epoch": 0.7565379825653799, "grad_norm": 1.3010035821836963, "learning_rate": 9.013353622442403e-08, "loss": 1.4933, "step": 3645 }, { "epoch": 0.7567455375674553, "grad_norm": 2.9548535906281033, "learning_rate": 9.005229303811214e-08, "loss": 1.4702, "step": 3646 }, { "epoch": 0.7569530925695309, "grad_norm": 0.7062970208458317, "learning_rate": 8.997110510211532e-08, "loss": 1.4853, "step": 3647 }, { "epoch": 0.7571606475716065, "grad_norm": 0.7453569534830744, "learning_rate": 8.988997245095249e-08, "loss": 1.4645, "step": 3648 }, { "epoch": 0.757368202573682, "grad_norm": 0.9781340107487748, "learning_rate": 8.980889511911929e-08, "loss": 1.4859, "step": 3649 }, { "epoch": 0.7575757575757576, "grad_norm": 0.9825116589942944, "learning_rate": 8.972787314108736e-08, "loss": 1.4699, "step": 3650 }, { "epoch": 0.7577833125778332, "grad_norm": 0.6445189638333721, "learning_rate": 8.964690655130533e-08, "loss": 1.5362, "step": 3651 }, { "epoch": 0.7579908675799086, "grad_norm": 0.7373274396811409, "learning_rate": 8.956599538419791e-08, "loss": 1.4911, "step": 3652 }, { "epoch": 0.7581984225819842, "grad_norm": 1.1659684112711577, "learning_rate": 8.948513967416648e-08, "loss": 1.4624, "step": 3653 }, { "epoch": 0.7584059775840598, "grad_norm": 0.9417157911081553, "learning_rate": 8.940433945558858e-08, "loss": 1.5196, "step": 3654 }, { "epoch": 0.7586135325861353, "grad_norm": 0.7096003176473102, "learning_rate": 8.932359476281857e-08, "loss": 1.5523, "step": 3655 }, { "epoch": 0.7588210875882109, "grad_norm": 0.9762969946562743, "learning_rate": 8.924290563018669e-08, "loss": 1.5188, "step": 3656 }, { "epoch": 0.7590286425902865, "grad_norm": 0.9150895164904417, "learning_rate": 8.9162272092e-08, "loss": 1.5406, "step": 3657 }, { "epoch": 0.7592361975923619, "grad_norm": 2.9040983958213284, "learning_rate": 8.908169418254164e-08, "loss": 1.5876, "step": 3658 }, { "epoch": 0.7594437525944375, "grad_norm": 0.7344825206485066, "learning_rate": 8.900117193607128e-08, "loss": 1.504, "step": 3659 }, { "epoch": 0.7596513075965131, "grad_norm": 0.7751599626545403, "learning_rate": 8.89207053868247e-08, "loss": 1.5777, "step": 3660 }, { "epoch": 0.7598588625985886, "grad_norm": 0.6869096818130381, "learning_rate": 8.884029456901428e-08, "loss": 1.5168, "step": 3661 }, { "epoch": 0.7600664176006642, "grad_norm": 0.9347348822598524, "learning_rate": 8.875993951682855e-08, "loss": 1.5377, "step": 3662 }, { "epoch": 0.7602739726027398, "grad_norm": 0.91803327954358, "learning_rate": 8.867964026443223e-08, "loss": 1.5877, "step": 3663 }, { "epoch": 0.7604815276048152, "grad_norm": 0.6701587500166564, "learning_rate": 8.859939684596662e-08, "loss": 1.4773, "step": 3664 }, { "epoch": 0.7606890826068908, "grad_norm": 0.92745092959334, "learning_rate": 8.851920929554888e-08, "loss": 1.5237, "step": 3665 }, { "epoch": 0.7608966376089664, "grad_norm": 0.762215020846363, "learning_rate": 8.843907764727279e-08, "loss": 1.4637, "step": 3666 }, { "epoch": 0.7611041926110419, "grad_norm": 0.8446275863309926, "learning_rate": 8.835900193520813e-08, "loss": 1.5173, "step": 3667 }, { "epoch": 0.7613117476131175, "grad_norm": 0.6371087787526062, "learning_rate": 8.8278982193401e-08, "loss": 1.4613, "step": 3668 }, { "epoch": 0.7615193026151931, "grad_norm": 0.6543524756484076, "learning_rate": 8.819901845587356e-08, "loss": 1.4976, "step": 3669 }, { "epoch": 0.7617268576172685, "grad_norm": 1.1075388492482394, "learning_rate": 8.81191107566245e-08, "loss": 1.5791, "step": 3670 }, { "epoch": 0.7619344126193441, "grad_norm": 0.8167834355485803, "learning_rate": 8.803925912962817e-08, "loss": 1.5175, "step": 3671 }, { "epoch": 0.7621419676214197, "grad_norm": 0.9714348238066599, "learning_rate": 8.795946360883558e-08, "loss": 1.5938, "step": 3672 }, { "epoch": 0.7623495226234952, "grad_norm": 0.7166402771149134, "learning_rate": 8.787972422817357e-08, "loss": 1.4976, "step": 3673 }, { "epoch": 0.7625570776255708, "grad_norm": 0.8369066453950968, "learning_rate": 8.780004102154518e-08, "loss": 1.4956, "step": 3674 }, { "epoch": 0.7627646326276464, "grad_norm": 1.316099403142914, "learning_rate": 8.772041402282968e-08, "loss": 1.5624, "step": 3675 }, { "epoch": 0.7629721876297219, "grad_norm": 1.719596367736537, "learning_rate": 8.76408432658823e-08, "loss": 1.4942, "step": 3676 }, { "epoch": 0.7631797426317974, "grad_norm": 0.6515883407703427, "learning_rate": 8.756132878453446e-08, "loss": 1.4572, "step": 3677 }, { "epoch": 0.763387297633873, "grad_norm": 0.7904696550358193, "learning_rate": 8.748187061259352e-08, "loss": 1.5274, "step": 3678 }, { "epoch": 0.7635948526359485, "grad_norm": 0.711986518451615, "learning_rate": 8.740246878384313e-08, "loss": 1.5569, "step": 3679 }, { "epoch": 0.7638024076380241, "grad_norm": 0.7103109837342821, "learning_rate": 8.732312333204264e-08, "loss": 1.5171, "step": 3680 }, { "epoch": 0.7640099626400996, "grad_norm": 0.6564687552264464, "learning_rate": 8.724383429092786e-08, "loss": 1.5703, "step": 3681 }, { "epoch": 0.7642175176421752, "grad_norm": 0.7106015365302101, "learning_rate": 8.716460169421013e-08, "loss": 1.455, "step": 3682 }, { "epoch": 0.7644250726442507, "grad_norm": 0.6628386709799882, "learning_rate": 8.708542557557725e-08, "loss": 1.4875, "step": 3683 }, { "epoch": 0.7646326276463262, "grad_norm": 0.9369132019515705, "learning_rate": 8.700630596869274e-08, "loss": 1.5483, "step": 3684 }, { "epoch": 0.7648401826484018, "grad_norm": 0.9708001105081899, "learning_rate": 8.692724290719615e-08, "loss": 1.5552, "step": 3685 }, { "epoch": 0.7650477376504774, "grad_norm": 0.6799600387927957, "learning_rate": 8.684823642470294e-08, "loss": 1.464, "step": 3686 }, { "epoch": 0.7652552926525529, "grad_norm": 0.9400216422936265, "learning_rate": 8.676928655480467e-08, "loss": 1.4919, "step": 3687 }, { "epoch": 0.7654628476546285, "grad_norm": 0.6616351345272905, "learning_rate": 8.669039333106869e-08, "loss": 1.5028, "step": 3688 }, { "epoch": 0.765670402656704, "grad_norm": 1.0346354200029755, "learning_rate": 8.661155678703824e-08, "loss": 1.4399, "step": 3689 }, { "epoch": 0.7658779576587795, "grad_norm": 1.1564937823973798, "learning_rate": 8.653277695623269e-08, "loss": 1.5234, "step": 3690 }, { "epoch": 0.7660855126608551, "grad_norm": 0.9097324875657153, "learning_rate": 8.645405387214691e-08, "loss": 1.5455, "step": 3691 }, { "epoch": 0.7662930676629307, "grad_norm": 0.8511477824718381, "learning_rate": 8.637538756825204e-08, "loss": 1.5288, "step": 3692 }, { "epoch": 0.7665006226650062, "grad_norm": 0.7056324365479921, "learning_rate": 8.629677807799484e-08, "loss": 1.4495, "step": 3693 }, { "epoch": 0.7667081776670818, "grad_norm": 0.8962247940387371, "learning_rate": 8.6218225434798e-08, "loss": 1.4997, "step": 3694 }, { "epoch": 0.7669157326691574, "grad_norm": 0.9727264386207618, "learning_rate": 8.613972967205993e-08, "loss": 1.4703, "step": 3695 }, { "epoch": 0.7671232876712328, "grad_norm": 1.2402968049705936, "learning_rate": 8.606129082315514e-08, "loss": 1.4662, "step": 3696 }, { "epoch": 0.7673308426733084, "grad_norm": 0.9480454389682683, "learning_rate": 8.59829089214335e-08, "loss": 1.5176, "step": 3697 }, { "epoch": 0.767538397675384, "grad_norm": 0.7791763063083721, "learning_rate": 8.590458400022109e-08, "loss": 1.562, "step": 3698 }, { "epoch": 0.7677459526774595, "grad_norm": 0.8795455326700431, "learning_rate": 8.582631609281954e-08, "loss": 1.6003, "step": 3699 }, { "epoch": 0.7679535076795351, "grad_norm": 0.9303963120593105, "learning_rate": 8.574810523250622e-08, "loss": 1.4352, "step": 3700 }, { "epoch": 0.7681610626816107, "grad_norm": 0.745958652852297, "learning_rate": 8.56699514525344e-08, "loss": 1.5844, "step": 3701 }, { "epoch": 0.7683686176836861, "grad_norm": 1.5248799469735759, "learning_rate": 8.559185478613299e-08, "loss": 1.5652, "step": 3702 }, { "epoch": 0.7685761726857617, "grad_norm": 0.9504135251306799, "learning_rate": 8.551381526650658e-08, "loss": 1.5243, "step": 3703 }, { "epoch": 0.7687837276878373, "grad_norm": 0.6910282403052183, "learning_rate": 8.543583292683547e-08, "loss": 1.5383, "step": 3704 }, { "epoch": 0.7689912826899128, "grad_norm": 0.6499442373461638, "learning_rate": 8.535790780027582e-08, "loss": 1.5775, "step": 3705 }, { "epoch": 0.7691988376919884, "grad_norm": 0.9535545726573528, "learning_rate": 8.528003991995914e-08, "loss": 1.5118, "step": 3706 }, { "epoch": 0.769406392694064, "grad_norm": 0.9369225524492816, "learning_rate": 8.520222931899292e-08, "loss": 1.4968, "step": 3707 }, { "epoch": 0.7696139476961394, "grad_norm": 1.4228802776735283, "learning_rate": 8.512447603046011e-08, "loss": 1.5087, "step": 3708 }, { "epoch": 0.769821502698215, "grad_norm": 0.7752328550439567, "learning_rate": 8.504678008741936e-08, "loss": 1.4768, "step": 3709 }, { "epoch": 0.7700290577002906, "grad_norm": 1.0805202104476346, "learning_rate": 8.496914152290485e-08, "loss": 1.5587, "step": 3710 }, { "epoch": 0.7702366127023661, "grad_norm": 0.6949638840831297, "learning_rate": 8.489156036992655e-08, "loss": 1.4693, "step": 3711 }, { "epoch": 0.7704441677044417, "grad_norm": 1.4180052788818878, "learning_rate": 8.481403666146987e-08, "loss": 1.5181, "step": 3712 }, { "epoch": 0.7706517227065173, "grad_norm": 0.7642757731984444, "learning_rate": 8.47365704304958e-08, "loss": 1.4588, "step": 3713 }, { "epoch": 0.7708592777085927, "grad_norm": 2.261112460955011, "learning_rate": 8.465916170994094e-08, "loss": 1.5696, "step": 3714 }, { "epoch": 0.7710668327106683, "grad_norm": 0.9840937688868856, "learning_rate": 8.45818105327174e-08, "loss": 1.5552, "step": 3715 }, { "epoch": 0.7712743877127439, "grad_norm": 0.9044154791431672, "learning_rate": 8.45045169317129e-08, "loss": 1.4646, "step": 3716 }, { "epoch": 0.7714819427148194, "grad_norm": 0.7233508299839303, "learning_rate": 8.442728093979061e-08, "loss": 1.5773, "step": 3717 }, { "epoch": 0.771689497716895, "grad_norm": 0.6547539358787329, "learning_rate": 8.435010258978922e-08, "loss": 1.5136, "step": 3718 }, { "epoch": 0.7718970527189706, "grad_norm": 0.8478591806594814, "learning_rate": 8.427298191452285e-08, "loss": 1.5112, "step": 3719 }, { "epoch": 0.772104607721046, "grad_norm": 0.8263975586927169, "learning_rate": 8.419591894678134e-08, "loss": 1.5382, "step": 3720 }, { "epoch": 0.7723121627231216, "grad_norm": 0.8752860535749272, "learning_rate": 8.411891371932958e-08, "loss": 1.4869, "step": 3721 }, { "epoch": 0.7725197177251972, "grad_norm": 0.7368123606567933, "learning_rate": 8.404196626490831e-08, "loss": 1.5335, "step": 3722 }, { "epoch": 0.7727272727272727, "grad_norm": 2.2823409655822764, "learning_rate": 8.396507661623354e-08, "loss": 1.5371, "step": 3723 }, { "epoch": 0.7729348277293483, "grad_norm": 0.8785549336127969, "learning_rate": 8.388824480599664e-08, "loss": 1.5247, "step": 3724 }, { "epoch": 0.7731423827314239, "grad_norm": 0.6689860470059837, "learning_rate": 8.381147086686444e-08, "loss": 1.4838, "step": 3725 }, { "epoch": 0.7733499377334994, "grad_norm": 0.6419935459006911, "learning_rate": 8.373475483147929e-08, "loss": 1.5401, "step": 3726 }, { "epoch": 0.773557492735575, "grad_norm": 0.6435875932662984, "learning_rate": 8.365809673245872e-08, "loss": 1.458, "step": 3727 }, { "epoch": 0.7737650477376504, "grad_norm": 1.094980297827521, "learning_rate": 8.358149660239578e-08, "loss": 1.512, "step": 3728 }, { "epoch": 0.773972602739726, "grad_norm": 1.279880254885234, "learning_rate": 8.350495447385878e-08, "loss": 1.5499, "step": 3729 }, { "epoch": 0.7741801577418016, "grad_norm": 0.7391367560584082, "learning_rate": 8.342847037939133e-08, "loss": 1.5163, "step": 3730 }, { "epoch": 0.7743877127438771, "grad_norm": 0.8069549910756837, "learning_rate": 8.335204435151262e-08, "loss": 1.4291, "step": 3731 }, { "epoch": 0.7745952677459527, "grad_norm": 1.3074851644422694, "learning_rate": 8.327567642271676e-08, "loss": 1.5477, "step": 3732 }, { "epoch": 0.7748028227480283, "grad_norm": 1.4786036555935946, "learning_rate": 8.319936662547349e-08, "loss": 1.5801, "step": 3733 }, { "epoch": 0.7750103777501037, "grad_norm": 0.8115404428015045, "learning_rate": 8.31231149922277e-08, "loss": 1.498, "step": 3734 }, { "epoch": 0.7752179327521793, "grad_norm": 0.7048538488265006, "learning_rate": 8.304692155539952e-08, "loss": 1.486, "step": 3735 }, { "epoch": 0.7754254877542549, "grad_norm": 1.3901521643293466, "learning_rate": 8.297078634738436e-08, "loss": 1.5362, "step": 3736 }, { "epoch": 0.7756330427563304, "grad_norm": 0.7722605542096388, "learning_rate": 8.289470940055297e-08, "loss": 1.5278, "step": 3737 }, { "epoch": 0.775840597758406, "grad_norm": 0.9071026501168962, "learning_rate": 8.28186907472512e-08, "loss": 1.5438, "step": 3738 }, { "epoch": 0.7760481527604816, "grad_norm": 0.7428620731943374, "learning_rate": 8.27427304198002e-08, "loss": 1.489, "step": 3739 }, { "epoch": 0.776255707762557, "grad_norm": 0.6993922725125539, "learning_rate": 8.266682845049621e-08, "loss": 1.5453, "step": 3740 }, { "epoch": 0.7764632627646326, "grad_norm": 0.8349717976196509, "learning_rate": 8.259098487161076e-08, "loss": 1.5133, "step": 3741 }, { "epoch": 0.7766708177667082, "grad_norm": 0.6512627588323215, "learning_rate": 8.251519971539057e-08, "loss": 1.5113, "step": 3742 }, { "epoch": 0.7768783727687837, "grad_norm": 0.7237646398392438, "learning_rate": 8.243947301405745e-08, "loss": 1.5484, "step": 3743 }, { "epoch": 0.7770859277708593, "grad_norm": 0.6834301678937142, "learning_rate": 8.236380479980838e-08, "loss": 1.5154, "step": 3744 }, { "epoch": 0.7772934827729349, "grad_norm": 2.676292870574534, "learning_rate": 8.228819510481544e-08, "loss": 1.4527, "step": 3745 }, { "epoch": 0.7775010377750103, "grad_norm": 0.7187387132947246, "learning_rate": 8.221264396122598e-08, "loss": 1.5916, "step": 3746 }, { "epoch": 0.7777085927770859, "grad_norm": 0.9796527956740613, "learning_rate": 8.213715140116217e-08, "loss": 1.5333, "step": 3747 }, { "epoch": 0.7779161477791615, "grad_norm": 0.7220937755834875, "learning_rate": 8.20617174567216e-08, "loss": 1.5335, "step": 3748 }, { "epoch": 0.778123702781237, "grad_norm": 0.7611049438253926, "learning_rate": 8.198634215997669e-08, "loss": 1.5446, "step": 3749 }, { "epoch": 0.7783312577833126, "grad_norm": 0.7040971316583947, "learning_rate": 8.191102554297505e-08, "loss": 1.5059, "step": 3750 }, { "epoch": 0.7785388127853882, "grad_norm": 0.8601770693392301, "learning_rate": 8.183576763773925e-08, "loss": 1.4694, "step": 3751 }, { "epoch": 0.7787463677874636, "grad_norm": 0.8470768345754324, "learning_rate": 8.176056847626705e-08, "loss": 1.5573, "step": 3752 }, { "epoch": 0.7789539227895392, "grad_norm": 0.7332075382870787, "learning_rate": 8.168542809053108e-08, "loss": 1.5627, "step": 3753 }, { "epoch": 0.7791614777916148, "grad_norm": 0.639110979242346, "learning_rate": 8.161034651247895e-08, "loss": 1.4819, "step": 3754 }, { "epoch": 0.7793690327936903, "grad_norm": 0.7283092367073781, "learning_rate": 8.15353237740336e-08, "loss": 1.5144, "step": 3755 }, { "epoch": 0.7795765877957659, "grad_norm": 0.895981108138558, "learning_rate": 8.146035990709246e-08, "loss": 1.5767, "step": 3756 }, { "epoch": 0.7797841427978415, "grad_norm": 0.6467164460809052, "learning_rate": 8.138545494352828e-08, "loss": 1.5692, "step": 3757 }, { "epoch": 0.779991697799917, "grad_norm": 0.7404725669024991, "learning_rate": 8.131060891518869e-08, "loss": 1.4578, "step": 3758 }, { "epoch": 0.7801992528019925, "grad_norm": 0.6595972029259889, "learning_rate": 8.123582185389616e-08, "loss": 1.5813, "step": 3759 }, { "epoch": 0.7804068078040681, "grad_norm": 0.7221318755264007, "learning_rate": 8.116109379144817e-08, "loss": 1.5387, "step": 3760 }, { "epoch": 0.7806143628061436, "grad_norm": 0.6215279182105392, "learning_rate": 8.108642475961725e-08, "loss": 1.5125, "step": 3761 }, { "epoch": 0.7808219178082192, "grad_norm": 0.67488476820223, "learning_rate": 8.101181479015043e-08, "loss": 1.4738, "step": 3762 }, { "epoch": 0.7810294728102948, "grad_norm": 1.2130955024336798, "learning_rate": 8.093726391477011e-08, "loss": 1.4904, "step": 3763 }, { "epoch": 0.7812370278123703, "grad_norm": 0.7563625260734779, "learning_rate": 8.086277216517327e-08, "loss": 1.5131, "step": 3764 }, { "epoch": 0.7814445828144458, "grad_norm": 1.5034617378440747, "learning_rate": 8.078833957303184e-08, "loss": 1.4702, "step": 3765 }, { "epoch": 0.7816521378165214, "grad_norm": 0.6873898163346628, "learning_rate": 8.071396616999248e-08, "loss": 1.4982, "step": 3766 }, { "epoch": 0.7818596928185969, "grad_norm": 1.4986158607595441, "learning_rate": 8.063965198767692e-08, "loss": 1.4948, "step": 3767 }, { "epoch": 0.7820672478206725, "grad_norm": 0.726055707165679, "learning_rate": 8.056539705768155e-08, "loss": 1.5342, "step": 3768 }, { "epoch": 0.7822748028227481, "grad_norm": 0.8321359651169101, "learning_rate": 8.049120141157752e-08, "loss": 1.5281, "step": 3769 }, { "epoch": 0.7824823578248236, "grad_norm": 0.669062632940801, "learning_rate": 8.041706508091102e-08, "loss": 1.5371, "step": 3770 }, { "epoch": 0.7826899128268991, "grad_norm": 0.7072107789837956, "learning_rate": 8.034298809720259e-08, "loss": 1.4802, "step": 3771 }, { "epoch": 0.7828974678289746, "grad_norm": 0.9694839930411056, "learning_rate": 8.026897049194805e-08, "loss": 1.5827, "step": 3772 }, { "epoch": 0.7831050228310502, "grad_norm": 2.2323015066991014, "learning_rate": 8.019501229661753e-08, "loss": 1.5571, "step": 3773 }, { "epoch": 0.7833125778331258, "grad_norm": 0.8829954343640335, "learning_rate": 8.01211135426562e-08, "loss": 1.513, "step": 3774 }, { "epoch": 0.7835201328352013, "grad_norm": 0.762528386982982, "learning_rate": 8.004727426148384e-08, "loss": 1.427, "step": 3775 }, { "epoch": 0.7837276878372769, "grad_norm": 1.5006911856192857, "learning_rate": 7.997349448449491e-08, "loss": 1.5287, "step": 3776 }, { "epoch": 0.7839352428393525, "grad_norm": 0.7332664802109321, "learning_rate": 7.989977424305859e-08, "loss": 1.4988, "step": 3777 }, { "epoch": 0.7841427978414279, "grad_norm": 0.6956308212824712, "learning_rate": 7.982611356851887e-08, "loss": 1.596, "step": 3778 }, { "epoch": 0.7843503528435035, "grad_norm": 1.3954923716870664, "learning_rate": 7.975251249219424e-08, "loss": 1.4597, "step": 3779 }, { "epoch": 0.7845579078455791, "grad_norm": 0.828086102970035, "learning_rate": 7.967897104537788e-08, "loss": 1.5027, "step": 3780 }, { "epoch": 0.7847654628476546, "grad_norm": 0.8836328427525264, "learning_rate": 7.960548925933786e-08, "loss": 1.5138, "step": 3781 }, { "epoch": 0.7849730178497302, "grad_norm": 1.0363897182174382, "learning_rate": 7.953206716531639e-08, "loss": 1.5149, "step": 3782 }, { "epoch": 0.7851805728518058, "grad_norm": 0.7921597815198995, "learning_rate": 7.945870479453084e-08, "loss": 1.5505, "step": 3783 }, { "epoch": 0.7853881278538812, "grad_norm": 2.0687899270821166, "learning_rate": 7.938540217817285e-08, "loss": 1.5512, "step": 3784 }, { "epoch": 0.7855956828559568, "grad_norm": 0.7272525902992321, "learning_rate": 7.931215934740873e-08, "loss": 1.5294, "step": 3785 }, { "epoch": 0.7858032378580324, "grad_norm": 0.8660182828580697, "learning_rate": 7.923897633337939e-08, "loss": 1.4758, "step": 3786 }, { "epoch": 0.7860107928601079, "grad_norm": 1.2755352483770819, "learning_rate": 7.916585316720039e-08, "loss": 1.4728, "step": 3787 }, { "epoch": 0.7862183478621835, "grad_norm": 0.8623628404181108, "learning_rate": 7.909278987996157e-08, "loss": 1.5123, "step": 3788 }, { "epoch": 0.7864259028642591, "grad_norm": 0.7458981809530579, "learning_rate": 7.901978650272772e-08, "loss": 1.5909, "step": 3789 }, { "epoch": 0.7866334578663345, "grad_norm": 0.6640707473838863, "learning_rate": 7.894684306653781e-08, "loss": 1.5525, "step": 3790 }, { "epoch": 0.7868410128684101, "grad_norm": 0.8683760806748698, "learning_rate": 7.887395960240548e-08, "loss": 1.5188, "step": 3791 }, { "epoch": 0.7870485678704857, "grad_norm": 0.9084728902160711, "learning_rate": 7.88011361413188e-08, "loss": 1.4716, "step": 3792 }, { "epoch": 0.7872561228725612, "grad_norm": 0.8931162535612022, "learning_rate": 7.872837271424044e-08, "loss": 1.5544, "step": 3793 }, { "epoch": 0.7874636778746368, "grad_norm": 0.6460004877130135, "learning_rate": 7.86556693521075e-08, "loss": 1.4741, "step": 3794 }, { "epoch": 0.7876712328767124, "grad_norm": 0.7034857915995364, "learning_rate": 7.858302608583138e-08, "loss": 1.4803, "step": 3795 }, { "epoch": 0.7878787878787878, "grad_norm": 0.6623982003852003, "learning_rate": 7.85104429462983e-08, "loss": 1.5303, "step": 3796 }, { "epoch": 0.7880863428808634, "grad_norm": 1.1133133388108711, "learning_rate": 7.843791996436841e-08, "loss": 1.5157, "step": 3797 }, { "epoch": 0.788293897882939, "grad_norm": 1.1949157436516982, "learning_rate": 7.836545717087675e-08, "loss": 1.5005, "step": 3798 }, { "epoch": 0.7885014528850145, "grad_norm": 1.3589645995798711, "learning_rate": 7.829305459663253e-08, "loss": 1.4688, "step": 3799 }, { "epoch": 0.7887090078870901, "grad_norm": 1.1256971724652678, "learning_rate": 7.82207122724194e-08, "loss": 1.5563, "step": 3800 }, { "epoch": 0.7889165628891657, "grad_norm": 0.6952310962030432, "learning_rate": 7.814843022899531e-08, "loss": 1.5187, "step": 3801 }, { "epoch": 0.7891241178912412, "grad_norm": 0.9310453197307528, "learning_rate": 7.807620849709286e-08, "loss": 1.5251, "step": 3802 }, { "epoch": 0.7893316728933167, "grad_norm": 1.000908443559828, "learning_rate": 7.800404710741857e-08, "loss": 1.5833, "step": 3803 }, { "epoch": 0.7895392278953923, "grad_norm": 0.8475468738785007, "learning_rate": 7.793194609065373e-08, "loss": 1.4907, "step": 3804 }, { "epoch": 0.7897467828974678, "grad_norm": 0.7078086153324371, "learning_rate": 7.785990547745374e-08, "loss": 1.4704, "step": 3805 }, { "epoch": 0.7899543378995434, "grad_norm": 0.7119022791600783, "learning_rate": 7.778792529844826e-08, "loss": 1.5093, "step": 3806 }, { "epoch": 0.790161892901619, "grad_norm": 0.6738375791146888, "learning_rate": 7.771600558424152e-08, "loss": 1.5654, "step": 3807 }, { "epoch": 0.7903694479036945, "grad_norm": 1.1868599223865224, "learning_rate": 7.764414636541175e-08, "loss": 1.4883, "step": 3808 }, { "epoch": 0.79057700290577, "grad_norm": 0.6224598939070389, "learning_rate": 7.757234767251159e-08, "loss": 1.5634, "step": 3809 }, { "epoch": 0.7907845579078456, "grad_norm": 1.4109504168100024, "learning_rate": 7.750060953606795e-08, "loss": 1.5388, "step": 3810 }, { "epoch": 0.7909921129099211, "grad_norm": 0.750672852855631, "learning_rate": 7.742893198658207e-08, "loss": 1.5594, "step": 3811 }, { "epoch": 0.7911996679119967, "grad_norm": 0.9601149668161083, "learning_rate": 7.735731505452916e-08, "loss": 1.5171, "step": 3812 }, { "epoch": 0.7914072229140723, "grad_norm": 0.6934940716436475, "learning_rate": 7.7285758770359e-08, "loss": 1.5707, "step": 3813 }, { "epoch": 0.7916147779161478, "grad_norm": 0.9586831429557638, "learning_rate": 7.721426316449538e-08, "loss": 1.503, "step": 3814 }, { "epoch": 0.7918223329182233, "grad_norm": 0.727117044470924, "learning_rate": 7.714282826733627e-08, "loss": 1.573, "step": 3815 }, { "epoch": 0.7920298879202988, "grad_norm": 0.7258883514207821, "learning_rate": 7.707145410925397e-08, "loss": 1.5284, "step": 3816 }, { "epoch": 0.7922374429223744, "grad_norm": 12.323516611009495, "learning_rate": 7.70001407205948e-08, "loss": 1.4886, "step": 3817 }, { "epoch": 0.79244499792445, "grad_norm": 0.6711876057145292, "learning_rate": 7.692888813167942e-08, "loss": 1.5204, "step": 3818 }, { "epoch": 0.7926525529265255, "grad_norm": 1.0533549166405607, "learning_rate": 7.685769637280246e-08, "loss": 1.4556, "step": 3819 }, { "epoch": 0.7928601079286011, "grad_norm": 0.972836022817293, "learning_rate": 7.678656547423282e-08, "loss": 1.59, "step": 3820 }, { "epoch": 0.7930676629306767, "grad_norm": 0.767392440444129, "learning_rate": 7.671549546621337e-08, "loss": 1.4897, "step": 3821 }, { "epoch": 0.7932752179327521, "grad_norm": 0.7178476520542515, "learning_rate": 7.664448637896135e-08, "loss": 1.4294, "step": 3822 }, { "epoch": 0.7934827729348277, "grad_norm": 0.796196181175966, "learning_rate": 7.657353824266777e-08, "loss": 1.5647, "step": 3823 }, { "epoch": 0.7936903279369033, "grad_norm": 0.8611464441889768, "learning_rate": 7.6502651087498e-08, "loss": 1.519, "step": 3824 }, { "epoch": 0.7938978829389788, "grad_norm": 0.788630629261394, "learning_rate": 7.643182494359137e-08, "loss": 1.6052, "step": 3825 }, { "epoch": 0.7941054379410544, "grad_norm": 0.6879220984840951, "learning_rate": 7.636105984106125e-08, "loss": 1.4583, "step": 3826 }, { "epoch": 0.79431299294313, "grad_norm": 0.7485399682769517, "learning_rate": 7.629035580999504e-08, "loss": 1.5529, "step": 3827 }, { "epoch": 0.7945205479452054, "grad_norm": 0.9942223646125695, "learning_rate": 7.621971288045436e-08, "loss": 1.5415, "step": 3828 }, { "epoch": 0.794728102947281, "grad_norm": 0.6681319118955458, "learning_rate": 7.614913108247451e-08, "loss": 1.4598, "step": 3829 }, { "epoch": 0.7949356579493566, "grad_norm": 1.1967036992304174, "learning_rate": 7.607861044606516e-08, "loss": 1.467, "step": 3830 }, { "epoch": 0.7951432129514321, "grad_norm": 1.6029952828306122, "learning_rate": 7.600815100120977e-08, "loss": 1.489, "step": 3831 }, { "epoch": 0.7953507679535077, "grad_norm": 0.7758285658518153, "learning_rate": 7.593775277786572e-08, "loss": 1.5299, "step": 3832 }, { "epoch": 0.7955583229555833, "grad_norm": 1.236228425852546, "learning_rate": 7.586741580596464e-08, "loss": 1.4974, "step": 3833 }, { "epoch": 0.7957658779576587, "grad_norm": 0.8086451800153249, "learning_rate": 7.579714011541183e-08, "loss": 1.5466, "step": 3834 }, { "epoch": 0.7959734329597343, "grad_norm": 1.9761647944911873, "learning_rate": 7.572692573608667e-08, "loss": 1.5511, "step": 3835 }, { "epoch": 0.7961809879618099, "grad_norm": 0.7472347685052534, "learning_rate": 7.565677269784243e-08, "loss": 1.499, "step": 3836 }, { "epoch": 0.7963885429638854, "grad_norm": 0.7078437594225262, "learning_rate": 7.558668103050643e-08, "loss": 1.4638, "step": 3837 }, { "epoch": 0.796596097965961, "grad_norm": 0.671621701620675, "learning_rate": 7.55166507638796e-08, "loss": 1.4281, "step": 3838 }, { "epoch": 0.7968036529680366, "grad_norm": 0.6390390284844484, "learning_rate": 7.544668192773712e-08, "loss": 1.5204, "step": 3839 }, { "epoch": 0.797011207970112, "grad_norm": 1.104077219766796, "learning_rate": 7.53767745518278e-08, "loss": 1.5223, "step": 3840 }, { "epoch": 0.7972187629721876, "grad_norm": 0.9119077366866154, "learning_rate": 7.53069286658744e-08, "loss": 1.4567, "step": 3841 }, { "epoch": 0.7974263179742632, "grad_norm": 0.9714383685205503, "learning_rate": 7.523714429957351e-08, "loss": 1.5393, "step": 3842 }, { "epoch": 0.7976338729763387, "grad_norm": 0.8706994516262466, "learning_rate": 7.516742148259568e-08, "loss": 1.52, "step": 3843 }, { "epoch": 0.7978414279784143, "grad_norm": 0.8319415582822318, "learning_rate": 7.509776024458514e-08, "loss": 1.5033, "step": 3844 }, { "epoch": 0.7980489829804899, "grad_norm": 0.6780742323181039, "learning_rate": 7.502816061516002e-08, "loss": 1.4795, "step": 3845 }, { "epoch": 0.7982565379825654, "grad_norm": 0.8182092940025123, "learning_rate": 7.495862262391222e-08, "loss": 1.4866, "step": 3846 }, { "epoch": 0.7984640929846409, "grad_norm": 0.7862229344316162, "learning_rate": 7.488914630040737e-08, "loss": 1.4307, "step": 3847 }, { "epoch": 0.7986716479867165, "grad_norm": 1.1802221959396697, "learning_rate": 7.481973167418512e-08, "loss": 1.6294, "step": 3848 }, { "epoch": 0.798879202988792, "grad_norm": 0.803247594134049, "learning_rate": 7.475037877475863e-08, "loss": 1.5033, "step": 3849 }, { "epoch": 0.7990867579908676, "grad_norm": 1.510249894192323, "learning_rate": 7.468108763161495e-08, "loss": 1.5315, "step": 3850 }, { "epoch": 0.7992943129929432, "grad_norm": 0.6407938632163084, "learning_rate": 7.461185827421475e-08, "loss": 1.4926, "step": 3851 }, { "epoch": 0.7995018679950187, "grad_norm": 0.6622262503774939, "learning_rate": 7.454269073199267e-08, "loss": 1.4781, "step": 3852 }, { "epoch": 0.7997094229970942, "grad_norm": 1.0209070709245331, "learning_rate": 7.447358503435673e-08, "loss": 1.5373, "step": 3853 }, { "epoch": 0.7999169779991698, "grad_norm": 0.6329284519458666, "learning_rate": 7.440454121068895e-08, "loss": 1.5231, "step": 3854 }, { "epoch": 0.8001245330012453, "grad_norm": 0.9136660302900956, "learning_rate": 7.433555929034493e-08, "loss": 1.5606, "step": 3855 }, { "epoch": 0.8003320880033209, "grad_norm": 0.6295169325286589, "learning_rate": 7.426663930265394e-08, "loss": 1.5154, "step": 3856 }, { "epoch": 0.8005396430053965, "grad_norm": 1.1273239063433547, "learning_rate": 7.419778127691885e-08, "loss": 1.5534, "step": 3857 }, { "epoch": 0.800747198007472, "grad_norm": 0.8445575302503551, "learning_rate": 7.41289852424164e-08, "loss": 1.4837, "step": 3858 }, { "epoch": 0.8009547530095475, "grad_norm": 1.2778901830517406, "learning_rate": 7.406025122839674e-08, "loss": 1.5638, "step": 3859 }, { "epoch": 0.801162308011623, "grad_norm": 0.7747781489593895, "learning_rate": 7.399157926408379e-08, "loss": 1.4585, "step": 3860 }, { "epoch": 0.8013698630136986, "grad_norm": 0.9269615278458849, "learning_rate": 7.392296937867504e-08, "loss": 1.5848, "step": 3861 }, { "epoch": 0.8015774180157742, "grad_norm": 0.6722431753258891, "learning_rate": 7.385442160134154e-08, "loss": 1.538, "step": 3862 }, { "epoch": 0.8017849730178497, "grad_norm": 0.8534815699612766, "learning_rate": 7.37859359612281e-08, "loss": 1.5587, "step": 3863 }, { "epoch": 0.8019925280199253, "grad_norm": 0.7334316412176408, "learning_rate": 7.371751248745287e-08, "loss": 1.5953, "step": 3864 }, { "epoch": 0.8022000830220009, "grad_norm": 0.7292637896641967, "learning_rate": 7.364915120910777e-08, "loss": 1.6417, "step": 3865 }, { "epoch": 0.8024076380240763, "grad_norm": 0.7643331973665636, "learning_rate": 7.35808521552582e-08, "loss": 1.5008, "step": 3866 }, { "epoch": 0.8026151930261519, "grad_norm": 0.9534567074707053, "learning_rate": 7.351261535494309e-08, "loss": 1.5734, "step": 3867 }, { "epoch": 0.8028227480282275, "grad_norm": 0.8442508637811588, "learning_rate": 7.344444083717483e-08, "loss": 1.4857, "step": 3868 }, { "epoch": 0.803030303030303, "grad_norm": 1.035905981812537, "learning_rate": 7.337632863093956e-08, "loss": 1.4805, "step": 3869 }, { "epoch": 0.8032378580323786, "grad_norm": 0.7324116497096257, "learning_rate": 7.330827876519674e-08, "loss": 1.4544, "step": 3870 }, { "epoch": 0.8034454130344542, "grad_norm": 0.9948183654442785, "learning_rate": 7.324029126887934e-08, "loss": 1.5091, "step": 3871 }, { "epoch": 0.8036529680365296, "grad_norm": 0.6639923825482803, "learning_rate": 7.317236617089384e-08, "loss": 1.445, "step": 3872 }, { "epoch": 0.8038605230386052, "grad_norm": 0.7544995742017733, "learning_rate": 7.310450350012014e-08, "loss": 1.4993, "step": 3873 }, { "epoch": 0.8040680780406808, "grad_norm": 0.8374010144518952, "learning_rate": 7.303670328541174e-08, "loss": 1.4786, "step": 3874 }, { "epoch": 0.8042756330427563, "grad_norm": 0.7765825548957686, "learning_rate": 7.296896555559545e-08, "loss": 1.5372, "step": 3875 }, { "epoch": 0.8044831880448319, "grad_norm": 0.8292215954590633, "learning_rate": 7.290129033947157e-08, "loss": 1.5174, "step": 3876 }, { "epoch": 0.8046907430469075, "grad_norm": 0.6371192683135943, "learning_rate": 7.283367766581374e-08, "loss": 1.5826, "step": 3877 }, { "epoch": 0.8048982980489829, "grad_norm": 6.161995203666757, "learning_rate": 7.27661275633692e-08, "loss": 1.477, "step": 3878 }, { "epoch": 0.8051058530510585, "grad_norm": 0.6754767166600503, "learning_rate": 7.269864006085828e-08, "loss": 1.4497, "step": 3879 }, { "epoch": 0.8053134080531341, "grad_norm": 0.7662155229934128, "learning_rate": 7.263121518697504e-08, "loss": 1.5367, "step": 3880 }, { "epoch": 0.8055209630552096, "grad_norm": 0.6607989901290842, "learning_rate": 7.256385297038669e-08, "loss": 1.4935, "step": 3881 }, { "epoch": 0.8057285180572852, "grad_norm": 0.9536220755827555, "learning_rate": 7.249655343973384e-08, "loss": 1.5654, "step": 3882 }, { "epoch": 0.8059360730593608, "grad_norm": 0.750685502810434, "learning_rate": 7.242931662363043e-08, "loss": 1.5163, "step": 3883 }, { "epoch": 0.8061436280614362, "grad_norm": 2.5670962030730626, "learning_rate": 7.236214255066387e-08, "loss": 1.4778, "step": 3884 }, { "epoch": 0.8063511830635118, "grad_norm": 0.9602932267573792, "learning_rate": 7.229503124939474e-08, "loss": 1.5177, "step": 3885 }, { "epoch": 0.8065587380655874, "grad_norm": 1.4307181956918524, "learning_rate": 7.222798274835691e-08, "loss": 1.4839, "step": 3886 }, { "epoch": 0.8067662930676629, "grad_norm": 1.5139680655817953, "learning_rate": 7.21609970760578e-08, "loss": 1.5649, "step": 3887 }, { "epoch": 0.8069738480697385, "grad_norm": 0.7507957692202775, "learning_rate": 7.209407426097771e-08, "loss": 1.5233, "step": 3888 }, { "epoch": 0.8071814030718141, "grad_norm": 0.7970275094463274, "learning_rate": 7.202721433157065e-08, "loss": 1.4786, "step": 3889 }, { "epoch": 0.8073889580738896, "grad_norm": 1.5403777399492706, "learning_rate": 7.196041731626357e-08, "loss": 1.4664, "step": 3890 }, { "epoch": 0.8075965130759651, "grad_norm": 0.7170626884402947, "learning_rate": 7.189368324345684e-08, "loss": 1.4986, "step": 3891 }, { "epoch": 0.8078040680780407, "grad_norm": 0.8497841324387797, "learning_rate": 7.182701214152393e-08, "loss": 1.4893, "step": 3892 }, { "epoch": 0.8080116230801162, "grad_norm": 1.7606896202113627, "learning_rate": 7.17604040388118e-08, "loss": 1.5014, "step": 3893 }, { "epoch": 0.8082191780821918, "grad_norm": 2.380417792949077, "learning_rate": 7.169385896364024e-08, "loss": 1.6079, "step": 3894 }, { "epoch": 0.8084267330842674, "grad_norm": 1.2069511101542256, "learning_rate": 7.162737694430258e-08, "loss": 1.4392, "step": 3895 }, { "epoch": 0.8086342880863429, "grad_norm": 1.2657653724173512, "learning_rate": 7.156095800906519e-08, "loss": 1.5376, "step": 3896 }, { "epoch": 0.8088418430884184, "grad_norm": 0.881473986770486, "learning_rate": 7.149460218616762e-08, "loss": 1.5203, "step": 3897 }, { "epoch": 0.809049398090494, "grad_norm": 0.939380879352767, "learning_rate": 7.142830950382255e-08, "loss": 1.4858, "step": 3898 }, { "epoch": 0.8092569530925695, "grad_norm": 0.6894016139318221, "learning_rate": 7.136207999021598e-08, "loss": 1.5571, "step": 3899 }, { "epoch": 0.8094645080946451, "grad_norm": 0.9955442421253108, "learning_rate": 7.129591367350687e-08, "loss": 1.605, "step": 3900 }, { "epoch": 0.8096720630967207, "grad_norm": 0.9466330902587341, "learning_rate": 7.122981058182738e-08, "loss": 1.5101, "step": 3901 }, { "epoch": 0.8098796180987962, "grad_norm": 0.9201279792541084, "learning_rate": 7.116377074328286e-08, "loss": 1.4535, "step": 3902 }, { "epoch": 0.8100871731008717, "grad_norm": 0.8987459809397822, "learning_rate": 7.109779418595156e-08, "loss": 1.5099, "step": 3903 }, { "epoch": 0.8102947281029472, "grad_norm": 0.8394512381360564, "learning_rate": 7.103188093788514e-08, "loss": 1.5202, "step": 3904 }, { "epoch": 0.8105022831050228, "grad_norm": 1.0019271931051434, "learning_rate": 7.096603102710801e-08, "loss": 1.4988, "step": 3905 }, { "epoch": 0.8107098381070984, "grad_norm": 0.8486595373804554, "learning_rate": 7.090024448161787e-08, "loss": 1.4988, "step": 3906 }, { "epoch": 0.8109173931091739, "grad_norm": 0.627107944819042, "learning_rate": 7.08345213293854e-08, "loss": 1.4776, "step": 3907 }, { "epoch": 0.8111249481112495, "grad_norm": 0.7180127528183975, "learning_rate": 7.076886159835437e-08, "loss": 1.4889, "step": 3908 }, { "epoch": 0.811332503113325, "grad_norm": 0.8250597194821029, "learning_rate": 7.070326531644147e-08, "loss": 1.5134, "step": 3909 }, { "epoch": 0.8115400581154005, "grad_norm": 0.7623502901965159, "learning_rate": 7.063773251153657e-08, "loss": 1.5209, "step": 3910 }, { "epoch": 0.8117476131174761, "grad_norm": 0.8853965008680758, "learning_rate": 7.057226321150249e-08, "loss": 1.4935, "step": 3911 }, { "epoch": 0.8119551681195517, "grad_norm": 0.658447897728513, "learning_rate": 7.050685744417497e-08, "loss": 1.5537, "step": 3912 }, { "epoch": 0.8121627231216272, "grad_norm": 0.772900224816853, "learning_rate": 7.044151523736295e-08, "loss": 1.5726, "step": 3913 }, { "epoch": 0.8123702781237028, "grad_norm": 0.7762644944873829, "learning_rate": 7.037623661884798e-08, "loss": 1.5239, "step": 3914 }, { "epoch": 0.8125778331257784, "grad_norm": 1.7418646295764761, "learning_rate": 7.031102161638496e-08, "loss": 1.5462, "step": 3915 }, { "epoch": 0.8127853881278538, "grad_norm": 1.037776632386954, "learning_rate": 7.024587025770154e-08, "loss": 1.4912, "step": 3916 }, { "epoch": 0.8129929431299294, "grad_norm": 0.8405495028598344, "learning_rate": 7.018078257049836e-08, "loss": 1.503, "step": 3917 }, { "epoch": 0.813200498132005, "grad_norm": 1.2601807883092309, "learning_rate": 7.011575858244889e-08, "loss": 1.5968, "step": 3918 }, { "epoch": 0.8134080531340805, "grad_norm": 0.8631842028497757, "learning_rate": 7.005079832119977e-08, "loss": 1.5123, "step": 3919 }, { "epoch": 0.8136156081361561, "grad_norm": 0.8109946111067116, "learning_rate": 6.998590181437018e-08, "loss": 1.5462, "step": 3920 }, { "epoch": 0.8138231631382317, "grad_norm": 0.7499304668430169, "learning_rate": 6.992106908955253e-08, "loss": 1.5361, "step": 3921 }, { "epoch": 0.8140307181403071, "grad_norm": 0.6435543150522368, "learning_rate": 6.985630017431195e-08, "loss": 1.5213, "step": 3922 }, { "epoch": 0.8142382731423827, "grad_norm": 3.350248213726436, "learning_rate": 6.97915950961864e-08, "loss": 1.5266, "step": 3923 }, { "epoch": 0.8144458281444583, "grad_norm": 0.7466573640325941, "learning_rate": 6.972695388268683e-08, "loss": 1.431, "step": 3924 }, { "epoch": 0.8146533831465338, "grad_norm": 0.6616118246346304, "learning_rate": 6.966237656129699e-08, "loss": 1.5716, "step": 3925 }, { "epoch": 0.8148609381486094, "grad_norm": 0.7911929576274094, "learning_rate": 6.959786315947337e-08, "loss": 1.4813, "step": 3926 }, { "epoch": 0.815068493150685, "grad_norm": 0.6657740916108909, "learning_rate": 6.953341370464534e-08, "loss": 1.5693, "step": 3927 }, { "epoch": 0.8152760481527604, "grad_norm": 0.8922021290033862, "learning_rate": 6.946902822421523e-08, "loss": 1.4883, "step": 3928 }, { "epoch": 0.815483603154836, "grad_norm": 0.6748956994472686, "learning_rate": 6.940470674555787e-08, "loss": 1.5426, "step": 3929 }, { "epoch": 0.8156911581569116, "grad_norm": 0.8180035705269678, "learning_rate": 6.934044929602118e-08, "loss": 1.4922, "step": 3930 }, { "epoch": 0.8158987131589871, "grad_norm": 0.6885504284129773, "learning_rate": 6.927625590292562e-08, "loss": 1.5001, "step": 3931 }, { "epoch": 0.8161062681610627, "grad_norm": 0.7493248564437155, "learning_rate": 6.921212659356462e-08, "loss": 1.4672, "step": 3932 }, { "epoch": 0.8163138231631383, "grad_norm": 0.7009465607093497, "learning_rate": 6.914806139520412e-08, "loss": 1.5368, "step": 3933 }, { "epoch": 0.8165213781652138, "grad_norm": 0.6555749643294975, "learning_rate": 6.908406033508311e-08, "loss": 1.5656, "step": 3934 }, { "epoch": 0.8167289331672893, "grad_norm": 2.862404783126632, "learning_rate": 6.9020123440413e-08, "loss": 1.5035, "step": 3935 }, { "epoch": 0.8169364881693649, "grad_norm": 1.8440126188902186, "learning_rate": 6.895625073837813e-08, "loss": 1.4635, "step": 3936 }, { "epoch": 0.8171440431714404, "grad_norm": 0.7583126981058725, "learning_rate": 6.889244225613549e-08, "loss": 1.582, "step": 3937 }, { "epoch": 0.817351598173516, "grad_norm": 0.9761642608640844, "learning_rate": 6.882869802081463e-08, "loss": 1.493, "step": 3938 }, { "epoch": 0.8175591531755916, "grad_norm": 0.6784036569302359, "learning_rate": 6.87650180595181e-08, "loss": 1.5166, "step": 3939 }, { "epoch": 0.8177667081776671, "grad_norm": 1.537508250417813, "learning_rate": 6.870140239932081e-08, "loss": 1.5095, "step": 3940 }, { "epoch": 0.8179742631797426, "grad_norm": 0.8462411484393287, "learning_rate": 6.863785106727044e-08, "loss": 1.5169, "step": 3941 }, { "epoch": 0.8181818181818182, "grad_norm": 1.1495082592010728, "learning_rate": 6.857436409038737e-08, "loss": 1.544, "step": 3942 }, { "epoch": 0.8183893731838937, "grad_norm": 1.2199801385914453, "learning_rate": 6.851094149566463e-08, "loss": 1.5085, "step": 3943 }, { "epoch": 0.8185969281859693, "grad_norm": 1.1344819929576584, "learning_rate": 6.844758331006767e-08, "loss": 1.6313, "step": 3944 }, { "epoch": 0.8188044831880449, "grad_norm": 0.7719865711365146, "learning_rate": 6.838428956053484e-08, "loss": 1.4869, "step": 3945 }, { "epoch": 0.8190120381901204, "grad_norm": 0.7716814754133661, "learning_rate": 6.832106027397692e-08, "loss": 1.4697, "step": 3946 }, { "epoch": 0.819219593192196, "grad_norm": 0.7200453306772558, "learning_rate": 6.825789547727734e-08, "loss": 1.5536, "step": 3947 }, { "epoch": 0.8194271481942715, "grad_norm": 0.9333078881532242, "learning_rate": 6.819479519729203e-08, "loss": 1.5322, "step": 3948 }, { "epoch": 0.819634703196347, "grad_norm": 1.0854579305499146, "learning_rate": 6.813175946084964e-08, "loss": 1.4809, "step": 3949 }, { "epoch": 0.8198422581984226, "grad_norm": 0.8929740088085423, "learning_rate": 6.806878829475126e-08, "loss": 1.577, "step": 3950 }, { "epoch": 0.8200498132004981, "grad_norm": 0.8738077849303773, "learning_rate": 6.800588172577056e-08, "loss": 1.4622, "step": 3951 }, { "epoch": 0.8202573682025737, "grad_norm": 0.7658881722551253, "learning_rate": 6.79430397806537e-08, "loss": 1.5241, "step": 3952 }, { "epoch": 0.8204649232046493, "grad_norm": 0.7887480792608634, "learning_rate": 6.788026248611943e-08, "loss": 1.5417, "step": 3953 }, { "epoch": 0.8206724782067247, "grad_norm": 1.0391172925583911, "learning_rate": 6.781754986885908e-08, "loss": 1.5583, "step": 3954 }, { "epoch": 0.8208800332088003, "grad_norm": 0.6797121632495665, "learning_rate": 6.775490195553623e-08, "loss": 1.5143, "step": 3955 }, { "epoch": 0.8210875882108759, "grad_norm": 0.930437352774962, "learning_rate": 6.769231877278722e-08, "loss": 1.483, "step": 3956 }, { "epoch": 0.8212951432129514, "grad_norm": 0.977655021724759, "learning_rate": 6.762980034722074e-08, "loss": 1.556, "step": 3957 }, { "epoch": 0.821502698215027, "grad_norm": 1.1061971670670543, "learning_rate": 6.756734670541796e-08, "loss": 1.5894, "step": 3958 }, { "epoch": 0.8217102532171026, "grad_norm": 0.7392699414131901, "learning_rate": 6.750495787393246e-08, "loss": 1.4559, "step": 3959 }, { "epoch": 0.821917808219178, "grad_norm": 0.7135311695772597, "learning_rate": 6.744263387929043e-08, "loss": 1.588, "step": 3960 }, { "epoch": 0.8221253632212536, "grad_norm": 0.83828066088895, "learning_rate": 6.738037474799024e-08, "loss": 1.5301, "step": 3961 }, { "epoch": 0.8223329182233292, "grad_norm": 0.678534812742404, "learning_rate": 6.731818050650291e-08, "loss": 1.5302, "step": 3962 }, { "epoch": 0.8225404732254047, "grad_norm": 0.7335522427644094, "learning_rate": 6.725605118127178e-08, "loss": 1.5232, "step": 3963 }, { "epoch": 0.8227480282274803, "grad_norm": 0.6417187060611894, "learning_rate": 6.719398679871251e-08, "loss": 1.4722, "step": 3964 }, { "epoch": 0.8229555832295559, "grad_norm": 0.7662115995940865, "learning_rate": 6.713198738521333e-08, "loss": 1.5787, "step": 3965 }, { "epoch": 0.8231631382316313, "grad_norm": 1.8027099372868731, "learning_rate": 6.707005296713468e-08, "loss": 1.4893, "step": 3966 }, { "epoch": 0.8233706932337069, "grad_norm": 0.8526968726829183, "learning_rate": 6.700818357080946e-08, "loss": 1.5642, "step": 3967 }, { "epoch": 0.8235782482357825, "grad_norm": 0.7536337905438388, "learning_rate": 6.694637922254285e-08, "loss": 1.5283, "step": 3968 }, { "epoch": 0.823785803237858, "grad_norm": 0.6901380135153965, "learning_rate": 6.688463994861256e-08, "loss": 1.5496, "step": 3969 }, { "epoch": 0.8239933582399336, "grad_norm": 0.9394012051448764, "learning_rate": 6.682296577526825e-08, "loss": 1.5479, "step": 3970 }, { "epoch": 0.8242009132420092, "grad_norm": 3.338279899333179, "learning_rate": 6.676135672873235e-08, "loss": 1.5816, "step": 3971 }, { "epoch": 0.8244084682440846, "grad_norm": 0.837861545862268, "learning_rate": 6.66998128351993e-08, "loss": 1.5243, "step": 3972 }, { "epoch": 0.8246160232461602, "grad_norm": 0.8099423206670616, "learning_rate": 6.663833412083594e-08, "loss": 1.5165, "step": 3973 }, { "epoch": 0.8248235782482358, "grad_norm": 0.7002104648336371, "learning_rate": 6.657692061178135e-08, "loss": 1.5257, "step": 3974 }, { "epoch": 0.8250311332503113, "grad_norm": 0.8528257669780135, "learning_rate": 6.651557233414701e-08, "loss": 1.6136, "step": 3975 }, { "epoch": 0.8252386882523869, "grad_norm": 0.9462603281941957, "learning_rate": 6.645428931401654e-08, "loss": 1.5316, "step": 3976 }, { "epoch": 0.8254462432544625, "grad_norm": 0.6783504465284893, "learning_rate": 6.639307157744584e-08, "loss": 1.5971, "step": 3977 }, { "epoch": 0.825653798256538, "grad_norm": 1.014143670423745, "learning_rate": 6.633191915046308e-08, "loss": 1.4433, "step": 3978 }, { "epoch": 0.8258613532586135, "grad_norm": 0.6574601956332877, "learning_rate": 6.627083205906858e-08, "loss": 1.4916, "step": 3979 }, { "epoch": 0.8260689082606891, "grad_norm": 0.7508584673423091, "learning_rate": 6.620981032923507e-08, "loss": 1.5214, "step": 3980 }, { "epoch": 0.8262764632627646, "grad_norm": 1.9789408390185967, "learning_rate": 6.614885398690731e-08, "loss": 1.5309, "step": 3981 }, { "epoch": 0.8264840182648402, "grad_norm": 0.756981072224144, "learning_rate": 6.608796305800233e-08, "loss": 1.5772, "step": 3982 }, { "epoch": 0.8266915732669158, "grad_norm": 0.7642241825056885, "learning_rate": 6.602713756840925e-08, "loss": 1.5674, "step": 3983 }, { "epoch": 0.8268991282689913, "grad_norm": 0.6732728997199778, "learning_rate": 6.596637754398964e-08, "loss": 1.5036, "step": 3984 }, { "epoch": 0.8271066832710668, "grad_norm": 1.2061167295007007, "learning_rate": 6.590568301057684e-08, "loss": 1.5607, "step": 3985 }, { "epoch": 0.8273142382731424, "grad_norm": 0.8267377943104486, "learning_rate": 6.584505399397671e-08, "loss": 1.4657, "step": 3986 }, { "epoch": 0.8275217932752179, "grad_norm": 0.7452701686992949, "learning_rate": 6.578449051996704e-08, "loss": 1.5487, "step": 3987 }, { "epoch": 0.8277293482772935, "grad_norm": 0.7849842204686843, "learning_rate": 6.572399261429779e-08, "loss": 1.5224, "step": 3988 }, { "epoch": 0.8279369032793691, "grad_norm": 0.6890583808368584, "learning_rate": 6.566356030269107e-08, "loss": 1.5383, "step": 3989 }, { "epoch": 0.8281444582814446, "grad_norm": 0.6631750490967718, "learning_rate": 6.560319361084113e-08, "loss": 1.5068, "step": 3990 }, { "epoch": 0.8283520132835201, "grad_norm": 0.6934382234608076, "learning_rate": 6.554289256441428e-08, "loss": 1.5096, "step": 3991 }, { "epoch": 0.8285595682855957, "grad_norm": 0.7029998840001955, "learning_rate": 6.548265718904885e-08, "loss": 1.4434, "step": 3992 }, { "epoch": 0.8287671232876712, "grad_norm": 0.7602917098136772, "learning_rate": 6.542248751035549e-08, "loss": 1.534, "step": 3993 }, { "epoch": 0.8289746782897468, "grad_norm": 0.6970463824668981, "learning_rate": 6.536238355391653e-08, "loss": 1.5488, "step": 3994 }, { "epoch": 0.8291822332918223, "grad_norm": 0.8554720805167183, "learning_rate": 6.530234534528678e-08, "loss": 1.4676, "step": 3995 }, { "epoch": 0.8293897882938979, "grad_norm": 0.7399200491620423, "learning_rate": 6.524237290999273e-08, "loss": 1.5223, "step": 3996 }, { "epoch": 0.8295973432959735, "grad_norm": 1.109589195801519, "learning_rate": 6.518246627353316e-08, "loss": 1.5512, "step": 3997 }, { "epoch": 0.8298048982980489, "grad_norm": 0.8261470761235, "learning_rate": 6.512262546137879e-08, "loss": 1.4925, "step": 3998 }, { "epoch": 0.8300124533001245, "grad_norm": 0.7896728165008715, "learning_rate": 6.506285049897226e-08, "loss": 1.5333, "step": 3999 }, { "epoch": 0.8302200083022001, "grad_norm": 0.8472332292152489, "learning_rate": 6.500314141172835e-08, "loss": 1.5613, "step": 4000 }, { "epoch": 0.8304275633042756, "grad_norm": 0.7266928084604373, "learning_rate": 6.49434982250338e-08, "loss": 1.6294, "step": 4001 }, { "epoch": 0.8306351183063512, "grad_norm": 0.9058703164168328, "learning_rate": 6.488392096424731e-08, "loss": 1.4667, "step": 4002 }, { "epoch": 0.8308426733084268, "grad_norm": 0.7157024306252904, "learning_rate": 6.482440965469952e-08, "loss": 1.4278, "step": 4003 }, { "epoch": 0.8310502283105022, "grad_norm": 0.8619017286337939, "learning_rate": 6.476496432169305e-08, "loss": 1.4644, "step": 4004 }, { "epoch": 0.8312577833125778, "grad_norm": 0.8196160308817438, "learning_rate": 6.470558499050247e-08, "loss": 1.5187, "step": 4005 }, { "epoch": 0.8314653383146534, "grad_norm": 1.0416874361096575, "learning_rate": 6.464627168637437e-08, "loss": 1.4788, "step": 4006 }, { "epoch": 0.8316728933167289, "grad_norm": 0.6446496176647841, "learning_rate": 6.458702443452712e-08, "loss": 1.4749, "step": 4007 }, { "epoch": 0.8318804483188045, "grad_norm": 0.9902385284942313, "learning_rate": 6.452784326015112e-08, "loss": 1.5644, "step": 4008 }, { "epoch": 0.8320880033208801, "grad_norm": 0.6637870809850734, "learning_rate": 6.446872818840857e-08, "loss": 1.5092, "step": 4009 }, { "epoch": 0.8322955583229555, "grad_norm": 1.3557226768033919, "learning_rate": 6.440967924443376e-08, "loss": 1.464, "step": 4010 }, { "epoch": 0.8325031133250311, "grad_norm": 0.8224373065000923, "learning_rate": 6.435069645333255e-08, "loss": 1.5493, "step": 4011 }, { "epoch": 0.8327106683271067, "grad_norm": 0.6673155421788557, "learning_rate": 6.429177984018299e-08, "loss": 1.492, "step": 4012 }, { "epoch": 0.8329182233291822, "grad_norm": 1.1966523136035498, "learning_rate": 6.423292943003483e-08, "loss": 1.4943, "step": 4013 }, { "epoch": 0.8331257783312578, "grad_norm": 1.1958785207033091, "learning_rate": 6.417414524790972e-08, "loss": 1.5035, "step": 4014 }, { "epoch": 0.8333333333333334, "grad_norm": 0.683083340107233, "learning_rate": 6.411542731880104e-08, "loss": 1.4767, "step": 4015 }, { "epoch": 0.8335408883354088, "grad_norm": 0.8023184503380092, "learning_rate": 6.405677566767422e-08, "loss": 1.5772, "step": 4016 }, { "epoch": 0.8337484433374844, "grad_norm": 1.9828963594388602, "learning_rate": 6.39981903194663e-08, "loss": 1.49, "step": 4017 }, { "epoch": 0.83395599833956, "grad_norm": 0.6552785754007138, "learning_rate": 6.393967129908623e-08, "loss": 1.5242, "step": 4018 }, { "epoch": 0.8341635533416355, "grad_norm": 1.115460944958447, "learning_rate": 6.388121863141485e-08, "loss": 1.481, "step": 4019 }, { "epoch": 0.8343711083437111, "grad_norm": 2.3952332672823013, "learning_rate": 6.382283234130449e-08, "loss": 1.5793, "step": 4020 }, { "epoch": 0.8345786633457867, "grad_norm": 0.7691691239866543, "learning_rate": 6.37645124535796e-08, "loss": 1.5019, "step": 4021 }, { "epoch": 0.8347862183478622, "grad_norm": 1.0999299882569888, "learning_rate": 6.370625899303619e-08, "loss": 1.4645, "step": 4022 }, { "epoch": 0.8349937733499377, "grad_norm": 1.1305404759027466, "learning_rate": 6.36480719844421e-08, "loss": 1.4836, "step": 4023 }, { "epoch": 0.8352013283520133, "grad_norm": 0.7426489174245947, "learning_rate": 6.358995145253684e-08, "loss": 1.4689, "step": 4024 }, { "epoch": 0.8354088833540888, "grad_norm": 3.657741647753389, "learning_rate": 6.353189742203186e-08, "loss": 1.5608, "step": 4025 }, { "epoch": 0.8356164383561644, "grad_norm": 1.619263971695883, "learning_rate": 6.347390991761001e-08, "loss": 1.5022, "step": 4026 }, { "epoch": 0.83582399335824, "grad_norm": 0.7463013493328976, "learning_rate": 6.341598896392622e-08, "loss": 1.5812, "step": 4027 }, { "epoch": 0.8360315483603155, "grad_norm": 0.8919105757367832, "learning_rate": 6.33581345856068e-08, "loss": 1.5046, "step": 4028 }, { "epoch": 0.836239103362391, "grad_norm": 1.045769014945916, "learning_rate": 6.330034680724994e-08, "loss": 1.5252, "step": 4029 }, { "epoch": 0.8364466583644666, "grad_norm": 1.1356060064209288, "learning_rate": 6.324262565342551e-08, "loss": 1.5143, "step": 4030 }, { "epoch": 0.8366542133665421, "grad_norm": 0.863722223247178, "learning_rate": 6.318497114867496e-08, "loss": 1.5349, "step": 4031 }, { "epoch": 0.8368617683686177, "grad_norm": 0.7454296691034005, "learning_rate": 6.312738331751151e-08, "loss": 1.4336, "step": 4032 }, { "epoch": 0.8370693233706933, "grad_norm": 0.7981852137376518, "learning_rate": 6.306986218441989e-08, "loss": 1.5751, "step": 4033 }, { "epoch": 0.8372768783727688, "grad_norm": 0.9196672384985635, "learning_rate": 6.301240777385668e-08, "loss": 1.5902, "step": 4034 }, { "epoch": 0.8374844333748444, "grad_norm": 0.7031730723823548, "learning_rate": 6.295502011024982e-08, "loss": 1.5013, "step": 4035 }, { "epoch": 0.8376919883769199, "grad_norm": 0.7216430684092466, "learning_rate": 6.289769921799917e-08, "loss": 1.512, "step": 4036 }, { "epoch": 0.8378995433789954, "grad_norm": 2.729593799347814, "learning_rate": 6.284044512147594e-08, "loss": 1.5635, "step": 4037 }, { "epoch": 0.838107098381071, "grad_norm": 0.6598622031005454, "learning_rate": 6.278325784502313e-08, "loss": 1.5532, "step": 4038 }, { "epoch": 0.8383146533831465, "grad_norm": 0.7528190074016754, "learning_rate": 6.272613741295521e-08, "loss": 1.4635, "step": 4039 }, { "epoch": 0.8385222083852221, "grad_norm": 0.6400501362935004, "learning_rate": 6.266908384955827e-08, "loss": 1.5204, "step": 4040 }, { "epoch": 0.8387297633872977, "grad_norm": 1.813795407042309, "learning_rate": 6.261209717908995e-08, "loss": 1.5519, "step": 4041 }, { "epoch": 0.8389373183893731, "grad_norm": 0.7276495992698796, "learning_rate": 6.255517742577952e-08, "loss": 1.5482, "step": 4042 }, { "epoch": 0.8391448733914487, "grad_norm": 0.8518484520662879, "learning_rate": 6.249832461382775e-08, "loss": 1.4735, "step": 4043 }, { "epoch": 0.8393524283935243, "grad_norm": 2.598369620965311, "learning_rate": 6.244153876740686e-08, "loss": 1.545, "step": 4044 }, { "epoch": 0.8395599833955998, "grad_norm": 0.9537866917180096, "learning_rate": 6.238481991066085e-08, "loss": 1.4609, "step": 4045 }, { "epoch": 0.8397675383976754, "grad_norm": 0.6656352183916423, "learning_rate": 6.232816806770487e-08, "loss": 1.4427, "step": 4046 }, { "epoch": 0.839975093399751, "grad_norm": 0.7057138708790681, "learning_rate": 6.227158326262591e-08, "loss": 1.4061, "step": 4047 }, { "epoch": 0.8401826484018264, "grad_norm": 0.7004764760883524, "learning_rate": 6.221506551948233e-08, "loss": 1.5352, "step": 4048 }, { "epoch": 0.840390203403902, "grad_norm": 0.8454728395337476, "learning_rate": 6.215861486230392e-08, "loss": 1.5024, "step": 4049 }, { "epoch": 0.8405977584059776, "grad_norm": 1.8347602893046522, "learning_rate": 6.210223131509197e-08, "loss": 1.4925, "step": 4050 }, { "epoch": 0.8408053134080531, "grad_norm": 1.0991041899686835, "learning_rate": 6.204591490181941e-08, "loss": 1.5169, "step": 4051 }, { "epoch": 0.8410128684101287, "grad_norm": 0.6797311961478647, "learning_rate": 6.198966564643031e-08, "loss": 1.5296, "step": 4052 }, { "epoch": 0.8412204234122043, "grad_norm": 0.6797181790647425, "learning_rate": 6.193348357284048e-08, "loss": 1.5148, "step": 4053 }, { "epoch": 0.8414279784142797, "grad_norm": 2.276414250337877, "learning_rate": 6.187736870493699e-08, "loss": 1.5085, "step": 4054 }, { "epoch": 0.8416355334163553, "grad_norm": 0.8307526684781883, "learning_rate": 6.182132106657839e-08, "loss": 1.5162, "step": 4055 }, { "epoch": 0.8418430884184309, "grad_norm": 3.8172882819525, "learning_rate": 6.176534068159471e-08, "loss": 1.4723, "step": 4056 }, { "epoch": 0.8420506434205064, "grad_norm": 0.6544425682927617, "learning_rate": 6.170942757378728e-08, "loss": 1.5078, "step": 4057 }, { "epoch": 0.842258198422582, "grad_norm": 0.9381897686102861, "learning_rate": 6.165358176692885e-08, "loss": 1.5589, "step": 4058 }, { "epoch": 0.8424657534246576, "grad_norm": 0.8562378369972459, "learning_rate": 6.159780328476358e-08, "loss": 1.5122, "step": 4059 }, { "epoch": 0.842673308426733, "grad_norm": 0.7729498223687795, "learning_rate": 6.154209215100709e-08, "loss": 1.5176, "step": 4060 }, { "epoch": 0.8428808634288086, "grad_norm": 0.7682316854340769, "learning_rate": 6.14864483893461e-08, "loss": 1.4993, "step": 4061 }, { "epoch": 0.8430884184308842, "grad_norm": 0.7712317870670445, "learning_rate": 6.1430872023439e-08, "loss": 1.4995, "step": 4062 }, { "epoch": 0.8432959734329597, "grad_norm": 0.8989037497727227, "learning_rate": 6.137536307691535e-08, "loss": 1.5372, "step": 4063 }, { "epoch": 0.8435035284350353, "grad_norm": 0.6893389098040597, "learning_rate": 6.131992157337608e-08, "loss": 1.5526, "step": 4064 }, { "epoch": 0.8437110834371109, "grad_norm": 0.7643914499723264, "learning_rate": 6.126454753639342e-08, "loss": 1.4557, "step": 4065 }, { "epoch": 0.8439186384391864, "grad_norm": 0.6529769647658299, "learning_rate": 6.120924098951102e-08, "loss": 1.4867, "step": 4066 }, { "epoch": 0.8441261934412619, "grad_norm": 0.8971531483074574, "learning_rate": 6.115400195624363e-08, "loss": 1.5656, "step": 4067 }, { "epoch": 0.8443337484433375, "grad_norm": 0.7366126592976487, "learning_rate": 6.109883046007749e-08, "loss": 1.4596, "step": 4068 }, { "epoch": 0.844541303445413, "grad_norm": 0.7260828512171599, "learning_rate": 6.10437265244701e-08, "loss": 1.4864, "step": 4069 }, { "epoch": 0.8447488584474886, "grad_norm": 1.2388362343258517, "learning_rate": 6.098869017285007e-08, "loss": 1.5321, "step": 4070 }, { "epoch": 0.8449564134495642, "grad_norm": 0.6813257330196466, "learning_rate": 6.09337214286175e-08, "loss": 1.4887, "step": 4071 }, { "epoch": 0.8451639684516397, "grad_norm": 0.7391326802325049, "learning_rate": 6.087882031514364e-08, "loss": 1.5583, "step": 4072 }, { "epoch": 0.8453715234537152, "grad_norm": 0.8887157733844627, "learning_rate": 6.082398685577094e-08, "loss": 1.4544, "step": 4073 }, { "epoch": 0.8455790784557908, "grad_norm": 0.7114636308472719, "learning_rate": 6.07692210738131e-08, "loss": 1.5509, "step": 4074 }, { "epoch": 0.8457866334578663, "grad_norm": 0.9378076225994207, "learning_rate": 6.071452299255522e-08, "loss": 1.4951, "step": 4075 }, { "epoch": 0.8459941884599419, "grad_norm": 0.7049201342496771, "learning_rate": 6.065989263525329e-08, "loss": 1.4885, "step": 4076 }, { "epoch": 0.8462017434620175, "grad_norm": 0.8435285100069949, "learning_rate": 6.060533002513481e-08, "loss": 1.4984, "step": 4077 }, { "epoch": 0.846409298464093, "grad_norm": 4.0280121079449716, "learning_rate": 6.055083518539831e-08, "loss": 1.4455, "step": 4078 }, { "epoch": 0.8466168534661686, "grad_norm": 1.5884707723620994, "learning_rate": 6.049640813921356e-08, "loss": 1.5793, "step": 4079 }, { "epoch": 0.8468244084682441, "grad_norm": 0.7859921295499089, "learning_rate": 6.044204890972144e-08, "loss": 1.4924, "step": 4080 }, { "epoch": 0.8470319634703196, "grad_norm": 1.0117214614418315, "learning_rate": 6.038775752003415e-08, "loss": 1.4763, "step": 4081 }, { "epoch": 0.8472395184723952, "grad_norm": 0.8687583311362441, "learning_rate": 6.033353399323491e-08, "loss": 1.5534, "step": 4082 }, { "epoch": 0.8474470734744707, "grad_norm": 0.8520361933348216, "learning_rate": 6.027937835237808e-08, "loss": 1.4673, "step": 4083 }, { "epoch": 0.8476546284765463, "grad_norm": 0.8579679305047718, "learning_rate": 6.022529062048925e-08, "loss": 1.4336, "step": 4084 }, { "epoch": 0.8478621834786219, "grad_norm": 2.903900444805852, "learning_rate": 6.017127082056505e-08, "loss": 1.5954, "step": 4085 }, { "epoch": 0.8480697384806973, "grad_norm": 2.115870460475413, "learning_rate": 6.011731897557333e-08, "loss": 1.5902, "step": 4086 }, { "epoch": 0.8482772934827729, "grad_norm": 1.3854393479840557, "learning_rate": 6.006343510845288e-08, "loss": 1.5892, "step": 4087 }, { "epoch": 0.8484848484848485, "grad_norm": 0.9180946898742948, "learning_rate": 6.00096192421138e-08, "loss": 1.5293, "step": 4088 }, { "epoch": 0.848692403486924, "grad_norm": 0.7679457186793486, "learning_rate": 5.995587139943708e-08, "loss": 1.5339, "step": 4089 }, { "epoch": 0.8488999584889996, "grad_norm": 0.7799100023818198, "learning_rate": 5.990219160327494e-08, "loss": 1.5169, "step": 4090 }, { "epoch": 0.8491075134910752, "grad_norm": 0.8760153198603559, "learning_rate": 5.984857987645054e-08, "loss": 1.533, "step": 4091 }, { "epoch": 0.8493150684931506, "grad_norm": 0.6843072573169289, "learning_rate": 5.97950362417582e-08, "loss": 1.5652, "step": 4092 }, { "epoch": 0.8495226234952262, "grad_norm": 0.9997743818174234, "learning_rate": 5.974156072196326e-08, "loss": 1.5145, "step": 4093 }, { "epoch": 0.8497301784973018, "grad_norm": 0.9971245097302768, "learning_rate": 5.96881533398021e-08, "loss": 1.4771, "step": 4094 }, { "epoch": 0.8499377334993773, "grad_norm": 0.9221102322880435, "learning_rate": 5.96348141179821e-08, "loss": 1.527, "step": 4095 }, { "epoch": 0.8501452885014529, "grad_norm": 1.0828798853149835, "learning_rate": 5.958154307918165e-08, "loss": 1.5759, "step": 4096 }, { "epoch": 0.8503528435035285, "grad_norm": 1.8271044083505592, "learning_rate": 5.9528340246050245e-08, "loss": 1.4829, "step": 4097 }, { "epoch": 0.8505603985056039, "grad_norm": 0.6920566831847973, "learning_rate": 5.947520564120829e-08, "loss": 1.444, "step": 4098 }, { "epoch": 0.8507679535076795, "grad_norm": 0.6859673802190966, "learning_rate": 5.94221392872472e-08, "loss": 1.5767, "step": 4099 }, { "epoch": 0.8509755085097551, "grad_norm": 0.9842430714096108, "learning_rate": 5.9369141206729345e-08, "loss": 1.478, "step": 4100 }, { "epoch": 0.8511830635118306, "grad_norm": 0.8889043794707988, "learning_rate": 5.9316211422188224e-08, "loss": 1.6093, "step": 4101 }, { "epoch": 0.8513906185139062, "grad_norm": 1.3491699629533125, "learning_rate": 5.926334995612802e-08, "loss": 1.5002, "step": 4102 }, { "epoch": 0.8515981735159818, "grad_norm": 0.659643775828377, "learning_rate": 5.9210556831024115e-08, "loss": 1.4739, "step": 4103 }, { "epoch": 0.8518057285180572, "grad_norm": 0.7217318821852536, "learning_rate": 5.9157832069322763e-08, "loss": 1.5892, "step": 4104 }, { "epoch": 0.8520132835201328, "grad_norm": 0.8551784757463411, "learning_rate": 5.910517569344108e-08, "loss": 1.4656, "step": 4105 }, { "epoch": 0.8522208385222084, "grad_norm": 0.8819767414008842, "learning_rate": 5.905258772576714e-08, "loss": 1.5238, "step": 4106 }, { "epoch": 0.8524283935242839, "grad_norm": 2.072011996681699, "learning_rate": 5.900006818866003e-08, "loss": 1.5415, "step": 4107 }, { "epoch": 0.8526359485263595, "grad_norm": 1.3649564615315766, "learning_rate": 5.894761710444961e-08, "loss": 1.5169, "step": 4108 }, { "epoch": 0.8528435035284351, "grad_norm": 0.6393359145198483, "learning_rate": 5.889523449543672e-08, "loss": 1.5239, "step": 4109 }, { "epoch": 0.8530510585305106, "grad_norm": 0.7792328380483428, "learning_rate": 5.8842920383893014e-08, "loss": 1.4813, "step": 4110 }, { "epoch": 0.8532586135325861, "grad_norm": 1.0054966380207717, "learning_rate": 5.879067479206107e-08, "loss": 1.5708, "step": 4111 }, { "epoch": 0.8534661685346617, "grad_norm": 1.6157621894184846, "learning_rate": 5.873849774215435e-08, "loss": 1.5545, "step": 4112 }, { "epoch": 0.8536737235367372, "grad_norm": 0.6940989261812918, "learning_rate": 5.8686389256357164e-08, "loss": 1.5008, "step": 4113 }, { "epoch": 0.8538812785388128, "grad_norm": 0.6655757647220574, "learning_rate": 5.863434935682461e-08, "loss": 1.5302, "step": 4114 }, { "epoch": 0.8540888335408884, "grad_norm": 1.6014266122475473, "learning_rate": 5.858237806568267e-08, "loss": 1.5292, "step": 4115 }, { "epoch": 0.8542963885429639, "grad_norm": 1.5053212959295068, "learning_rate": 5.853047540502826e-08, "loss": 1.4585, "step": 4116 }, { "epoch": 0.8545039435450394, "grad_norm": 1.3008247275325335, "learning_rate": 5.847864139692886e-08, "loss": 1.4977, "step": 4117 }, { "epoch": 0.854711498547115, "grad_norm": 1.116950383213203, "learning_rate": 5.842687606342301e-08, "loss": 1.4599, "step": 4118 }, { "epoch": 0.8549190535491905, "grad_norm": 0.8278884111697892, "learning_rate": 5.837517942651996e-08, "loss": 1.4832, "step": 4119 }, { "epoch": 0.8551266085512661, "grad_norm": 0.8400076822517751, "learning_rate": 5.8323551508199717e-08, "loss": 1.5074, "step": 4120 }, { "epoch": 0.8553341635533417, "grad_norm": 0.665323811090533, "learning_rate": 5.827199233041306e-08, "loss": 1.5407, "step": 4121 }, { "epoch": 0.8555417185554172, "grad_norm": 1.086483612788927, "learning_rate": 5.8220501915081674e-08, "loss": 1.5669, "step": 4122 }, { "epoch": 0.8557492735574928, "grad_norm": 0.9649567223562108, "learning_rate": 5.816908028409787e-08, "loss": 1.514, "step": 4123 }, { "epoch": 0.8559568285595683, "grad_norm": 0.9446684263406112, "learning_rate": 5.8117727459324726e-08, "loss": 1.4992, "step": 4124 }, { "epoch": 0.8561643835616438, "grad_norm": 0.997418081391977, "learning_rate": 5.8066443462596216e-08, "loss": 1.4549, "step": 4125 }, { "epoch": 0.8563719385637194, "grad_norm": 0.6849424593675485, "learning_rate": 5.801522831571677e-08, "loss": 1.5443, "step": 4126 }, { "epoch": 0.8565794935657949, "grad_norm": 0.651125153577864, "learning_rate": 5.796408204046186e-08, "loss": 1.4817, "step": 4127 }, { "epoch": 0.8567870485678705, "grad_norm": 2.123893077361472, "learning_rate": 5.791300465857741e-08, "loss": 1.4618, "step": 4128 }, { "epoch": 0.8569946035699461, "grad_norm": 0.8021709304260402, "learning_rate": 5.786199619178023e-08, "loss": 1.5456, "step": 4129 }, { "epoch": 0.8572021585720215, "grad_norm": 0.8451273955406761, "learning_rate": 5.781105666175776e-08, "loss": 1.5068, "step": 4130 }, { "epoch": 0.8574097135740971, "grad_norm": 0.637188116898242, "learning_rate": 5.7760186090168115e-08, "loss": 1.5736, "step": 4131 }, { "epoch": 0.8576172685761727, "grad_norm": 1.0107153983033976, "learning_rate": 5.770938449864009e-08, "loss": 1.4868, "step": 4132 }, { "epoch": 0.8578248235782482, "grad_norm": 2.365642913075964, "learning_rate": 5.7658651908773256e-08, "loss": 1.5215, "step": 4133 }, { "epoch": 0.8580323785803238, "grad_norm": 0.8824901576220718, "learning_rate": 5.7607988342137715e-08, "loss": 1.3926, "step": 4134 }, { "epoch": 0.8582399335823994, "grad_norm": 0.7803924668582455, "learning_rate": 5.7557393820274204e-08, "loss": 1.4852, "step": 4135 }, { "epoch": 0.8584474885844748, "grad_norm": 0.6677374403236563, "learning_rate": 5.750686836469433e-08, "loss": 1.5503, "step": 4136 }, { "epoch": 0.8586550435865504, "grad_norm": 0.6167136380753402, "learning_rate": 5.745641199688001e-08, "loss": 1.5213, "step": 4137 }, { "epoch": 0.858862598588626, "grad_norm": 1.0645362874290583, "learning_rate": 5.740602473828402e-08, "loss": 1.5123, "step": 4138 }, { "epoch": 0.8590701535907015, "grad_norm": 0.6499849215289663, "learning_rate": 5.7355706610329716e-08, "loss": 1.5587, "step": 4139 }, { "epoch": 0.8592777085927771, "grad_norm": 1.1319038296417008, "learning_rate": 5.7305457634411e-08, "loss": 1.5284, "step": 4140 }, { "epoch": 0.8594852635948527, "grad_norm": 0.804954701781392, "learning_rate": 5.725527783189239e-08, "loss": 1.4607, "step": 4141 }, { "epoch": 0.8596928185969281, "grad_norm": 0.7438323490016324, "learning_rate": 5.720516722410904e-08, "loss": 1.448, "step": 4142 }, { "epoch": 0.8599003735990037, "grad_norm": 0.7976820323740148, "learning_rate": 5.71551258323666e-08, "loss": 1.4899, "step": 4143 }, { "epoch": 0.8601079286010793, "grad_norm": 2.458745574904749, "learning_rate": 5.7105153677941375e-08, "loss": 1.4524, "step": 4144 }, { "epoch": 0.8603154836031548, "grad_norm": 0.9145255952623502, "learning_rate": 5.705525078208022e-08, "loss": 1.4915, "step": 4145 }, { "epoch": 0.8605230386052304, "grad_norm": 0.6765272493454979, "learning_rate": 5.700541716600048e-08, "loss": 1.4882, "step": 4146 }, { "epoch": 0.860730593607306, "grad_norm": 0.7432066502522412, "learning_rate": 5.6955652850890076e-08, "loss": 1.5213, "step": 4147 }, { "epoch": 0.8609381486093814, "grad_norm": 0.8804565519988562, "learning_rate": 5.690595785790753e-08, "loss": 1.5755, "step": 4148 }, { "epoch": 0.861145703611457, "grad_norm": 0.8229643361269555, "learning_rate": 5.68563322081818e-08, "loss": 1.509, "step": 4149 }, { "epoch": 0.8613532586135326, "grad_norm": 0.8743217596728453, "learning_rate": 5.6806775922812364e-08, "loss": 1.4992, "step": 4150 }, { "epoch": 0.8615608136156081, "grad_norm": 1.0385558238892758, "learning_rate": 5.6757289022869346e-08, "loss": 1.4313, "step": 4151 }, { "epoch": 0.8617683686176837, "grad_norm": 0.8545273338243716, "learning_rate": 5.670787152939311e-08, "loss": 1.4924, "step": 4152 }, { "epoch": 0.8619759236197593, "grad_norm": 0.7217691153188052, "learning_rate": 5.6658523463394766e-08, "loss": 1.5483, "step": 4153 }, { "epoch": 0.8621834786218348, "grad_norm": 0.732358806800885, "learning_rate": 5.660924484585579e-08, "loss": 1.4881, "step": 4154 }, { "epoch": 0.8623910336239103, "grad_norm": 1.2570711275175508, "learning_rate": 5.6560035697728123e-08, "loss": 1.5465, "step": 4155 }, { "epoch": 0.8625985886259859, "grad_norm": 0.7396997836660634, "learning_rate": 5.651089603993415e-08, "loss": 1.5714, "step": 4156 }, { "epoch": 0.8628061436280614, "grad_norm": 1.0294913668569072, "learning_rate": 5.6461825893366874e-08, "loss": 1.4659, "step": 4157 }, { "epoch": 0.863013698630137, "grad_norm": 2.3175215153323983, "learning_rate": 5.641282527888947e-08, "loss": 1.4965, "step": 4158 }, { "epoch": 0.8632212536322126, "grad_norm": 0.9819983911736111, "learning_rate": 5.6363894217335803e-08, "loss": 1.5411, "step": 4159 }, { "epoch": 0.8634288086342881, "grad_norm": 0.7072225792616519, "learning_rate": 5.631503272951001e-08, "loss": 1.4452, "step": 4160 }, { "epoch": 0.8636363636363636, "grad_norm": 0.9733887170555072, "learning_rate": 5.626624083618668e-08, "loss": 1.6014, "step": 4161 }, { "epoch": 0.8638439186384392, "grad_norm": 0.813595502246141, "learning_rate": 5.62175185581109e-08, "loss": 1.4301, "step": 4162 }, { "epoch": 0.8640514736405147, "grad_norm": 0.8372090441523727, "learning_rate": 5.616886591599806e-08, "loss": 1.5035, "step": 4163 }, { "epoch": 0.8642590286425903, "grad_norm": 0.8064299937919819, "learning_rate": 5.6120282930533944e-08, "loss": 1.5675, "step": 4164 }, { "epoch": 0.8644665836446659, "grad_norm": 0.9033349856205647, "learning_rate": 5.6071769622374746e-08, "loss": 1.5799, "step": 4165 }, { "epoch": 0.8646741386467414, "grad_norm": 1.4560557057536148, "learning_rate": 5.602332601214711e-08, "loss": 1.5069, "step": 4166 }, { "epoch": 0.864881693648817, "grad_norm": 0.7661794136719537, "learning_rate": 5.5974952120447875e-08, "loss": 1.5017, "step": 4167 }, { "epoch": 0.8650892486508925, "grad_norm": 0.7517363879885167, "learning_rate": 5.59266479678444e-08, "loss": 1.5414, "step": 4168 }, { "epoch": 0.865296803652968, "grad_norm": 0.6986495433826778, "learning_rate": 5.587841357487431e-08, "loss": 1.5208, "step": 4169 }, { "epoch": 0.8655043586550436, "grad_norm": 0.7529383038820391, "learning_rate": 5.583024896204559e-08, "loss": 1.6272, "step": 4170 }, { "epoch": 0.8657119136571192, "grad_norm": 0.6211978460501443, "learning_rate": 5.5782154149836514e-08, "loss": 1.4486, "step": 4171 }, { "epoch": 0.8659194686591947, "grad_norm": 0.7761169013788679, "learning_rate": 5.5734129158695844e-08, "loss": 1.5324, "step": 4172 }, { "epoch": 0.8661270236612703, "grad_norm": 0.9257611442368944, "learning_rate": 5.5686174009042364e-08, "loss": 1.5136, "step": 4173 }, { "epoch": 0.8663345786633457, "grad_norm": 1.0235908485719383, "learning_rate": 5.5638288721265427e-08, "loss": 1.5922, "step": 4174 }, { "epoch": 0.8665421336654213, "grad_norm": 0.7283903810303131, "learning_rate": 5.5590473315724595e-08, "loss": 1.4859, "step": 4175 }, { "epoch": 0.8667496886674969, "grad_norm": 0.9405004054817377, "learning_rate": 5.5542727812749645e-08, "loss": 1.4551, "step": 4176 }, { "epoch": 0.8669572436695724, "grad_norm": 1.1197242467483994, "learning_rate": 5.549505223264081e-08, "loss": 1.4883, "step": 4177 }, { "epoch": 0.867164798671648, "grad_norm": 0.7700679216421907, "learning_rate": 5.5447446595668327e-08, "loss": 1.5388, "step": 4178 }, { "epoch": 0.8673723536737236, "grad_norm": 0.6823895391875299, "learning_rate": 5.539991092207296e-08, "loss": 1.4758, "step": 4179 }, { "epoch": 0.867579908675799, "grad_norm": 1.0629000749707977, "learning_rate": 5.535244523206559e-08, "loss": 1.5405, "step": 4180 }, { "epoch": 0.8677874636778746, "grad_norm": 0.6778443767062662, "learning_rate": 5.5305049545827366e-08, "loss": 1.4496, "step": 4181 }, { "epoch": 0.8679950186799502, "grad_norm": 0.9285642258982703, "learning_rate": 5.5257723883509645e-08, "loss": 1.5067, "step": 4182 }, { "epoch": 0.8682025736820257, "grad_norm": 0.8395923263599718, "learning_rate": 5.521046826523413e-08, "loss": 1.5268, "step": 4183 }, { "epoch": 0.8684101286841013, "grad_norm": 0.8162135082748619, "learning_rate": 5.516328271109253e-08, "loss": 1.5262, "step": 4184 }, { "epoch": 0.8686176836861769, "grad_norm": 1.0771736494419826, "learning_rate": 5.5116167241147003e-08, "loss": 1.4944, "step": 4185 }, { "epoch": 0.8688252386882523, "grad_norm": 0.8292095731176862, "learning_rate": 5.5069121875429724e-08, "loss": 1.5152, "step": 4186 }, { "epoch": 0.8690327936903279, "grad_norm": 2.0213811479064447, "learning_rate": 5.5022146633943136e-08, "loss": 1.5375, "step": 4187 }, { "epoch": 0.8692403486924035, "grad_norm": 0.6859199044064056, "learning_rate": 5.4975241536659914e-08, "loss": 1.5366, "step": 4188 }, { "epoch": 0.869447903694479, "grad_norm": 2.126172541469436, "learning_rate": 5.492840660352285e-08, "loss": 1.5683, "step": 4189 }, { "epoch": 0.8696554586965546, "grad_norm": 0.7433443582528955, "learning_rate": 5.4881641854444885e-08, "loss": 1.5988, "step": 4190 }, { "epoch": 0.8698630136986302, "grad_norm": 0.8145355296085374, "learning_rate": 5.483494730930911e-08, "loss": 1.4664, "step": 4191 }, { "epoch": 0.8700705687007056, "grad_norm": 7.258279082738799, "learning_rate": 5.478832298796895e-08, "loss": 1.4882, "step": 4192 }, { "epoch": 0.8702781237027812, "grad_norm": 1.081515734963167, "learning_rate": 5.4741768910247665e-08, "loss": 1.4902, "step": 4193 }, { "epoch": 0.8704856787048568, "grad_norm": 0.9896179450721011, "learning_rate": 5.4695285095938905e-08, "loss": 1.5437, "step": 4194 }, { "epoch": 0.8706932337069323, "grad_norm": 0.6525901152976705, "learning_rate": 5.464887156480633e-08, "loss": 1.4839, "step": 4195 }, { "epoch": 0.8709007887090079, "grad_norm": 0.8832141859934202, "learning_rate": 5.460252833658374e-08, "loss": 1.5311, "step": 4196 }, { "epoch": 0.8711083437110835, "grad_norm": 1.0599630386727497, "learning_rate": 5.455625543097503e-08, "loss": 1.5021, "step": 4197 }, { "epoch": 0.871315898713159, "grad_norm": 1.42664385688762, "learning_rate": 5.451005286765424e-08, "loss": 1.5206, "step": 4198 }, { "epoch": 0.8715234537152345, "grad_norm": 0.7265401135034683, "learning_rate": 5.4463920666265493e-08, "loss": 1.5428, "step": 4199 }, { "epoch": 0.8717310087173101, "grad_norm": 0.7619546583017525, "learning_rate": 5.4417858846422924e-08, "loss": 1.6151, "step": 4200 }, { "epoch": 0.8719385637193856, "grad_norm": 1.0970789377494663, "learning_rate": 5.437186742771083e-08, "loss": 1.4679, "step": 4201 }, { "epoch": 0.8721461187214612, "grad_norm": 1.142686229205972, "learning_rate": 5.432594642968349e-08, "loss": 1.5342, "step": 4202 }, { "epoch": 0.8723536737235368, "grad_norm": 0.663847238892591, "learning_rate": 5.428009587186535e-08, "loss": 1.5082, "step": 4203 }, { "epoch": 0.8725612287256123, "grad_norm": 4.310129815916642, "learning_rate": 5.423431577375085e-08, "loss": 1.4768, "step": 4204 }, { "epoch": 0.8727687837276878, "grad_norm": 0.9347674646692149, "learning_rate": 5.418860615480445e-08, "loss": 1.5613, "step": 4205 }, { "epoch": 0.8729763387297634, "grad_norm": 0.7720927812734257, "learning_rate": 5.414296703446063e-08, "loss": 1.4526, "step": 4206 }, { "epoch": 0.8731838937318389, "grad_norm": 0.6868919417199573, "learning_rate": 5.409739843212406e-08, "loss": 1.4771, "step": 4207 }, { "epoch": 0.8733914487339145, "grad_norm": 0.8804363251874378, "learning_rate": 5.405190036716912e-08, "loss": 1.4903, "step": 4208 }, { "epoch": 0.8735990037359901, "grad_norm": 0.9938283284415154, "learning_rate": 5.4006472858940535e-08, "loss": 1.5267, "step": 4209 }, { "epoch": 0.8738065587380656, "grad_norm": 0.6702845584703695, "learning_rate": 5.3961115926752786e-08, "loss": 1.497, "step": 4210 }, { "epoch": 0.8740141137401412, "grad_norm": 0.6828171091381926, "learning_rate": 5.391582958989047e-08, "loss": 1.6409, "step": 4211 }, { "epoch": 0.8742216687422167, "grad_norm": 0.8883174027937882, "learning_rate": 5.3870613867608104e-08, "loss": 1.5631, "step": 4212 }, { "epoch": 0.8744292237442922, "grad_norm": 0.6266442412496519, "learning_rate": 5.382546877913026e-08, "loss": 1.536, "step": 4213 }, { "epoch": 0.8746367787463678, "grad_norm": 2.356246620684954, "learning_rate": 5.378039434365143e-08, "loss": 1.5815, "step": 4214 }, { "epoch": 0.8748443337484434, "grad_norm": 0.9445070211315894, "learning_rate": 5.3735390580336024e-08, "loss": 1.5322, "step": 4215 }, { "epoch": 0.8750518887505189, "grad_norm": 0.7320135726769582, "learning_rate": 5.3690457508318464e-08, "loss": 1.4941, "step": 4216 }, { "epoch": 0.8752594437525945, "grad_norm": 0.7033963475871209, "learning_rate": 5.3645595146703085e-08, "loss": 1.4586, "step": 4217 }, { "epoch": 0.8754669987546699, "grad_norm": 0.9536858445805583, "learning_rate": 5.3600803514564255e-08, "loss": 1.5208, "step": 4218 }, { "epoch": 0.8756745537567455, "grad_norm": 0.9280960572717364, "learning_rate": 5.355608263094607e-08, "loss": 1.5253, "step": 4219 }, { "epoch": 0.8758821087588211, "grad_norm": 0.7964899446883809, "learning_rate": 5.351143251486271e-08, "loss": 1.5894, "step": 4220 }, { "epoch": 0.8760896637608966, "grad_norm": 0.6131247619568557, "learning_rate": 5.3466853185298265e-08, "loss": 1.5307, "step": 4221 }, { "epoch": 0.8762972187629722, "grad_norm": 0.8473869228543437, "learning_rate": 5.342234466120662e-08, "loss": 1.5252, "step": 4222 }, { "epoch": 0.8765047737650478, "grad_norm": 0.7112192747153537, "learning_rate": 5.337790696151162e-08, "loss": 1.5337, "step": 4223 }, { "epoch": 0.8767123287671232, "grad_norm": 0.979106709730265, "learning_rate": 5.333354010510703e-08, "loss": 1.5374, "step": 4224 }, { "epoch": 0.8769198837691988, "grad_norm": 0.672210708302282, "learning_rate": 5.3289244110856456e-08, "loss": 1.4724, "step": 4225 }, { "epoch": 0.8771274387712744, "grad_norm": 0.7382489583601339, "learning_rate": 5.324501899759336e-08, "loss": 1.5052, "step": 4226 }, { "epoch": 0.8773349937733499, "grad_norm": 0.648877679246535, "learning_rate": 5.32008647841211e-08, "loss": 1.4578, "step": 4227 }, { "epoch": 0.8775425487754255, "grad_norm": 0.6891513737866265, "learning_rate": 5.315678148921284e-08, "loss": 1.5207, "step": 4228 }, { "epoch": 0.8777501037775011, "grad_norm": 0.6335163523907139, "learning_rate": 5.311276913161169e-08, "loss": 1.5482, "step": 4229 }, { "epoch": 0.8779576587795765, "grad_norm": 0.6845397792008311, "learning_rate": 5.306882773003047e-08, "loss": 1.462, "step": 4230 }, { "epoch": 0.8781652137816521, "grad_norm": 0.9797956229546194, "learning_rate": 5.302495730315196e-08, "loss": 1.4755, "step": 4231 }, { "epoch": 0.8783727687837277, "grad_norm": 2.4637326093339227, "learning_rate": 5.2981157869628646e-08, "loss": 1.5182, "step": 4232 }, { "epoch": 0.8785803237858032, "grad_norm": 0.6523987461597316, "learning_rate": 5.293742944808296e-08, "loss": 1.5461, "step": 4233 }, { "epoch": 0.8787878787878788, "grad_norm": 0.9102005775740785, "learning_rate": 5.2893772057106945e-08, "loss": 1.5329, "step": 4234 }, { "epoch": 0.8789954337899544, "grad_norm": 0.6878377862214508, "learning_rate": 5.285018571526266e-08, "loss": 1.4771, "step": 4235 }, { "epoch": 0.8792029887920298, "grad_norm": 1.0666386433823898, "learning_rate": 5.280667044108184e-08, "loss": 1.4533, "step": 4236 }, { "epoch": 0.8794105437941054, "grad_norm": 0.9152677055223459, "learning_rate": 5.276322625306601e-08, "loss": 1.5399, "step": 4237 }, { "epoch": 0.879618098796181, "grad_norm": 0.7648227800062127, "learning_rate": 5.271985316968647e-08, "loss": 1.5596, "step": 4238 }, { "epoch": 0.8798256537982565, "grad_norm": 0.7166525771998916, "learning_rate": 5.267655120938434e-08, "loss": 1.5551, "step": 4239 }, { "epoch": 0.8800332088003321, "grad_norm": 0.8567833446848079, "learning_rate": 5.263332039057048e-08, "loss": 1.5274, "step": 4240 }, { "epoch": 0.8802407638024077, "grad_norm": 0.759213438469565, "learning_rate": 5.259016073162541e-08, "loss": 1.5716, "step": 4241 }, { "epoch": 0.8804483188044832, "grad_norm": 0.9688616544910612, "learning_rate": 5.254707225089958e-08, "loss": 1.5298, "step": 4242 }, { "epoch": 0.8806558738065587, "grad_norm": 0.8737294001400681, "learning_rate": 5.250405496671296e-08, "loss": 1.463, "step": 4243 }, { "epoch": 0.8808634288086343, "grad_norm": 0.8053355337454994, "learning_rate": 5.246110889735541e-08, "loss": 1.4607, "step": 4244 }, { "epoch": 0.8810709838107098, "grad_norm": 1.295230301483268, "learning_rate": 5.2418234061086476e-08, "loss": 1.4871, "step": 4245 }, { "epoch": 0.8812785388127854, "grad_norm": 0.678528741831449, "learning_rate": 5.2375430476135374e-08, "loss": 1.4612, "step": 4246 }, { "epoch": 0.881486093814861, "grad_norm": 0.9622688739992937, "learning_rate": 5.2332698160701e-08, "loss": 1.4754, "step": 4247 }, { "epoch": 0.8816936488169365, "grad_norm": 0.7119298611054514, "learning_rate": 5.229003713295213e-08, "loss": 1.5235, "step": 4248 }, { "epoch": 0.881901203819012, "grad_norm": 1.05046913972188, "learning_rate": 5.224744741102697e-08, "loss": 1.5191, "step": 4249 }, { "epoch": 0.8821087588210876, "grad_norm": 0.7666164712001088, "learning_rate": 5.220492901303362e-08, "loss": 1.492, "step": 4250 }, { "epoch": 0.8823163138231631, "grad_norm": 0.7700859781958104, "learning_rate": 5.2162481957049754e-08, "loss": 1.4919, "step": 4251 }, { "epoch": 0.8825238688252387, "grad_norm": 0.7334359996695763, "learning_rate": 5.212010626112271e-08, "loss": 1.535, "step": 4252 }, { "epoch": 0.8827314238273143, "grad_norm": 0.7693179701976535, "learning_rate": 5.207780194326952e-08, "loss": 1.4861, "step": 4253 }, { "epoch": 0.8829389788293898, "grad_norm": 0.7190063652592567, "learning_rate": 5.2035569021476866e-08, "loss": 1.5472, "step": 4254 }, { "epoch": 0.8831465338314654, "grad_norm": 0.6821158850330902, "learning_rate": 5.199340751370107e-08, "loss": 1.4748, "step": 4255 }, { "epoch": 0.8833540888335409, "grad_norm": 1.5571151444271683, "learning_rate": 5.195131743786807e-08, "loss": 1.5677, "step": 4256 }, { "epoch": 0.8835616438356164, "grad_norm": 0.8142576940205603, "learning_rate": 5.1909298811873484e-08, "loss": 1.4703, "step": 4257 }, { "epoch": 0.883769198837692, "grad_norm": 0.7192879281080775, "learning_rate": 5.1867351653582434e-08, "loss": 1.4792, "step": 4258 }, { "epoch": 0.8839767538397676, "grad_norm": 0.8739303422913324, "learning_rate": 5.182547598082983e-08, "loss": 1.5629, "step": 4259 }, { "epoch": 0.8841843088418431, "grad_norm": 0.6737205626905679, "learning_rate": 5.178367181142007e-08, "loss": 1.5414, "step": 4260 }, { "epoch": 0.8843918638439187, "grad_norm": 0.7835482444865381, "learning_rate": 5.174193916312717e-08, "loss": 1.5058, "step": 4261 }, { "epoch": 0.8845994188459941, "grad_norm": 1.3818172229022785, "learning_rate": 5.1700278053694724e-08, "loss": 1.5235, "step": 4262 }, { "epoch": 0.8848069738480697, "grad_norm": 0.8367357269564156, "learning_rate": 5.1658688500835937e-08, "loss": 1.467, "step": 4263 }, { "epoch": 0.8850145288501453, "grad_norm": 0.7080073868049961, "learning_rate": 5.161717052223358e-08, "loss": 1.5468, "step": 4264 }, { "epoch": 0.8852220838522208, "grad_norm": 0.6422585614531965, "learning_rate": 5.1575724135540044e-08, "loss": 1.6077, "step": 4265 }, { "epoch": 0.8854296388542964, "grad_norm": 0.7018188776745039, "learning_rate": 5.153434935837717e-08, "loss": 1.5084, "step": 4266 }, { "epoch": 0.885637193856372, "grad_norm": 0.7996096745563676, "learning_rate": 5.1493046208336434e-08, "loss": 1.4675, "step": 4267 }, { "epoch": 0.8858447488584474, "grad_norm": 0.8817282853164217, "learning_rate": 5.145181470297888e-08, "loss": 1.4802, "step": 4268 }, { "epoch": 0.886052303860523, "grad_norm": 3.3772671493344615, "learning_rate": 5.141065485983497e-08, "loss": 1.4752, "step": 4269 }, { "epoch": 0.8862598588625986, "grad_norm": 0.8372472219496861, "learning_rate": 5.136956669640486e-08, "loss": 1.4755, "step": 4270 }, { "epoch": 0.8864674138646741, "grad_norm": 0.7506953327093302, "learning_rate": 5.132855023015809e-08, "loss": 1.5137, "step": 4271 }, { "epoch": 0.8866749688667497, "grad_norm": 0.8936346567465829, "learning_rate": 5.128760547853379e-08, "loss": 1.486, "step": 4272 }, { "epoch": 0.8868825238688253, "grad_norm": 1.296900704664816, "learning_rate": 5.124673245894058e-08, "loss": 1.4206, "step": 4273 }, { "epoch": 0.8870900788709007, "grad_norm": 0.8835573287414936, "learning_rate": 5.120593118875661e-08, "loss": 1.4375, "step": 4274 }, { "epoch": 0.8872976338729763, "grad_norm": 0.8481941939830817, "learning_rate": 5.116520168532942e-08, "loss": 1.525, "step": 4275 }, { "epoch": 0.8875051888750519, "grad_norm": 2.2552084923334887, "learning_rate": 5.11245439659762e-08, "loss": 1.4724, "step": 4276 }, { "epoch": 0.8877127438771274, "grad_norm": 0.8629327342298814, "learning_rate": 5.1083958047983486e-08, "loss": 1.5254, "step": 4277 }, { "epoch": 0.887920298879203, "grad_norm": 0.9498131123080511, "learning_rate": 5.104344394860736e-08, "loss": 1.5334, "step": 4278 }, { "epoch": 0.8881278538812786, "grad_norm": 1.7009220910789542, "learning_rate": 5.100300168507328e-08, "loss": 1.505, "step": 4279 }, { "epoch": 0.888335408883354, "grad_norm": 0.7809552615299162, "learning_rate": 5.096263127457631e-08, "loss": 1.5391, "step": 4280 }, { "epoch": 0.8885429638854296, "grad_norm": 0.7497246029049855, "learning_rate": 5.0922332734280836e-08, "loss": 1.4543, "step": 4281 }, { "epoch": 0.8887505188875052, "grad_norm": 0.782328456157103, "learning_rate": 5.0882106081320694e-08, "loss": 1.5103, "step": 4282 }, { "epoch": 0.8889580738895807, "grad_norm": 0.7978758306197025, "learning_rate": 5.084195133279927e-08, "loss": 1.5552, "step": 4283 }, { "epoch": 0.8891656288916563, "grad_norm": 1.1181287179920762, "learning_rate": 5.0801868505789205e-08, "loss": 1.5197, "step": 4284 }, { "epoch": 0.8893731838937319, "grad_norm": 0.6434283809211656, "learning_rate": 5.0761857617332705e-08, "loss": 1.5485, "step": 4285 }, { "epoch": 0.8895807388958074, "grad_norm": 0.8840134229109191, "learning_rate": 5.0721918684441356e-08, "loss": 1.5788, "step": 4286 }, { "epoch": 0.8897882938978829, "grad_norm": 0.6680170235180583, "learning_rate": 5.0682051724096084e-08, "loss": 1.5123, "step": 4287 }, { "epoch": 0.8899958488999585, "grad_norm": 0.6609811219082681, "learning_rate": 5.0642256753247256e-08, "loss": 1.5164, "step": 4288 }, { "epoch": 0.890203403902034, "grad_norm": 1.2699579751064243, "learning_rate": 5.060253378881474e-08, "loss": 1.5474, "step": 4289 }, { "epoch": 0.8904109589041096, "grad_norm": 0.946140842710006, "learning_rate": 5.056288284768753e-08, "loss": 1.5962, "step": 4290 }, { "epoch": 0.8906185139061852, "grad_norm": 0.8625508751684643, "learning_rate": 5.052330394672428e-08, "loss": 1.4837, "step": 4291 }, { "epoch": 0.8908260689082607, "grad_norm": 3.111788008305834, "learning_rate": 5.0483797102752834e-08, "loss": 1.5295, "step": 4292 }, { "epoch": 0.8910336239103362, "grad_norm": 0.8459879573160543, "learning_rate": 5.044436233257042e-08, "loss": 1.5793, "step": 4293 }, { "epoch": 0.8912411789124118, "grad_norm": 0.7554870079899078, "learning_rate": 5.0404999652943735e-08, "loss": 1.5324, "step": 4294 }, { "epoch": 0.8914487339144873, "grad_norm": 0.7717021565993143, "learning_rate": 5.036570908060871e-08, "loss": 1.515, "step": 4295 }, { "epoch": 0.8916562889165629, "grad_norm": 0.7529179131308154, "learning_rate": 5.0326490632270656e-08, "loss": 1.5593, "step": 4296 }, { "epoch": 0.8918638439186385, "grad_norm": 0.6936682708787681, "learning_rate": 5.028734432460418e-08, "loss": 1.5714, "step": 4297 }, { "epoch": 0.892071398920714, "grad_norm": 1.3458158128488977, "learning_rate": 5.024827017425331e-08, "loss": 1.5148, "step": 4298 }, { "epoch": 0.8922789539227896, "grad_norm": 0.7696684986420448, "learning_rate": 5.020926819783128e-08, "loss": 1.5205, "step": 4299 }, { "epoch": 0.8924865089248651, "grad_norm": 0.8115595098586696, "learning_rate": 5.0170338411920745e-08, "loss": 1.4317, "step": 4300 }, { "epoch": 0.8926940639269406, "grad_norm": 0.7534127642869912, "learning_rate": 5.01314808330736e-08, "loss": 1.5335, "step": 4301 }, { "epoch": 0.8929016189290162, "grad_norm": 0.8573846778963077, "learning_rate": 5.0092695477811057e-08, "loss": 1.6471, "step": 4302 }, { "epoch": 0.8931091739310918, "grad_norm": 1.4395934480749717, "learning_rate": 5.005398236262358e-08, "loss": 1.5053, "step": 4303 }, { "epoch": 0.8933167289331673, "grad_norm": 0.9472802638245992, "learning_rate": 5.001534150397101e-08, "loss": 1.5163, "step": 4304 }, { "epoch": 0.8935242839352429, "grad_norm": 1.284152263952106, "learning_rate": 4.9976772918282406e-08, "loss": 1.5548, "step": 4305 }, { "epoch": 0.8937318389373183, "grad_norm": 0.8089470550323055, "learning_rate": 4.99382766219561e-08, "loss": 1.4239, "step": 4306 }, { "epoch": 0.8939393939393939, "grad_norm": 0.7458479682006363, "learning_rate": 4.989985263135968e-08, "loss": 1.4975, "step": 4307 }, { "epoch": 0.8941469489414695, "grad_norm": 0.8121655497119886, "learning_rate": 4.9861500962830014e-08, "loss": 1.5591, "step": 4308 }, { "epoch": 0.894354503943545, "grad_norm": 0.8018688010174477, "learning_rate": 4.982322163267326e-08, "loss": 1.5427, "step": 4309 }, { "epoch": 0.8945620589456206, "grad_norm": 0.6794273813769126, "learning_rate": 4.978501465716468e-08, "loss": 1.5165, "step": 4310 }, { "epoch": 0.8947696139476962, "grad_norm": 0.725473907679013, "learning_rate": 4.9746880052548935e-08, "loss": 1.4709, "step": 4311 }, { "epoch": 0.8949771689497716, "grad_norm": 0.9676880458658392, "learning_rate": 4.970881783503982e-08, "loss": 1.589, "step": 4312 }, { "epoch": 0.8951847239518472, "grad_norm": 0.7389128274610071, "learning_rate": 4.967082802082041e-08, "loss": 1.581, "step": 4313 }, { "epoch": 0.8953922789539228, "grad_norm": 0.8186399866900692, "learning_rate": 4.96329106260429e-08, "loss": 1.5291, "step": 4314 }, { "epoch": 0.8955998339559983, "grad_norm": 1.6937377868442203, "learning_rate": 4.9595065666828864e-08, "loss": 1.5178, "step": 4315 }, { "epoch": 0.8958073889580739, "grad_norm": 1.1622595347845388, "learning_rate": 4.955729315926886e-08, "loss": 1.5696, "step": 4316 }, { "epoch": 0.8960149439601495, "grad_norm": 0.7733428171985816, "learning_rate": 4.9519593119422846e-08, "loss": 1.5337, "step": 4317 }, { "epoch": 0.896222498962225, "grad_norm": 0.8729457673286071, "learning_rate": 4.948196556331982e-08, "loss": 1.5659, "step": 4318 }, { "epoch": 0.8964300539643005, "grad_norm": 0.636443077605352, "learning_rate": 4.944441050695802e-08, "loss": 1.516, "step": 4319 }, { "epoch": 0.8966376089663761, "grad_norm": 0.8005755156950772, "learning_rate": 4.940692796630491e-08, "loss": 1.4816, "step": 4320 }, { "epoch": 0.8968451639684516, "grad_norm": 0.9880259238605373, "learning_rate": 4.936951795729704e-08, "loss": 1.5539, "step": 4321 }, { "epoch": 0.8970527189705272, "grad_norm": 0.692931295818655, "learning_rate": 4.9332180495840136e-08, "loss": 1.5093, "step": 4322 }, { "epoch": 0.8972602739726028, "grad_norm": 0.738273417602925, "learning_rate": 4.929491559780911e-08, "loss": 1.5313, "step": 4323 }, { "epoch": 0.8974678289746783, "grad_norm": 0.8776384646402227, "learning_rate": 4.925772327904805e-08, "loss": 1.5398, "step": 4324 }, { "epoch": 0.8976753839767538, "grad_norm": 1.7551359055823745, "learning_rate": 4.922060355537005e-08, "loss": 1.4829, "step": 4325 }, { "epoch": 0.8978829389788294, "grad_norm": 3.2203706962528575, "learning_rate": 4.918355644255752e-08, "loss": 1.529, "step": 4326 }, { "epoch": 0.8980904939809049, "grad_norm": 1.6691959832247925, "learning_rate": 4.9146581956361864e-08, "loss": 1.5196, "step": 4327 }, { "epoch": 0.8982980489829805, "grad_norm": 0.7695873416536831, "learning_rate": 4.910968011250366e-08, "loss": 1.5075, "step": 4328 }, { "epoch": 0.8985056039850561, "grad_norm": 0.7215491884709342, "learning_rate": 4.9072850926672564e-08, "loss": 1.4768, "step": 4329 }, { "epoch": 0.8987131589871316, "grad_norm": 13.009121681595024, "learning_rate": 4.9036094414527416e-08, "loss": 1.5481, "step": 4330 }, { "epoch": 0.8989207139892071, "grad_norm": 0.7662619366580208, "learning_rate": 4.899941059169611e-08, "loss": 1.5058, "step": 4331 }, { "epoch": 0.8991282689912827, "grad_norm": 0.9438494218201771, "learning_rate": 4.8962799473775576e-08, "loss": 1.5007, "step": 4332 }, { "epoch": 0.8993358239933582, "grad_norm": 1.8020124466171614, "learning_rate": 4.8926261076331954e-08, "loss": 1.443, "step": 4333 }, { "epoch": 0.8995433789954338, "grad_norm": 1.343314488193815, "learning_rate": 4.888979541490035e-08, "loss": 1.4764, "step": 4334 }, { "epoch": 0.8997509339975094, "grad_norm": 0.9158077720735733, "learning_rate": 4.8853402504985026e-08, "loss": 1.4863, "step": 4335 }, { "epoch": 0.8999584889995849, "grad_norm": 0.7047215744402243, "learning_rate": 4.88170823620593e-08, "loss": 1.5387, "step": 4336 }, { "epoch": 0.9001660440016604, "grad_norm": 0.6709317440385352, "learning_rate": 4.878083500156548e-08, "loss": 1.4327, "step": 4337 }, { "epoch": 0.900373599003736, "grad_norm": 0.8814860386970792, "learning_rate": 4.8744660438914985e-08, "loss": 1.4798, "step": 4338 }, { "epoch": 0.9005811540058115, "grad_norm": 0.7987821173935699, "learning_rate": 4.870855868948837e-08, "loss": 1.5077, "step": 4339 }, { "epoch": 0.9007887090078871, "grad_norm": 0.756473745359541, "learning_rate": 4.867252976863499e-08, "loss": 1.4111, "step": 4340 }, { "epoch": 0.9009962640099627, "grad_norm": 0.6532856202952518, "learning_rate": 4.863657369167351e-08, "loss": 1.4461, "step": 4341 }, { "epoch": 0.9012038190120382, "grad_norm": 0.6751058802893536, "learning_rate": 4.860069047389147e-08, "loss": 1.5752, "step": 4342 }, { "epoch": 0.9014113740141138, "grad_norm": 1.3548043307678506, "learning_rate": 4.856488013054543e-08, "loss": 1.482, "step": 4343 }, { "epoch": 0.9016189290161893, "grad_norm": 0.7146013599961456, "learning_rate": 4.852914267686099e-08, "loss": 1.4116, "step": 4344 }, { "epoch": 0.9018264840182648, "grad_norm": 0.6792004305762552, "learning_rate": 4.849347812803281e-08, "loss": 1.5328, "step": 4345 }, { "epoch": 0.9020340390203404, "grad_norm": 1.1622597544407174, "learning_rate": 4.8457886499224496e-08, "loss": 1.5107, "step": 4346 }, { "epoch": 0.902241594022416, "grad_norm": 0.7438104866212684, "learning_rate": 4.842236780556864e-08, "loss": 1.4275, "step": 4347 }, { "epoch": 0.9024491490244915, "grad_norm": 0.7361792801280723, "learning_rate": 4.838692206216692e-08, "loss": 1.4535, "step": 4348 }, { "epoch": 0.9026567040265671, "grad_norm": 0.7286256834292965, "learning_rate": 4.83515492840898e-08, "loss": 1.4551, "step": 4349 }, { "epoch": 0.9028642590286425, "grad_norm": 0.7656211505890863, "learning_rate": 4.8316249486377e-08, "loss": 1.5635, "step": 4350 }, { "epoch": 0.9030718140307181, "grad_norm": 0.6991044769892722, "learning_rate": 4.82810226840369e-08, "loss": 1.5301, "step": 4351 }, { "epoch": 0.9032793690327937, "grad_norm": 0.718697650781546, "learning_rate": 4.824586889204711e-08, "loss": 1.5897, "step": 4352 }, { "epoch": 0.9034869240348692, "grad_norm": 3.9182666062595737, "learning_rate": 4.8210788125354046e-08, "loss": 1.4579, "step": 4353 }, { "epoch": 0.9036944790369448, "grad_norm": 0.7970377583883338, "learning_rate": 4.8175780398873125e-08, "loss": 1.5459, "step": 4354 }, { "epoch": 0.9039020340390204, "grad_norm": 0.6824980370755335, "learning_rate": 4.814084572748869e-08, "loss": 1.3993, "step": 4355 }, { "epoch": 0.9041095890410958, "grad_norm": 0.7123105604691621, "learning_rate": 4.810598412605407e-08, "loss": 1.493, "step": 4356 }, { "epoch": 0.9043171440431714, "grad_norm": 1.7779741114233782, "learning_rate": 4.807119560939146e-08, "loss": 1.5502, "step": 4357 }, { "epoch": 0.904524699045247, "grad_norm": 0.7390264874137673, "learning_rate": 4.803648019229204e-08, "loss": 1.5403, "step": 4358 }, { "epoch": 0.9047322540473225, "grad_norm": 0.7988710148506181, "learning_rate": 4.8001837889515864e-08, "loss": 1.465, "step": 4359 }, { "epoch": 0.9049398090493981, "grad_norm": 0.8442705619661417, "learning_rate": 4.796726871579192e-08, "loss": 1.4121, "step": 4360 }, { "epoch": 0.9051473640514737, "grad_norm": 0.9624261441467818, "learning_rate": 4.793277268581811e-08, "loss": 1.4451, "step": 4361 }, { "epoch": 0.9053549190535491, "grad_norm": 0.9471352482548796, "learning_rate": 4.789834981426124e-08, "loss": 1.4793, "step": 4362 }, { "epoch": 0.9055624740556247, "grad_norm": 0.7201880730417008, "learning_rate": 4.7864000115757e-08, "loss": 1.4578, "step": 4363 }, { "epoch": 0.9057700290577003, "grad_norm": 0.786463621275674, "learning_rate": 4.782972360490992e-08, "loss": 1.5344, "step": 4364 }, { "epoch": 0.9059775840597758, "grad_norm": 0.7245361586970211, "learning_rate": 4.7795520296293553e-08, "loss": 1.5746, "step": 4365 }, { "epoch": 0.9061851390618514, "grad_norm": 1.2410605215183514, "learning_rate": 4.776139020445016e-08, "loss": 1.5514, "step": 4366 }, { "epoch": 0.906392694063927, "grad_norm": 0.7714388110442567, "learning_rate": 4.7727333343890974e-08, "loss": 1.5348, "step": 4367 }, { "epoch": 0.9066002490660025, "grad_norm": 0.8684083889947328, "learning_rate": 4.769334972909608e-08, "loss": 1.506, "step": 4368 }, { "epoch": 0.906807804068078, "grad_norm": 0.8549172423272855, "learning_rate": 4.765943937451439e-08, "loss": 1.483, "step": 4369 }, { "epoch": 0.9070153590701536, "grad_norm": 0.6870905683449732, "learning_rate": 4.762560229456369e-08, "loss": 1.5133, "step": 4370 }, { "epoch": 0.9072229140722291, "grad_norm": 0.8551743345917164, "learning_rate": 4.759183850363062e-08, "loss": 1.606, "step": 4371 }, { "epoch": 0.9074304690743047, "grad_norm": 1.6168826855365306, "learning_rate": 4.755814801607065e-08, "loss": 1.5089, "step": 4372 }, { "epoch": 0.9076380240763803, "grad_norm": 0.6422541228698613, "learning_rate": 4.752453084620806e-08, "loss": 1.5377, "step": 4373 }, { "epoch": 0.9078455790784558, "grad_norm": 0.7689861188126695, "learning_rate": 4.749098700833603e-08, "loss": 1.483, "step": 4374 }, { "epoch": 0.9080531340805313, "grad_norm": 0.6978198724548461, "learning_rate": 4.7457516516716414e-08, "loss": 1.5303, "step": 4375 }, { "epoch": 0.9082606890826069, "grad_norm": 1.0136882312727724, "learning_rate": 4.7424119385580055e-08, "loss": 1.5679, "step": 4376 }, { "epoch": 0.9084682440846824, "grad_norm": 0.6774693709160086, "learning_rate": 4.739079562912651e-08, "loss": 1.5668, "step": 4377 }, { "epoch": 0.908675799086758, "grad_norm": 1.1064660155775856, "learning_rate": 4.7357545261524166e-08, "loss": 1.5283, "step": 4378 }, { "epoch": 0.9088833540888336, "grad_norm": 0.727948921900572, "learning_rate": 4.732436829691015e-08, "loss": 1.5968, "step": 4379 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7389830990205652, "learning_rate": 4.729126474939049e-08, "loss": 1.5618, "step": 4380 }, { "epoch": 0.9092984640929846, "grad_norm": 0.7569380462722427, "learning_rate": 4.725823463303986e-08, "loss": 1.5163, "step": 4381 }, { "epoch": 0.9095060190950602, "grad_norm": 0.6847799619270005, "learning_rate": 4.7225277961901865e-08, "loss": 1.5, "step": 4382 }, { "epoch": 0.9097135740971357, "grad_norm": 0.9145135943101996, "learning_rate": 4.719239474998875e-08, "loss": 1.5592, "step": 4383 }, { "epoch": 0.9099211290992113, "grad_norm": 0.7878203473287412, "learning_rate": 4.7159585011281635e-08, "loss": 1.5378, "step": 4384 }, { "epoch": 0.9101286841012869, "grad_norm": 0.6200429902153209, "learning_rate": 4.712684875973028e-08, "loss": 1.4947, "step": 4385 }, { "epoch": 0.9103362391033624, "grad_norm": 1.3305272680240208, "learning_rate": 4.7094186009253336e-08, "loss": 1.5159, "step": 4386 }, { "epoch": 0.910543794105438, "grad_norm": 0.7071682533579644, "learning_rate": 4.706159677373812e-08, "loss": 1.4443, "step": 4387 }, { "epoch": 0.9107513491075135, "grad_norm": 1.8221901111293077, "learning_rate": 4.7029081067040666e-08, "loss": 1.5619, "step": 4388 }, { "epoch": 0.910958904109589, "grad_norm": 1.0410088942310942, "learning_rate": 4.6996638902985883e-08, "loss": 1.5018, "step": 4389 }, { "epoch": 0.9111664591116646, "grad_norm": 1.2034139004525068, "learning_rate": 4.696427029536721e-08, "loss": 1.4891, "step": 4390 }, { "epoch": 0.9113740141137402, "grad_norm": 0.7881662048489497, "learning_rate": 4.6931975257946985e-08, "loss": 1.5807, "step": 4391 }, { "epoch": 0.9115815691158157, "grad_norm": 1.8912080648551375, "learning_rate": 4.689975380445619e-08, "loss": 1.4143, "step": 4392 }, { "epoch": 0.9117891241178913, "grad_norm": 0.7055284395849184, "learning_rate": 4.686760594859454e-08, "loss": 1.5589, "step": 4393 }, { "epoch": 0.9119966791199668, "grad_norm": 1.0525802899971646, "learning_rate": 4.6835531704030396e-08, "loss": 1.5496, "step": 4394 }, { "epoch": 0.9122042341220423, "grad_norm": 0.6931283373747346, "learning_rate": 4.680353108440098e-08, "loss": 1.5093, "step": 4395 }, { "epoch": 0.9124117891241179, "grad_norm": 0.828222493701835, "learning_rate": 4.677160410331199e-08, "loss": 1.4444, "step": 4396 }, { "epoch": 0.9126193441261934, "grad_norm": 0.7121260738688048, "learning_rate": 4.6739750774338006e-08, "loss": 1.6235, "step": 4397 }, { "epoch": 0.912826899128269, "grad_norm": 0.8476768761382423, "learning_rate": 4.6707971111022194e-08, "loss": 1.5912, "step": 4398 }, { "epoch": 0.9130344541303446, "grad_norm": 2.139204798723291, "learning_rate": 4.6676265126876386e-08, "loss": 1.455, "step": 4399 }, { "epoch": 0.91324200913242, "grad_norm": 0.720492289256062, "learning_rate": 4.664463283538122e-08, "loss": 1.5401, "step": 4400 }, { "epoch": 0.9134495641344956, "grad_norm": 0.7742920956672075, "learning_rate": 4.661307424998579e-08, "loss": 1.5944, "step": 4401 }, { "epoch": 0.9136571191365712, "grad_norm": 1.2314237156426655, "learning_rate": 4.658158938410809e-08, "loss": 1.4757, "step": 4402 }, { "epoch": 0.9138646741386467, "grad_norm": 0.7277071162954373, "learning_rate": 4.6550178251134556e-08, "loss": 1.5239, "step": 4403 }, { "epoch": 0.9140722291407223, "grad_norm": 0.8245201853069499, "learning_rate": 4.6518840864420406e-08, "loss": 1.4398, "step": 4404 }, { "epoch": 0.9142797841427979, "grad_norm": 1.3045080232070827, "learning_rate": 4.6487577237289424e-08, "loss": 1.5627, "step": 4405 }, { "epoch": 0.9144873391448733, "grad_norm": 0.7077548303895802, "learning_rate": 4.645638738303416e-08, "loss": 1.5299, "step": 4406 }, { "epoch": 0.9146948941469489, "grad_norm": 0.8372044048656329, "learning_rate": 4.642527131491562e-08, "loss": 1.5105, "step": 4407 }, { "epoch": 0.9149024491490245, "grad_norm": 0.9066838762925182, "learning_rate": 4.6394229046163595e-08, "loss": 1.5328, "step": 4408 }, { "epoch": 0.9151100041511, "grad_norm": 0.7633254579902081, "learning_rate": 4.6363260589976394e-08, "loss": 1.5325, "step": 4409 }, { "epoch": 0.9153175591531756, "grad_norm": 0.9357551080452527, "learning_rate": 4.633236595952098e-08, "loss": 1.5307, "step": 4410 }, { "epoch": 0.9155251141552512, "grad_norm": 0.9034473550145689, "learning_rate": 4.630154516793299e-08, "loss": 1.5008, "step": 4411 }, { "epoch": 0.9157326691573267, "grad_norm": 0.7076322259730242, "learning_rate": 4.627079822831653e-08, "loss": 1.4441, "step": 4412 }, { "epoch": 0.9159402241594022, "grad_norm": 0.9026288025768455, "learning_rate": 4.624012515374444e-08, "loss": 1.4793, "step": 4413 }, { "epoch": 0.9161477791614778, "grad_norm": 0.6874945841040789, "learning_rate": 4.620952595725803e-08, "loss": 1.4911, "step": 4414 }, { "epoch": 0.9163553341635533, "grad_norm": 1.0048841346332247, "learning_rate": 4.617900065186737e-08, "loss": 1.509, "step": 4415 }, { "epoch": 0.9165628891656289, "grad_norm": 0.6583404609257811, "learning_rate": 4.614854925055089e-08, "loss": 1.5402, "step": 4416 }, { "epoch": 0.9167704441677045, "grad_norm": 1.6156209724642725, "learning_rate": 4.611817176625581e-08, "loss": 1.5041, "step": 4417 }, { "epoch": 0.91697799916978, "grad_norm": 6.318478167668886, "learning_rate": 4.6087868211897785e-08, "loss": 1.4936, "step": 4418 }, { "epoch": 0.9171855541718555, "grad_norm": 0.987265882766978, "learning_rate": 4.605763860036108e-08, "loss": 1.5105, "step": 4419 }, { "epoch": 0.9173931091739311, "grad_norm": 0.7562715301866854, "learning_rate": 4.602748294449855e-08, "loss": 1.5229, "step": 4420 }, { "epoch": 0.9176006641760066, "grad_norm": 0.7504194150683303, "learning_rate": 4.599740125713158e-08, "loss": 1.483, "step": 4421 }, { "epoch": 0.9178082191780822, "grad_norm": 0.650245880355937, "learning_rate": 4.596739355105005e-08, "loss": 1.5077, "step": 4422 }, { "epoch": 0.9180157741801578, "grad_norm": 1.2719317873896605, "learning_rate": 4.593745983901249e-08, "loss": 1.6539, "step": 4423 }, { "epoch": 0.9182233291822333, "grad_norm": 1.1612098878021526, "learning_rate": 4.590760013374593e-08, "loss": 1.5066, "step": 4424 }, { "epoch": 0.9184308841843088, "grad_norm": 0.7345640594038324, "learning_rate": 4.587781444794588e-08, "loss": 1.5099, "step": 4425 }, { "epoch": 0.9186384391863844, "grad_norm": 0.7000581237384536, "learning_rate": 4.5848102794276454e-08, "loss": 1.4999, "step": 4426 }, { "epoch": 0.9188459941884599, "grad_norm": 1.3712296747194503, "learning_rate": 4.581846518537025e-08, "loss": 1.4945, "step": 4427 }, { "epoch": 0.9190535491905355, "grad_norm": 1.0385679378460553, "learning_rate": 4.578890163382841e-08, "loss": 1.4996, "step": 4428 }, { "epoch": 0.9192611041926111, "grad_norm": 0.8330339988495031, "learning_rate": 4.575941215222051e-08, "loss": 1.5503, "step": 4429 }, { "epoch": 0.9194686591946866, "grad_norm": 0.8311479711107309, "learning_rate": 4.5729996753084796e-08, "loss": 1.5274, "step": 4430 }, { "epoch": 0.9196762141967622, "grad_norm": 1.044686514166469, "learning_rate": 4.5700655448927805e-08, "loss": 1.5151, "step": 4431 }, { "epoch": 0.9198837691988377, "grad_norm": 1.2782924304520138, "learning_rate": 4.567138825222475e-08, "loss": 1.5257, "step": 4432 }, { "epoch": 0.9200913242009132, "grad_norm": 2.04488419397859, "learning_rate": 4.564219517541926e-08, "loss": 1.532, "step": 4433 }, { "epoch": 0.9202988792029888, "grad_norm": 0.9057868866762059, "learning_rate": 4.561307623092343e-08, "loss": 1.5895, "step": 4434 }, { "epoch": 0.9205064342050644, "grad_norm": 0.8695854615889368, "learning_rate": 4.558403143111788e-08, "loss": 1.5115, "step": 4435 }, { "epoch": 0.9207139892071399, "grad_norm": 0.7957806638665895, "learning_rate": 4.5555060788351695e-08, "loss": 1.5068, "step": 4436 }, { "epoch": 0.9209215442092155, "grad_norm": 0.9864650291143578, "learning_rate": 4.552616431494241e-08, "loss": 1.5335, "step": 4437 }, { "epoch": 0.921129099211291, "grad_norm": 0.8536182415804855, "learning_rate": 4.549734202317604e-08, "loss": 1.499, "step": 4438 }, { "epoch": 0.9213366542133665, "grad_norm": 1.3897147528792686, "learning_rate": 4.5468593925307064e-08, "loss": 1.4869, "step": 4439 }, { "epoch": 0.9215442092154421, "grad_norm": 0.8138634818407828, "learning_rate": 4.5439920033558376e-08, "loss": 1.5216, "step": 4440 }, { "epoch": 0.9217517642175176, "grad_norm": 0.8327162883012469, "learning_rate": 4.541132036012145e-08, "loss": 1.5013, "step": 4441 }, { "epoch": 0.9219593192195932, "grad_norm": 0.7633244001951912, "learning_rate": 4.5382794917155963e-08, "loss": 1.3942, "step": 4442 }, { "epoch": 0.9221668742216688, "grad_norm": 1.2163544308978829, "learning_rate": 4.535434371679029e-08, "loss": 1.5024, "step": 4443 }, { "epoch": 0.9223744292237442, "grad_norm": 1.1302019272247883, "learning_rate": 4.532596677112111e-08, "loss": 1.4594, "step": 4444 }, { "epoch": 0.9225819842258198, "grad_norm": 0.7303077583103256, "learning_rate": 4.529766409221351e-08, "loss": 1.5442, "step": 4445 }, { "epoch": 0.9227895392278954, "grad_norm": 1.3541486344650249, "learning_rate": 4.526943569210102e-08, "loss": 1.4617, "step": 4446 }, { "epoch": 0.9229970942299709, "grad_norm": 0.9248102150018691, "learning_rate": 4.524128158278569e-08, "loss": 1.4724, "step": 4447 }, { "epoch": 0.9232046492320465, "grad_norm": 0.7525579147298637, "learning_rate": 4.521320177623784e-08, "loss": 1.5254, "step": 4448 }, { "epoch": 0.9234122042341221, "grad_norm": 2.0492512128914906, "learning_rate": 4.5185196284396255e-08, "loss": 1.4891, "step": 4449 }, { "epoch": 0.9236197592361975, "grad_norm": 1.1699401786400996, "learning_rate": 4.515726511916815e-08, "loss": 1.4862, "step": 4450 }, { "epoch": 0.9238273142382731, "grad_norm": 1.190228317672257, "learning_rate": 4.5129408292429097e-08, "loss": 1.4071, "step": 4451 }, { "epoch": 0.9240348692403487, "grad_norm": 1.228477886821952, "learning_rate": 4.510162581602309e-08, "loss": 1.5247, "step": 4452 }, { "epoch": 0.9242424242424242, "grad_norm": 1.2742643777187674, "learning_rate": 4.50739177017625e-08, "loss": 1.5843, "step": 4453 }, { "epoch": 0.9244499792444998, "grad_norm": 3.340894858996516, "learning_rate": 4.5046283961428095e-08, "loss": 1.506, "step": 4454 }, { "epoch": 0.9246575342465754, "grad_norm": 0.6847573499275801, "learning_rate": 4.501872460676895e-08, "loss": 1.5187, "step": 4455 }, { "epoch": 0.9248650892486509, "grad_norm": 1.9571099592762493, "learning_rate": 4.499123964950266e-08, "loss": 1.5183, "step": 4456 }, { "epoch": 0.9250726442507264, "grad_norm": 1.1258851354934087, "learning_rate": 4.496382910131502e-08, "loss": 1.5013, "step": 4457 }, { "epoch": 0.925280199252802, "grad_norm": 0.8065146095577503, "learning_rate": 4.493649297386033e-08, "loss": 1.5093, "step": 4458 }, { "epoch": 0.9254877542548775, "grad_norm": 0.6998701312752142, "learning_rate": 4.490923127876115e-08, "loss": 1.4497, "step": 4459 }, { "epoch": 0.9256953092569531, "grad_norm": 2.77959072017406, "learning_rate": 4.488204402760843e-08, "loss": 1.4958, "step": 4460 }, { "epoch": 0.9259028642590287, "grad_norm": 0.9388118863179752, "learning_rate": 4.485493123196144e-08, "loss": 1.5113, "step": 4461 }, { "epoch": 0.9261104192611042, "grad_norm": 2.2936421316010955, "learning_rate": 4.482789290334789e-08, "loss": 1.5426, "step": 4462 }, { "epoch": 0.9263179742631797, "grad_norm": 0.702620256173264, "learning_rate": 4.4800929053263714e-08, "loss": 1.4416, "step": 4463 }, { "epoch": 0.9265255292652553, "grad_norm": 0.6802808803940142, "learning_rate": 4.477403969317323e-08, "loss": 1.6134, "step": 4464 }, { "epoch": 0.9267330842673308, "grad_norm": 0.9739628418057668, "learning_rate": 4.47472248345091e-08, "loss": 1.4952, "step": 4465 }, { "epoch": 0.9269406392694064, "grad_norm": 0.9030463701204992, "learning_rate": 4.472048448867225e-08, "loss": 1.4775, "step": 4466 }, { "epoch": 0.927148194271482, "grad_norm": 0.6996844976075115, "learning_rate": 4.4693818667032e-08, "loss": 1.5166, "step": 4467 }, { "epoch": 0.9273557492735575, "grad_norm": 0.7571359355755798, "learning_rate": 4.4667227380925945e-08, "loss": 1.5114, "step": 4468 }, { "epoch": 0.927563304275633, "grad_norm": 0.6672144682852137, "learning_rate": 4.464071064165998e-08, "loss": 1.5223, "step": 4469 }, { "epoch": 0.9277708592777086, "grad_norm": 0.7942084696423082, "learning_rate": 4.461426846050831e-08, "loss": 1.4548, "step": 4470 }, { "epoch": 0.9279784142797841, "grad_norm": 0.658150520069637, "learning_rate": 4.45879008487135e-08, "loss": 1.507, "step": 4471 }, { "epoch": 0.9281859692818597, "grad_norm": 0.7082920604084778, "learning_rate": 4.4561607817486283e-08, "loss": 1.5786, "step": 4472 }, { "epoch": 0.9283935242839353, "grad_norm": 0.7296630743884838, "learning_rate": 4.453538937800581e-08, "loss": 1.4484, "step": 4473 }, { "epoch": 0.9286010792860108, "grad_norm": 0.7233965176401961, "learning_rate": 4.450924554141948e-08, "loss": 1.5375, "step": 4474 }, { "epoch": 0.9288086342880864, "grad_norm": 0.8294890166298233, "learning_rate": 4.448317631884292e-08, "loss": 1.5272, "step": 4475 }, { "epoch": 0.9290161892901619, "grad_norm": 0.7815812696781098, "learning_rate": 4.4457181721360046e-08, "loss": 1.5248, "step": 4476 }, { "epoch": 0.9292237442922374, "grad_norm": 0.699343076994957, "learning_rate": 4.4431261760023145e-08, "loss": 1.5114, "step": 4477 }, { "epoch": 0.929431299294313, "grad_norm": 0.7149120920009966, "learning_rate": 4.4405416445852646e-08, "loss": 1.5059, "step": 4478 }, { "epoch": 0.9296388542963886, "grad_norm": 1.4018707708059848, "learning_rate": 4.437964578983729e-08, "loss": 1.5493, "step": 4479 }, { "epoch": 0.9298464092984641, "grad_norm": 0.6541224093567589, "learning_rate": 4.4353949802934124e-08, "loss": 1.527, "step": 4480 }, { "epoch": 0.9300539643005397, "grad_norm": 0.7940169131847479, "learning_rate": 4.4328328496068323e-08, "loss": 1.5077, "step": 4481 }, { "epoch": 0.9302615193026152, "grad_norm": 1.3773760777592015, "learning_rate": 4.430278188013347e-08, "loss": 1.5272, "step": 4482 }, { "epoch": 0.9304690743046907, "grad_norm": 0.8855725936660176, "learning_rate": 4.4277309965991236e-08, "loss": 1.4773, "step": 4483 }, { "epoch": 0.9306766293067663, "grad_norm": 0.8090724338889203, "learning_rate": 4.4251912764471656e-08, "loss": 1.5352, "step": 4484 }, { "epoch": 0.9308841843088418, "grad_norm": 0.8313199205017122, "learning_rate": 4.422659028637291e-08, "loss": 1.4942, "step": 4485 }, { "epoch": 0.9310917393109174, "grad_norm": 1.8701926240939024, "learning_rate": 4.420134254246144e-08, "loss": 1.5, "step": 4486 }, { "epoch": 0.931299294312993, "grad_norm": 1.313129285023572, "learning_rate": 4.417616954347194e-08, "loss": 1.5197, "step": 4487 }, { "epoch": 0.9315068493150684, "grad_norm": 0.6029976800612497, "learning_rate": 4.4151071300107296e-08, "loss": 1.5494, "step": 4488 }, { "epoch": 0.931714404317144, "grad_norm": 0.7550787286907598, "learning_rate": 4.412604782303862e-08, "loss": 1.4717, "step": 4489 }, { "epoch": 0.9319219593192196, "grad_norm": 0.7139714846761307, "learning_rate": 4.410109912290521e-08, "loss": 1.4775, "step": 4490 }, { "epoch": 0.9321295143212951, "grad_norm": 0.901604077229633, "learning_rate": 4.407622521031462e-08, "loss": 1.434, "step": 4491 }, { "epoch": 0.9323370693233707, "grad_norm": 0.9259749545321699, "learning_rate": 4.405142609584253e-08, "loss": 1.5553, "step": 4492 }, { "epoch": 0.9325446243254463, "grad_norm": 1.0513330819349027, "learning_rate": 4.402670179003292e-08, "loss": 1.4863, "step": 4493 }, { "epoch": 0.9327521793275217, "grad_norm": 1.4548972607670645, "learning_rate": 4.4002052303397874e-08, "loss": 1.4719, "step": 4494 }, { "epoch": 0.9329597343295973, "grad_norm": 2.2831818349145996, "learning_rate": 4.3977477646417714e-08, "loss": 1.5404, "step": 4495 }, { "epoch": 0.9331672893316729, "grad_norm": 0.7028828157279985, "learning_rate": 4.395297782954091e-08, "loss": 1.4762, "step": 4496 }, { "epoch": 0.9333748443337484, "grad_norm": 0.7034628389845509, "learning_rate": 4.392855286318419e-08, "loss": 1.5368, "step": 4497 }, { "epoch": 0.933582399335824, "grad_norm": 0.8850500424659637, "learning_rate": 4.390420275773232e-08, "loss": 1.5443, "step": 4498 }, { "epoch": 0.9337899543378996, "grad_norm": 0.72343686302418, "learning_rate": 4.3879927523538366e-08, "loss": 1.5123, "step": 4499 }, { "epoch": 0.933997509339975, "grad_norm": 0.6702840746003527, "learning_rate": 4.385572717092353e-08, "loss": 1.5527, "step": 4500 }, { "epoch": 0.9342050643420506, "grad_norm": 0.8031675225917405, "learning_rate": 4.3831601710177126e-08, "loss": 1.4747, "step": 4501 }, { "epoch": 0.9344126193441262, "grad_norm": 0.721780908832118, "learning_rate": 4.380755115155666e-08, "loss": 1.5649, "step": 4502 }, { "epoch": 0.9346201743462017, "grad_norm": 0.7118293261803608, "learning_rate": 4.378357550528781e-08, "loss": 1.6267, "step": 4503 }, { "epoch": 0.9348277293482773, "grad_norm": 0.8200651442469405, "learning_rate": 4.375967478156437e-08, "loss": 1.4658, "step": 4504 }, { "epoch": 0.9350352843503529, "grad_norm": 0.7175265262449997, "learning_rate": 4.3735848990548274e-08, "loss": 1.5391, "step": 4505 }, { "epoch": 0.9352428393524284, "grad_norm": 0.7726362716001752, "learning_rate": 4.3712098142369694e-08, "loss": 1.5698, "step": 4506 }, { "epoch": 0.935450394354504, "grad_norm": 0.9930249508022962, "learning_rate": 4.368842224712677e-08, "loss": 1.564, "step": 4507 }, { "epoch": 0.9356579493565795, "grad_norm": 0.8468251701210507, "learning_rate": 4.36648213148859e-08, "loss": 1.5035, "step": 4508 }, { "epoch": 0.935865504358655, "grad_norm": 1.0619639715631535, "learning_rate": 4.364129535568159e-08, "loss": 1.4753, "step": 4509 }, { "epoch": 0.9360730593607306, "grad_norm": 0.7544660832297827, "learning_rate": 4.3617844379516424e-08, "loss": 1.5263, "step": 4510 }, { "epoch": 0.9362806143628062, "grad_norm": 0.7925913449770202, "learning_rate": 4.359446839636114e-08, "loss": 1.5363, "step": 4511 }, { "epoch": 0.9364881693648817, "grad_norm": 1.2563411978038899, "learning_rate": 4.357116741615463e-08, "loss": 1.5103, "step": 4512 }, { "epoch": 0.9366957243669572, "grad_norm": 1.205123315595704, "learning_rate": 4.3547941448803794e-08, "loss": 1.5272, "step": 4513 }, { "epoch": 0.9369032793690328, "grad_norm": 0.6445394889101358, "learning_rate": 4.3524790504183716e-08, "loss": 1.6343, "step": 4514 }, { "epoch": 0.9371108343711083, "grad_norm": 2.3546511600674465, "learning_rate": 4.3501714592137555e-08, "loss": 1.4781, "step": 4515 }, { "epoch": 0.9373183893731839, "grad_norm": 0.9086640031498722, "learning_rate": 4.3478713722476587e-08, "loss": 1.4756, "step": 4516 }, { "epoch": 0.9375259443752595, "grad_norm": 0.6822374602427158, "learning_rate": 4.345578790498019e-08, "loss": 1.5227, "step": 4517 }, { "epoch": 0.937733499377335, "grad_norm": 1.1028352204265481, "learning_rate": 4.3432937149395786e-08, "loss": 1.5268, "step": 4518 }, { "epoch": 0.9379410543794106, "grad_norm": 0.9282871089810995, "learning_rate": 4.341016146543892e-08, "loss": 1.4393, "step": 4519 }, { "epoch": 0.9381486093814861, "grad_norm": 0.7863922224590355, "learning_rate": 4.338746086279317e-08, "loss": 1.4801, "step": 4520 }, { "epoch": 0.9383561643835616, "grad_norm": 0.8555634570162212, "learning_rate": 4.336483535111032e-08, "loss": 1.5035, "step": 4521 }, { "epoch": 0.9385637193856372, "grad_norm": 1.6854804852988134, "learning_rate": 4.334228494001006e-08, "loss": 1.6103, "step": 4522 }, { "epoch": 0.9387712743877128, "grad_norm": 0.6385385269374086, "learning_rate": 4.331980963908024e-08, "loss": 1.5397, "step": 4523 }, { "epoch": 0.9389788293897883, "grad_norm": 0.9300061362751645, "learning_rate": 4.3297409457876784e-08, "loss": 1.5995, "step": 4524 }, { "epoch": 0.9391863843918639, "grad_norm": 0.9375377838751968, "learning_rate": 4.327508440592362e-08, "loss": 1.5737, "step": 4525 }, { "epoch": 0.9393939393939394, "grad_norm": 0.6737914930537501, "learning_rate": 4.325283449271279e-08, "loss": 1.5171, "step": 4526 }, { "epoch": 0.9396014943960149, "grad_norm": 1.136481574205765, "learning_rate": 4.323065972770438e-08, "loss": 1.5142, "step": 4527 }, { "epoch": 0.9398090493980905, "grad_norm": 1.0050799981727405, "learning_rate": 4.3208560120326475e-08, "loss": 1.4586, "step": 4528 }, { "epoch": 0.940016604400166, "grad_norm": 0.9607987975142752, "learning_rate": 4.318653567997527e-08, "loss": 1.5697, "step": 4529 }, { "epoch": 0.9402241594022416, "grad_norm": 1.0471143595993813, "learning_rate": 4.316458641601497e-08, "loss": 1.5214, "step": 4530 }, { "epoch": 0.9404317144043172, "grad_norm": 0.706593188312328, "learning_rate": 4.3142712337777806e-08, "loss": 1.552, "step": 4531 }, { "epoch": 0.9406392694063926, "grad_norm": 0.7029313367378158, "learning_rate": 4.3120913454564064e-08, "loss": 1.5417, "step": 4532 }, { "epoch": 0.9408468244084682, "grad_norm": 0.8704239289503982, "learning_rate": 4.3099189775642044e-08, "loss": 1.5143, "step": 4533 }, { "epoch": 0.9410543794105438, "grad_norm": 1.2689383914240466, "learning_rate": 4.307754131024808e-08, "loss": 1.5231, "step": 4534 }, { "epoch": 0.9412619344126193, "grad_norm": 0.9087786551018489, "learning_rate": 4.305596806758655e-08, "loss": 1.4596, "step": 4535 }, { "epoch": 0.9414694894146949, "grad_norm": 0.7299322704651798, "learning_rate": 4.3034470056829774e-08, "loss": 1.546, "step": 4536 }, { "epoch": 0.9416770444167705, "grad_norm": 0.837026096713149, "learning_rate": 4.301304728711815e-08, "loss": 1.5874, "step": 4537 }, { "epoch": 0.941884599418846, "grad_norm": 0.7498371578254125, "learning_rate": 4.299169976756013e-08, "loss": 1.4819, "step": 4538 }, { "epoch": 0.9420921544209215, "grad_norm": 0.9368841752996462, "learning_rate": 4.297042750723203e-08, "loss": 1.4899, "step": 4539 }, { "epoch": 0.9422997094229971, "grad_norm": 1.4710849738274496, "learning_rate": 4.294923051517828e-08, "loss": 1.578, "step": 4540 }, { "epoch": 0.9425072644250726, "grad_norm": 1.7118077346986422, "learning_rate": 4.2928108800411314e-08, "loss": 1.4544, "step": 4541 }, { "epoch": 0.9427148194271482, "grad_norm": 0.8717334193784907, "learning_rate": 4.2907062371911456e-08, "loss": 1.5114, "step": 4542 }, { "epoch": 0.9429223744292238, "grad_norm": 0.797151938995318, "learning_rate": 4.288609123862715e-08, "loss": 1.543, "step": 4543 }, { "epoch": 0.9431299294312993, "grad_norm": 0.6492360388504499, "learning_rate": 4.2865195409474765e-08, "loss": 1.5174, "step": 4544 }, { "epoch": 0.9433374844333748, "grad_norm": 0.8143199366541592, "learning_rate": 4.28443748933386e-08, "loss": 1.522, "step": 4545 }, { "epoch": 0.9435450394354504, "grad_norm": 0.8687909828584272, "learning_rate": 4.282362969907102e-08, "loss": 1.5406, "step": 4546 }, { "epoch": 0.9437525944375259, "grad_norm": 0.6432271038911301, "learning_rate": 4.2802959835492335e-08, "loss": 1.5629, "step": 4547 }, { "epoch": 0.9439601494396015, "grad_norm": 0.7500351491669647, "learning_rate": 4.278236531139082e-08, "loss": 1.5795, "step": 4548 }, { "epoch": 0.9441677044416771, "grad_norm": 0.9294157260063273, "learning_rate": 4.276184613552269e-08, "loss": 1.5078, "step": 4549 }, { "epoch": 0.9443752594437526, "grad_norm": 0.8764919926678213, "learning_rate": 4.2741402316612195e-08, "loss": 1.4858, "step": 4550 }, { "epoch": 0.9445828144458281, "grad_norm": 0.8546642247694494, "learning_rate": 4.2721033863351483e-08, "loss": 1.5149, "step": 4551 }, { "epoch": 0.9447903694479037, "grad_norm": 0.774177244202855, "learning_rate": 4.270074078440069e-08, "loss": 1.5055, "step": 4552 }, { "epoch": 0.9449979244499792, "grad_norm": 0.7604176349904409, "learning_rate": 4.2680523088387875e-08, "loss": 1.4702, "step": 4553 }, { "epoch": 0.9452054794520548, "grad_norm": 2.0082303625924713, "learning_rate": 4.266038078390908e-08, "loss": 1.461, "step": 4554 }, { "epoch": 0.9454130344541304, "grad_norm": 0.6675684174323314, "learning_rate": 4.26403138795283e-08, "loss": 1.4913, "step": 4555 }, { "epoch": 0.9456205894562059, "grad_norm": 0.7995072028384712, "learning_rate": 4.262032238377741e-08, "loss": 1.4671, "step": 4556 }, { "epoch": 0.9458281444582815, "grad_norm": 1.0244577318179149, "learning_rate": 4.2600406305156267e-08, "loss": 1.5108, "step": 4557 }, { "epoch": 0.946035699460357, "grad_norm": 0.7408491221435469, "learning_rate": 4.258056565213267e-08, "loss": 1.4884, "step": 4558 }, { "epoch": 0.9462432544624325, "grad_norm": 0.8713103891772991, "learning_rate": 4.256080043314235e-08, "loss": 1.4809, "step": 4559 }, { "epoch": 0.9464508094645081, "grad_norm": 0.6426473969151258, "learning_rate": 4.254111065658895e-08, "loss": 1.5272, "step": 4560 }, { "epoch": 0.9466583644665837, "grad_norm": 1.16716621572466, "learning_rate": 4.252149633084403e-08, "loss": 1.5701, "step": 4561 }, { "epoch": 0.9468659194686592, "grad_norm": 0.7637856511611366, "learning_rate": 4.250195746424709e-08, "loss": 1.4722, "step": 4562 }, { "epoch": 0.9470734744707348, "grad_norm": 0.9508379784710671, "learning_rate": 4.248249406510552e-08, "loss": 1.4911, "step": 4563 }, { "epoch": 0.9472810294728103, "grad_norm": 2.8244709877335006, "learning_rate": 4.2463106141694644e-08, "loss": 1.4978, "step": 4564 }, { "epoch": 0.9474885844748858, "grad_norm": 0.6451189998139304, "learning_rate": 4.2443793702257715e-08, "loss": 1.5645, "step": 4565 }, { "epoch": 0.9476961394769614, "grad_norm": 1.1247387387131438, "learning_rate": 4.242455675500585e-08, "loss": 1.6368, "step": 4566 }, { "epoch": 0.947903694479037, "grad_norm": 1.496196664764651, "learning_rate": 4.2405395308118086e-08, "loss": 1.5292, "step": 4567 }, { "epoch": 0.9481112494811125, "grad_norm": 0.9142829891268238, "learning_rate": 4.238630936974139e-08, "loss": 1.5708, "step": 4568 }, { "epoch": 0.9483188044831881, "grad_norm": 1.1393848253853893, "learning_rate": 4.236729894799056e-08, "loss": 1.4659, "step": 4569 }, { "epoch": 0.9485263594852636, "grad_norm": 0.7690035063731331, "learning_rate": 4.2348364050948344e-08, "loss": 1.516, "step": 4570 }, { "epoch": 0.9487339144873391, "grad_norm": 0.6911298818902055, "learning_rate": 4.232950468666533e-08, "loss": 1.5558, "step": 4571 }, { "epoch": 0.9489414694894147, "grad_norm": 1.5514773118523946, "learning_rate": 4.231072086316005e-08, "loss": 1.5528, "step": 4572 }, { "epoch": 0.9491490244914902, "grad_norm": 0.7423324589971126, "learning_rate": 4.2292012588418896e-08, "loss": 1.419, "step": 4573 }, { "epoch": 0.9493565794935658, "grad_norm": 0.6969655876415753, "learning_rate": 4.2273379870396084e-08, "loss": 1.5177, "step": 4574 }, { "epoch": 0.9495641344956414, "grad_norm": 0.6856667438061688, "learning_rate": 4.225482271701379e-08, "loss": 1.59, "step": 4575 }, { "epoch": 0.9497716894977168, "grad_norm": 0.8036229616953146, "learning_rate": 4.2236341136162017e-08, "loss": 1.4783, "step": 4576 }, { "epoch": 0.9499792444997924, "grad_norm": 1.3426688142416194, "learning_rate": 4.221793513569863e-08, "loss": 1.5896, "step": 4577 }, { "epoch": 0.950186799501868, "grad_norm": 0.6733771643306685, "learning_rate": 4.219960472344936e-08, "loss": 1.555, "step": 4578 }, { "epoch": 0.9503943545039435, "grad_norm": 0.6456767579538037, "learning_rate": 4.218134990720785e-08, "loss": 1.5328, "step": 4579 }, { "epoch": 0.9506019095060191, "grad_norm": 0.9374275557549357, "learning_rate": 4.216317069473555e-08, "loss": 1.4818, "step": 4580 }, { "epoch": 0.9508094645080947, "grad_norm": 0.6484423525678912, "learning_rate": 4.214506709376175e-08, "loss": 1.5129, "step": 4581 }, { "epoch": 0.9510170195101701, "grad_norm": 0.7737760479119955, "learning_rate": 4.212703911198366e-08, "loss": 1.5594, "step": 4582 }, { "epoch": 0.9512245745122457, "grad_norm": 0.7844084207844886, "learning_rate": 4.210908675706626e-08, "loss": 1.4957, "step": 4583 }, { "epoch": 0.9514321295143213, "grad_norm": 0.9262593197523248, "learning_rate": 4.209121003664245e-08, "loss": 1.5767, "step": 4584 }, { "epoch": 0.9516396845163968, "grad_norm": 0.7698081820293008, "learning_rate": 4.2073408958312926e-08, "loss": 1.5767, "step": 4585 }, { "epoch": 0.9518472395184724, "grad_norm": 0.6735198851412284, "learning_rate": 4.205568352964622e-08, "loss": 1.5023, "step": 4586 }, { "epoch": 0.952054794520548, "grad_norm": 0.8459595848873649, "learning_rate": 4.203803375817872e-08, "loss": 1.4842, "step": 4587 }, { "epoch": 0.9522623495226235, "grad_norm": 1.0155324317812409, "learning_rate": 4.202045965141468e-08, "loss": 1.6362, "step": 4588 }, { "epoch": 0.952469904524699, "grad_norm": 0.7090566499052219, "learning_rate": 4.200296121682606e-08, "loss": 1.5566, "step": 4589 }, { "epoch": 0.9526774595267746, "grad_norm": 0.9737052611134742, "learning_rate": 4.1985538461852796e-08, "loss": 1.4877, "step": 4590 }, { "epoch": 0.9528850145288501, "grad_norm": 1.0673626424193492, "learning_rate": 4.196819139390257e-08, "loss": 1.5156, "step": 4591 }, { "epoch": 0.9530925695309257, "grad_norm": 0.7054889880778412, "learning_rate": 4.195092002035089e-08, "loss": 1.4907, "step": 4592 }, { "epoch": 0.9533001245330013, "grad_norm": 0.6999754730041319, "learning_rate": 4.193372434854108e-08, "loss": 1.5677, "step": 4593 }, { "epoch": 0.9535076795350768, "grad_norm": 0.8279841744733754, "learning_rate": 4.191660438578428e-08, "loss": 1.6033, "step": 4594 }, { "epoch": 0.9537152345371523, "grad_norm": 1.3923924407940542, "learning_rate": 4.189956013935945e-08, "loss": 1.5126, "step": 4595 }, { "epoch": 0.9539227895392279, "grad_norm": 1.681141184342241, "learning_rate": 4.188259161651336e-08, "loss": 1.5012, "step": 4596 }, { "epoch": 0.9541303445413034, "grad_norm": 0.7265135910015387, "learning_rate": 4.186569882446057e-08, "loss": 1.4503, "step": 4597 }, { "epoch": 0.954337899543379, "grad_norm": 0.854786428605816, "learning_rate": 4.1848881770383405e-08, "loss": 1.471, "step": 4598 }, { "epoch": 0.9545454545454546, "grad_norm": 0.8148948557589987, "learning_rate": 4.183214046143212e-08, "loss": 1.5146, "step": 4599 }, { "epoch": 0.9547530095475301, "grad_norm": 0.7662688981036365, "learning_rate": 4.1815474904724586e-08, "loss": 1.4914, "step": 4600 }, { "epoch": 0.9549605645496057, "grad_norm": 0.837785386236163, "learning_rate": 4.1798885107346605e-08, "loss": 1.522, "step": 4601 }, { "epoch": 0.9551681195516812, "grad_norm": 0.8890482116200626, "learning_rate": 4.178237107635171e-08, "loss": 1.498, "step": 4602 }, { "epoch": 0.9553756745537567, "grad_norm": 0.8753034034433054, "learning_rate": 4.176593281876123e-08, "loss": 1.502, "step": 4603 }, { "epoch": 0.9555832295558323, "grad_norm": 0.6595086623139345, "learning_rate": 4.1749570341564245e-08, "loss": 1.4684, "step": 4604 }, { "epoch": 0.9557907845579079, "grad_norm": 1.0800578946187207, "learning_rate": 4.17332836517177e-08, "loss": 1.438, "step": 4605 }, { "epoch": 0.9559983395599834, "grad_norm": 0.6281388416977238, "learning_rate": 4.1717072756146225e-08, "loss": 1.5573, "step": 4606 }, { "epoch": 0.956205894562059, "grad_norm": 1.8534936102912238, "learning_rate": 4.170093766174226e-08, "loss": 1.4748, "step": 4607 }, { "epoch": 0.9564134495641345, "grad_norm": 0.7001395982790451, "learning_rate": 4.1684878375366025e-08, "loss": 1.5377, "step": 4608 }, { "epoch": 0.95662100456621, "grad_norm": 1.6047256182064333, "learning_rate": 4.1668894903845525e-08, "loss": 1.5357, "step": 4609 }, { "epoch": 0.9568285595682856, "grad_norm": 0.6796980319489333, "learning_rate": 4.165298725397648e-08, "loss": 1.5431, "step": 4610 }, { "epoch": 0.9570361145703612, "grad_norm": 1.2916547937961653, "learning_rate": 4.163715543252242e-08, "loss": 1.5561, "step": 4611 }, { "epoch": 0.9572436695724367, "grad_norm": 0.7948554214906115, "learning_rate": 4.162139944621461e-08, "loss": 1.477, "step": 4612 }, { "epoch": 0.9574512245745123, "grad_norm": 0.6937265303064192, "learning_rate": 4.160571930175206e-08, "loss": 1.5232, "step": 4613 }, { "epoch": 0.9576587795765878, "grad_norm": 0.8077156923953326, "learning_rate": 4.159011500580157e-08, "loss": 1.5153, "step": 4614 }, { "epoch": 0.9578663345786633, "grad_norm": 1.758448154174464, "learning_rate": 4.157458656499767e-08, "loss": 1.4906, "step": 4615 }, { "epoch": 0.9580738895807389, "grad_norm": 1.1439333501068285, "learning_rate": 4.1559133985942626e-08, "loss": 1.4249, "step": 4616 }, { "epoch": 0.9582814445828145, "grad_norm": 1.407703602097641, "learning_rate": 4.154375727520648e-08, "loss": 1.5138, "step": 4617 }, { "epoch": 0.95848899958489, "grad_norm": 1.0606155976012503, "learning_rate": 4.152845643932701e-08, "loss": 1.5539, "step": 4618 }, { "epoch": 0.9586965545869656, "grad_norm": 0.7122111604792619, "learning_rate": 4.151323148480968e-08, "loss": 1.5537, "step": 4619 }, { "epoch": 0.958904109589041, "grad_norm": 0.7859059654256734, "learning_rate": 4.149808241812781e-08, "loss": 1.5215, "step": 4620 }, { "epoch": 0.9591116645911166, "grad_norm": 0.8564996464193283, "learning_rate": 4.1483009245722314e-08, "loss": 1.462, "step": 4621 }, { "epoch": 0.9593192195931922, "grad_norm": 0.6311425774837578, "learning_rate": 4.1468011974001914e-08, "loss": 1.5691, "step": 4622 }, { "epoch": 0.9595267745952677, "grad_norm": 0.6202786189785873, "learning_rate": 4.145309060934312e-08, "loss": 1.446, "step": 4623 }, { "epoch": 0.9597343295973433, "grad_norm": 1.2288408414971486, "learning_rate": 4.1438245158089997e-08, "loss": 1.5343, "step": 4624 }, { "epoch": 0.9599418845994189, "grad_norm": 0.7311824696214257, "learning_rate": 4.142347562655451e-08, "loss": 1.4818, "step": 4625 }, { "epoch": 0.9601494396014943, "grad_norm": 1.0596396783461544, "learning_rate": 4.140878202101625e-08, "loss": 1.5183, "step": 4626 }, { "epoch": 0.9603569946035699, "grad_norm": 1.325654952200408, "learning_rate": 4.139416434772255e-08, "loss": 1.5279, "step": 4627 }, { "epoch": 0.9605645496056455, "grad_norm": 0.8300291401820142, "learning_rate": 4.1379622612888426e-08, "loss": 1.5105, "step": 4628 }, { "epoch": 0.960772104607721, "grad_norm": 0.9084762549121753, "learning_rate": 4.13651568226967e-08, "loss": 1.563, "step": 4629 }, { "epoch": 0.9609796596097966, "grad_norm": 1.089336144976623, "learning_rate": 4.135076698329779e-08, "loss": 1.48, "step": 4630 }, { "epoch": 0.9611872146118722, "grad_norm": 0.7102430393330588, "learning_rate": 4.1336453100809893e-08, "loss": 1.5693, "step": 4631 }, { "epoch": 0.9613947696139477, "grad_norm": 0.8714599631287424, "learning_rate": 4.132221518131891e-08, "loss": 1.4737, "step": 4632 }, { "epoch": 0.9616023246160232, "grad_norm": 0.7782564964660168, "learning_rate": 4.130805323087838e-08, "loss": 1.4724, "step": 4633 }, { "epoch": 0.9618098796180988, "grad_norm": 0.7500661734896757, "learning_rate": 4.1293967255509624e-08, "loss": 1.5445, "step": 4634 }, { "epoch": 0.9620174346201743, "grad_norm": 0.8544078344655154, "learning_rate": 4.1279957261201614e-08, "loss": 1.5287, "step": 4635 }, { "epoch": 0.9622249896222499, "grad_norm": 0.7329942189821124, "learning_rate": 4.1266023253911034e-08, "loss": 1.5819, "step": 4636 }, { "epoch": 0.9624325446243255, "grad_norm": 0.9364016938946021, "learning_rate": 4.125216523956224e-08, "loss": 1.5064, "step": 4637 }, { "epoch": 0.962640099626401, "grad_norm": 0.6864980539580976, "learning_rate": 4.123838322404731e-08, "loss": 1.5072, "step": 4638 }, { "epoch": 0.9628476546284765, "grad_norm": 1.299292949039472, "learning_rate": 4.1224677213225986e-08, "loss": 1.562, "step": 4639 }, { "epoch": 0.9630552096305521, "grad_norm": 1.077368512042783, "learning_rate": 4.121104721292569e-08, "loss": 1.5629, "step": 4640 }, { "epoch": 0.9632627646326276, "grad_norm": 0.8967764752786451, "learning_rate": 4.119749322894154e-08, "loss": 1.4243, "step": 4641 }, { "epoch": 0.9634703196347032, "grad_norm": 0.6675107737060751, "learning_rate": 4.1184015267036336e-08, "loss": 1.5298, "step": 4642 }, { "epoch": 0.9636778746367788, "grad_norm": 0.6756480050196306, "learning_rate": 4.117061333294053e-08, "loss": 1.5958, "step": 4643 }, { "epoch": 0.9638854296388543, "grad_norm": 0.7054910490034481, "learning_rate": 4.11572874323523e-08, "loss": 1.5472, "step": 4644 }, { "epoch": 0.9640929846409299, "grad_norm": 0.7840653708980738, "learning_rate": 4.114403757093744e-08, "loss": 1.5807, "step": 4645 }, { "epoch": 0.9643005396430054, "grad_norm": 0.6934424233209485, "learning_rate": 4.113086375432947e-08, "loss": 1.5376, "step": 4646 }, { "epoch": 0.9645080946450809, "grad_norm": 0.7877721217863387, "learning_rate": 4.111776598812951e-08, "loss": 1.5276, "step": 4647 }, { "epoch": 0.9647156496471565, "grad_norm": 0.7454271418523588, "learning_rate": 4.110474427790641e-08, "loss": 1.484, "step": 4648 }, { "epoch": 0.9649232046492321, "grad_norm": 0.744986205741643, "learning_rate": 4.109179862919663e-08, "loss": 1.5905, "step": 4649 }, { "epoch": 0.9651307596513076, "grad_norm": 0.7004663036656038, "learning_rate": 4.1078929047504335e-08, "loss": 1.4938, "step": 4650 }, { "epoch": 0.9653383146533832, "grad_norm": 1.0713546895606605, "learning_rate": 4.1066135538301335e-08, "loss": 1.5061, "step": 4651 }, { "epoch": 0.9655458696554587, "grad_norm": 1.566737592602583, "learning_rate": 4.1053418107027064e-08, "loss": 1.5567, "step": 4652 }, { "epoch": 0.9657534246575342, "grad_norm": 0.8376640221052621, "learning_rate": 4.104077675908867e-08, "loss": 1.5295, "step": 4653 }, { "epoch": 0.9659609796596098, "grad_norm": 1.0867380990412316, "learning_rate": 4.102821149986086e-08, "loss": 1.5059, "step": 4654 }, { "epoch": 0.9661685346616854, "grad_norm": 2.167915928889979, "learning_rate": 4.101572233468614e-08, "loss": 1.5103, "step": 4655 }, { "epoch": 0.9663760896637609, "grad_norm": 0.7660759611693861, "learning_rate": 4.100330926887451e-08, "loss": 1.5275, "step": 4656 }, { "epoch": 0.9665836446658365, "grad_norm": 1.2109957192797038, "learning_rate": 4.099097230770366e-08, "loss": 1.4999, "step": 4657 }, { "epoch": 0.966791199667912, "grad_norm": 0.8004370859145361, "learning_rate": 4.097871145641899e-08, "loss": 1.5075, "step": 4658 }, { "epoch": 0.9669987546699875, "grad_norm": 0.8864657515165125, "learning_rate": 4.096652672023344e-08, "loss": 1.4705, "step": 4659 }, { "epoch": 0.9672063096720631, "grad_norm": 0.856026531066869, "learning_rate": 4.095441810432769e-08, "loss": 1.5136, "step": 4660 }, { "epoch": 0.9674138646741387, "grad_norm": 0.6244707290968177, "learning_rate": 4.094238561384995e-08, "loss": 1.5085, "step": 4661 }, { "epoch": 0.9676214196762142, "grad_norm": 0.7320728434558134, "learning_rate": 4.093042925391615e-08, "loss": 1.5445, "step": 4662 }, { "epoch": 0.9678289746782898, "grad_norm": 1.1710369472762623, "learning_rate": 4.091854902960979e-08, "loss": 1.5024, "step": 4663 }, { "epoch": 0.9680365296803652, "grad_norm": 0.6828462775661966, "learning_rate": 4.090674494598206e-08, "loss": 1.5456, "step": 4664 }, { "epoch": 0.9682440846824408, "grad_norm": 0.9390340505547526, "learning_rate": 4.0895017008051715e-08, "loss": 1.4355, "step": 4665 }, { "epoch": 0.9684516396845164, "grad_norm": 0.7522428400905802, "learning_rate": 4.088336522080517e-08, "loss": 1.6443, "step": 4666 }, { "epoch": 0.9686591946865919, "grad_norm": 0.8685731677402596, "learning_rate": 4.087178958919646e-08, "loss": 1.5257, "step": 4667 }, { "epoch": 0.9688667496886675, "grad_norm": 0.8520921792331272, "learning_rate": 4.086029011814722e-08, "loss": 1.564, "step": 4668 }, { "epoch": 0.9690743046907431, "grad_norm": 0.8267511840783619, "learning_rate": 4.084886681254676e-08, "loss": 1.5538, "step": 4669 }, { "epoch": 0.9692818596928185, "grad_norm": 0.7229289608801082, "learning_rate": 4.0837519677251917e-08, "loss": 1.4945, "step": 4670 }, { "epoch": 0.9694894146948941, "grad_norm": 0.7277180439821617, "learning_rate": 4.082624871708722e-08, "loss": 1.4949, "step": 4671 }, { "epoch": 0.9696969696969697, "grad_norm": 0.9480371814541704, "learning_rate": 4.0815053936844776e-08, "loss": 1.4791, "step": 4672 }, { "epoch": 0.9699045246990452, "grad_norm": 4.645480242387703, "learning_rate": 4.080393534128431e-08, "loss": 1.5323, "step": 4673 }, { "epoch": 0.9701120797011208, "grad_norm": 0.8358385530189513, "learning_rate": 4.079289293513315e-08, "loss": 1.574, "step": 4674 }, { "epoch": 0.9703196347031964, "grad_norm": 1.9166545887255289, "learning_rate": 4.078192672308623e-08, "loss": 1.5657, "step": 4675 }, { "epoch": 0.9705271897052719, "grad_norm": 1.5631358596632146, "learning_rate": 4.077103670980611e-08, "loss": 1.4578, "step": 4676 }, { "epoch": 0.9707347447073474, "grad_norm": 1.0245883091076342, "learning_rate": 4.07602228999229e-08, "loss": 1.5085, "step": 4677 }, { "epoch": 0.970942299709423, "grad_norm": 0.8479210791056112, "learning_rate": 4.0749485298034395e-08, "loss": 1.4895, "step": 4678 }, { "epoch": 0.9711498547114985, "grad_norm": 0.7161365790318167, "learning_rate": 4.07388239087059e-08, "loss": 1.4698, "step": 4679 }, { "epoch": 0.9713574097135741, "grad_norm": 0.982805133534909, "learning_rate": 4.072823873647037e-08, "loss": 1.5524, "step": 4680 }, { "epoch": 0.9715649647156497, "grad_norm": 1.6696359927983846, "learning_rate": 4.071772978582831e-08, "loss": 1.5761, "step": 4681 }, { "epoch": 0.9717725197177252, "grad_norm": 1.1304925351904684, "learning_rate": 4.0707297061247894e-08, "loss": 1.4797, "step": 4682 }, { "epoch": 0.9719800747198007, "grad_norm": 0.974033859429217, "learning_rate": 4.0696940567164815e-08, "loss": 1.4515, "step": 4683 }, { "epoch": 0.9721876297218763, "grad_norm": 0.9018628731043778, "learning_rate": 4.068666030798237e-08, "loss": 1.586, "step": 4684 }, { "epoch": 0.9723951847239518, "grad_norm": 0.989033216063998, "learning_rate": 4.0676456288071484e-08, "loss": 1.4983, "step": 4685 }, { "epoch": 0.9726027397260274, "grad_norm": 0.6397991257323977, "learning_rate": 4.066632851177059e-08, "loss": 1.485, "step": 4686 }, { "epoch": 0.972810294728103, "grad_norm": 0.7267750542797208, "learning_rate": 4.065627698338581e-08, "loss": 1.5025, "step": 4687 }, { "epoch": 0.9730178497301785, "grad_norm": 0.9942896945184911, "learning_rate": 4.0646301707190725e-08, "loss": 1.5947, "step": 4688 }, { "epoch": 0.973225404732254, "grad_norm": 1.204562693698375, "learning_rate": 4.063640268742657e-08, "loss": 1.4865, "step": 4689 }, { "epoch": 0.9734329597343296, "grad_norm": 0.7031025269493411, "learning_rate": 4.0626579928302184e-08, "loss": 1.4158, "step": 4690 }, { "epoch": 0.9736405147364051, "grad_norm": 0.8515389410367321, "learning_rate": 4.0616833433993916e-08, "loss": 1.5084, "step": 4691 }, { "epoch": 0.9738480697384807, "grad_norm": 0.921565007136761, "learning_rate": 4.060716320864572e-08, "loss": 1.5712, "step": 4692 }, { "epoch": 0.9740556247405563, "grad_norm": 0.677451922128275, "learning_rate": 4.059756925636912e-08, "loss": 1.5606, "step": 4693 }, { "epoch": 0.9742631797426318, "grad_norm": 0.860466881581266, "learning_rate": 4.058805158124321e-08, "loss": 1.459, "step": 4694 }, { "epoch": 0.9744707347447074, "grad_norm": 0.6859805359874132, "learning_rate": 4.057861018731464e-08, "loss": 1.5478, "step": 4695 }, { "epoch": 0.9746782897467829, "grad_norm": 1.4477060233263033, "learning_rate": 4.056924507859765e-08, "loss": 1.4988, "step": 4696 }, { "epoch": 0.9748858447488584, "grad_norm": 1.101620088514572, "learning_rate": 4.055995625907405e-08, "loss": 1.4904, "step": 4697 }, { "epoch": 0.975093399750934, "grad_norm": 1.1776161363871063, "learning_rate": 4.055074373269318e-08, "loss": 1.5173, "step": 4698 }, { "epoch": 0.9753009547530096, "grad_norm": 1.0307666290948483, "learning_rate": 4.054160750337196e-08, "loss": 1.4563, "step": 4699 }, { "epoch": 0.9755085097550851, "grad_norm": 0.7362611680560055, "learning_rate": 4.05325475749949e-08, "loss": 1.4868, "step": 4700 }, { "epoch": 0.9757160647571607, "grad_norm": 0.811619979400467, "learning_rate": 4.0523563951414e-08, "loss": 1.5298, "step": 4701 }, { "epoch": 0.9759236197592362, "grad_norm": 0.6944963343845532, "learning_rate": 4.051465663644888e-08, "loss": 1.4459, "step": 4702 }, { "epoch": 0.9761311747613117, "grad_norm": 0.6619802049420035, "learning_rate": 4.050582563388671e-08, "loss": 1.5209, "step": 4703 }, { "epoch": 0.9763387297633873, "grad_norm": 0.7613884963535807, "learning_rate": 4.049707094748217e-08, "loss": 1.4893, "step": 4704 }, { "epoch": 0.9765462847654629, "grad_norm": 1.6403748813229346, "learning_rate": 4.048839258095754e-08, "loss": 1.4914, "step": 4705 }, { "epoch": 0.9767538397675384, "grad_norm": 3.1261298282470413, "learning_rate": 4.047979053800262e-08, "loss": 1.5275, "step": 4706 }, { "epoch": 0.976961394769614, "grad_norm": 1.21190380898979, "learning_rate": 4.0471264822274773e-08, "loss": 1.5578, "step": 4707 }, { "epoch": 0.9771689497716894, "grad_norm": 0.7222124017937313, "learning_rate": 4.0462815437398894e-08, "loss": 1.492, "step": 4708 }, { "epoch": 0.977376504773765, "grad_norm": 0.6759149568725001, "learning_rate": 4.045444238696746e-08, "loss": 1.5949, "step": 4709 }, { "epoch": 0.9775840597758406, "grad_norm": 0.8984520709168742, "learning_rate": 4.044614567454046e-08, "loss": 1.4678, "step": 4710 }, { "epoch": 0.9777916147779161, "grad_norm": 0.6788739707070109, "learning_rate": 4.043792530364543e-08, "loss": 1.5045, "step": 4711 }, { "epoch": 0.9779991697799917, "grad_norm": 1.1852897891125689, "learning_rate": 4.0429781277777465e-08, "loss": 1.3628, "step": 4712 }, { "epoch": 0.9782067247820673, "grad_norm": 0.6682802264781241, "learning_rate": 4.0421713600399195e-08, "loss": 1.5119, "step": 4713 }, { "epoch": 0.9784142797841427, "grad_norm": 0.8759680400164908, "learning_rate": 4.0413722274940745e-08, "loss": 1.5006, "step": 4714 }, { "epoch": 0.9786218347862183, "grad_norm": 0.7496164249162688, "learning_rate": 4.040580730479984e-08, "loss": 1.5238, "step": 4715 }, { "epoch": 0.9788293897882939, "grad_norm": 1.1257328583630644, "learning_rate": 4.039796869334172e-08, "loss": 1.5144, "step": 4716 }, { "epoch": 0.9790369447903694, "grad_norm": 0.74059874591586, "learning_rate": 4.0390206443899156e-08, "loss": 1.5735, "step": 4717 }, { "epoch": 0.979244499792445, "grad_norm": 0.643683515685995, "learning_rate": 4.0382520559772454e-08, "loss": 1.5653, "step": 4718 }, { "epoch": 0.9794520547945206, "grad_norm": 0.7923047515581395, "learning_rate": 4.037491104422941e-08, "loss": 1.5341, "step": 4719 }, { "epoch": 0.979659609796596, "grad_norm": 0.7166449522443827, "learning_rate": 4.0367377900505434e-08, "loss": 1.532, "step": 4720 }, { "epoch": 0.9798671647986716, "grad_norm": 1.201917632385006, "learning_rate": 4.0359921131803386e-08, "loss": 1.5161, "step": 4721 }, { "epoch": 0.9800747198007472, "grad_norm": 0.7587177346956757, "learning_rate": 4.035254074129371e-08, "loss": 1.5741, "step": 4722 }, { "epoch": 0.9802822748028227, "grad_norm": 2.127737003348551, "learning_rate": 4.034523673211434e-08, "loss": 1.5328, "step": 4723 }, { "epoch": 0.9804898298048983, "grad_norm": 0.8185401955680965, "learning_rate": 4.033800910737075e-08, "loss": 1.4286, "step": 4724 }, { "epoch": 0.9806973848069739, "grad_norm": 0.828920286576002, "learning_rate": 4.033085787013591e-08, "loss": 1.5209, "step": 4725 }, { "epoch": 0.9809049398090494, "grad_norm": 0.8453287934918091, "learning_rate": 4.0323783023450396e-08, "loss": 1.4562, "step": 4726 }, { "epoch": 0.981112494811125, "grad_norm": 0.6952283768584352, "learning_rate": 4.031678457032218e-08, "loss": 1.5043, "step": 4727 }, { "epoch": 0.9813200498132005, "grad_norm": 1.2897209690624207, "learning_rate": 4.030986251372687e-08, "loss": 1.4461, "step": 4728 }, { "epoch": 0.981527604815276, "grad_norm": 1.0469843352990331, "learning_rate": 4.0303016856607495e-08, "loss": 1.4823, "step": 4729 }, { "epoch": 0.9817351598173516, "grad_norm": 0.9265847389795577, "learning_rate": 4.029624760187468e-08, "loss": 1.5008, "step": 4730 }, { "epoch": 0.9819427148194272, "grad_norm": 0.7520590136058093, "learning_rate": 4.028955475240653e-08, "loss": 1.4532, "step": 4731 }, { "epoch": 0.9821502698215027, "grad_norm": 0.8392658881370643, "learning_rate": 4.028293831104865e-08, "loss": 1.548, "step": 4732 }, { "epoch": 0.9823578248235783, "grad_norm": 0.7241276603863792, "learning_rate": 4.027639828061418e-08, "loss": 1.4409, "step": 4733 }, { "epoch": 0.9825653798256538, "grad_norm": 0.946193601502734, "learning_rate": 4.026993466388377e-08, "loss": 1.4558, "step": 4734 }, { "epoch": 0.9827729348277293, "grad_norm": 0.9661476942971086, "learning_rate": 4.026354746360558e-08, "loss": 1.5323, "step": 4735 }, { "epoch": 0.9829804898298049, "grad_norm": 1.468427866731725, "learning_rate": 4.0257236682495285e-08, "loss": 1.5041, "step": 4736 }, { "epoch": 0.9831880448318805, "grad_norm": 0.6742847505585995, "learning_rate": 4.025100232323605e-08, "loss": 1.5136, "step": 4737 }, { "epoch": 0.983395599833956, "grad_norm": 1.0342345747341286, "learning_rate": 4.024484438847856e-08, "loss": 1.5696, "step": 4738 }, { "epoch": 0.9836031548360316, "grad_norm": 2.1248841534357905, "learning_rate": 4.0238762880841e-08, "loss": 1.4973, "step": 4739 }, { "epoch": 0.9838107098381071, "grad_norm": 0.717429367986153, "learning_rate": 4.023275780290908e-08, "loss": 1.4209, "step": 4740 }, { "epoch": 0.9840182648401826, "grad_norm": 0.7871185432417753, "learning_rate": 4.022682915723599e-08, "loss": 1.5679, "step": 4741 }, { "epoch": 0.9842258198422582, "grad_norm": 0.8522536893074967, "learning_rate": 4.0220976946342444e-08, "loss": 1.4394, "step": 4742 }, { "epoch": 0.9844333748443338, "grad_norm": 0.8234984704094376, "learning_rate": 4.0215201172716636e-08, "loss": 1.5065, "step": 4743 }, { "epoch": 0.9846409298464093, "grad_norm": 0.6869708368717611, "learning_rate": 4.0209501838814276e-08, "loss": 1.4532, "step": 4744 }, { "epoch": 0.9848484848484849, "grad_norm": 0.7509564992042418, "learning_rate": 4.0203878947058566e-08, "loss": 1.6183, "step": 4745 }, { "epoch": 0.9850560398505604, "grad_norm": 1.0947761123950381, "learning_rate": 4.0198332499840224e-08, "loss": 1.5381, "step": 4746 }, { "epoch": 0.9852635948526359, "grad_norm": 0.8684484861142557, "learning_rate": 4.01928624995174e-08, "loss": 1.5029, "step": 4747 }, { "epoch": 0.9854711498547115, "grad_norm": 0.6835330015666372, "learning_rate": 4.018746894841589e-08, "loss": 1.4818, "step": 4748 }, { "epoch": 0.9856787048567871, "grad_norm": 0.6886924674526166, "learning_rate": 4.0182151848828783e-08, "loss": 1.5899, "step": 4749 }, { "epoch": 0.9858862598588626, "grad_norm": 1.1383021939772826, "learning_rate": 4.0176911203016855e-08, "loss": 1.5909, "step": 4750 }, { "epoch": 0.9860938148609382, "grad_norm": 0.645991041376154, "learning_rate": 4.0171747013208234e-08, "loss": 1.4994, "step": 4751 }, { "epoch": 0.9863013698630136, "grad_norm": 0.8743987096193, "learning_rate": 4.0166659281598603e-08, "loss": 1.5412, "step": 4752 }, { "epoch": 0.9865089248650892, "grad_norm": 1.0354994519211387, "learning_rate": 4.016164801035116e-08, "loss": 1.5045, "step": 4753 }, { "epoch": 0.9867164798671648, "grad_norm": 1.5615867852372607, "learning_rate": 4.0156713201596526e-08, "loss": 1.5393, "step": 4754 }, { "epoch": 0.9869240348692403, "grad_norm": 1.4601826423134665, "learning_rate": 4.015185485743289e-08, "loss": 1.595, "step": 4755 }, { "epoch": 0.9871315898713159, "grad_norm": 0.659440547783532, "learning_rate": 4.0147072979925864e-08, "loss": 1.5497, "step": 4756 }, { "epoch": 0.9873391448733915, "grad_norm": 0.7084580914786018, "learning_rate": 4.014236757110858e-08, "loss": 1.5256, "step": 4757 }, { "epoch": 0.987546699875467, "grad_norm": 0.7138896654617318, "learning_rate": 4.0137738632981674e-08, "loss": 1.4459, "step": 4758 }, { "epoch": 0.9877542548775425, "grad_norm": 0.6409615288214876, "learning_rate": 4.013318616751322e-08, "loss": 1.5234, "step": 4759 }, { "epoch": 0.9879618098796181, "grad_norm": 1.050531096764086, "learning_rate": 4.0128710176638817e-08, "loss": 1.5938, "step": 4760 }, { "epoch": 0.9881693648816936, "grad_norm": 0.795066709337219, "learning_rate": 4.0124310662261526e-08, "loss": 1.539, "step": 4761 }, { "epoch": 0.9883769198837692, "grad_norm": 1.4681377456511795, "learning_rate": 4.011998762625192e-08, "loss": 1.4879, "step": 4762 }, { "epoch": 0.9885844748858448, "grad_norm": 0.7427594931128179, "learning_rate": 4.011574107044802e-08, "loss": 1.5562, "step": 4763 }, { "epoch": 0.9887920298879203, "grad_norm": 0.7979355255280299, "learning_rate": 4.0111570996655386e-08, "loss": 1.4239, "step": 4764 }, { "epoch": 0.9889995848899958, "grad_norm": 0.6487432575061177, "learning_rate": 4.010747740664698e-08, "loss": 1.4893, "step": 4765 }, { "epoch": 0.9892071398920714, "grad_norm": 0.6932652546876449, "learning_rate": 4.01034603021633e-08, "loss": 1.5076, "step": 4766 }, { "epoch": 0.9894146948941469, "grad_norm": 0.8184296285089726, "learning_rate": 4.0099519684912334e-08, "loss": 1.5866, "step": 4767 }, { "epoch": 0.9896222498962225, "grad_norm": 0.8105581521638762, "learning_rate": 4.009565555656951e-08, "loss": 1.5144, "step": 4768 }, { "epoch": 0.9898298048982981, "grad_norm": 0.7538387275210242, "learning_rate": 4.009186791877774e-08, "loss": 1.5249, "step": 4769 }, { "epoch": 0.9900373599003736, "grad_norm": 1.9042359716630708, "learning_rate": 4.0088156773147466e-08, "loss": 1.501, "step": 4770 }, { "epoch": 0.9902449149024491, "grad_norm": 0.820178119045428, "learning_rate": 4.008452212125652e-08, "loss": 1.4738, "step": 4771 }, { "epoch": 0.9904524699045247, "grad_norm": 0.6675939545950915, "learning_rate": 4.0080963964650306e-08, "loss": 1.4948, "step": 4772 }, { "epoch": 0.9906600249066002, "grad_norm": 0.7515560145313551, "learning_rate": 4.007748230484161e-08, "loss": 1.5279, "step": 4773 }, { "epoch": 0.9908675799086758, "grad_norm": 0.7500708538598676, "learning_rate": 4.007407714331079e-08, "loss": 1.5235, "step": 4774 }, { "epoch": 0.9910751349107514, "grad_norm": 0.7484725917075831, "learning_rate": 4.0070748481505594e-08, "loss": 1.4688, "step": 4775 }, { "epoch": 0.9912826899128269, "grad_norm": 0.8655764784855894, "learning_rate": 4.006749632084131e-08, "loss": 1.5187, "step": 4776 }, { "epoch": 0.9914902449149025, "grad_norm": 0.6952214555759988, "learning_rate": 4.0064320662700635e-08, "loss": 1.4983, "step": 4777 }, { "epoch": 0.991697799916978, "grad_norm": 0.9166452120885582, "learning_rate": 4.0061221508433795e-08, "loss": 1.5525, "step": 4778 }, { "epoch": 0.9919053549190535, "grad_norm": 0.7817218910211783, "learning_rate": 4.005819885935846e-08, "loss": 1.5093, "step": 4779 }, { "epoch": 0.9921129099211291, "grad_norm": 0.7503094567945067, "learning_rate": 4.00552527167598e-08, "loss": 1.532, "step": 4780 }, { "epoch": 0.9923204649232047, "grad_norm": 0.6992339495425524, "learning_rate": 4.005238308189043e-08, "loss": 1.4616, "step": 4781 }, { "epoch": 0.9925280199252802, "grad_norm": 0.6570827794463783, "learning_rate": 4.004958995597042e-08, "loss": 1.5701, "step": 4782 }, { "epoch": 0.9927355749273558, "grad_norm": 0.7246868341128492, "learning_rate": 4.004687334018735e-08, "loss": 1.51, "step": 4783 }, { "epoch": 0.9929431299294313, "grad_norm": 0.7304689068384665, "learning_rate": 4.004423323569627e-08, "loss": 1.5637, "step": 4784 }, { "epoch": 0.9931506849315068, "grad_norm": 0.981954952445363, "learning_rate": 4.0041669643619645e-08, "loss": 1.4135, "step": 4785 }, { "epoch": 0.9933582399335824, "grad_norm": 0.902759387612884, "learning_rate": 4.003918256504748e-08, "loss": 1.5007, "step": 4786 }, { "epoch": 0.993565794935658, "grad_norm": 0.6947775892465466, "learning_rate": 4.0036772001037195e-08, "loss": 1.6436, "step": 4787 }, { "epoch": 0.9937733499377335, "grad_norm": 0.6413717353504101, "learning_rate": 4.0034437952613695e-08, "loss": 1.4695, "step": 4788 }, { "epoch": 0.9939809049398091, "grad_norm": 0.8198940266814553, "learning_rate": 4.0032180420769376e-08, "loss": 1.545, "step": 4789 }, { "epoch": 0.9941884599418847, "grad_norm": 0.9209396363654392, "learning_rate": 4.002999940646406e-08, "loss": 1.4476, "step": 4790 }, { "epoch": 0.9943960149439601, "grad_norm": 0.6808047918826174, "learning_rate": 4.002789491062506e-08, "loss": 1.5462, "step": 4791 }, { "epoch": 0.9946035699460357, "grad_norm": 0.7161441085781081, "learning_rate": 4.0025866934147177e-08, "loss": 1.543, "step": 4792 }, { "epoch": 0.9948111249481113, "grad_norm": 0.692252847155623, "learning_rate": 4.0023915477892605e-08, "loss": 1.5144, "step": 4793 }, { "epoch": 0.9950186799501868, "grad_norm": 0.8444795158888794, "learning_rate": 4.002204054269109e-08, "loss": 1.489, "step": 4794 }, { "epoch": 0.9952262349522624, "grad_norm": 0.7233455211136888, "learning_rate": 4.002024212933979e-08, "loss": 1.578, "step": 4795 }, { "epoch": 0.9954337899543378, "grad_norm": 0.7538220410667158, "learning_rate": 4.001852023860335e-08, "loss": 1.5155, "step": 4796 }, { "epoch": 0.9956413449564134, "grad_norm": 1.125191476257224, "learning_rate": 4.001687487121388e-08, "loss": 1.509, "step": 4797 }, { "epoch": 0.995848899958489, "grad_norm": 0.8026149744749589, "learning_rate": 4.001530602787092e-08, "loss": 1.5738, "step": 4798 }, { "epoch": 0.9960564549605645, "grad_norm": 0.8292481411587816, "learning_rate": 4.001381370924151e-08, "loss": 1.5108, "step": 4799 }, { "epoch": 0.9962640099626401, "grad_norm": 0.7459653503900843, "learning_rate": 4.001239791596016e-08, "loss": 1.5017, "step": 4800 }, { "epoch": 0.9964715649647157, "grad_norm": 1.0819786540329295, "learning_rate": 4.0011058648628806e-08, "loss": 1.61, "step": 4801 }, { "epoch": 0.9966791199667911, "grad_norm": 0.7525892175417594, "learning_rate": 4.000979590781689e-08, "loss": 1.5622, "step": 4802 }, { "epoch": 0.9968866749688667, "grad_norm": 0.8743837860522929, "learning_rate": 4.000860969406129e-08, "loss": 1.5628, "step": 4803 }, { "epoch": 0.9970942299709423, "grad_norm": 0.7750546424710333, "learning_rate": 4.000750000786634e-08, "loss": 1.4099, "step": 4804 }, { "epoch": 0.9973017849730178, "grad_norm": 0.6353047899187152, "learning_rate": 4.000646684970386e-08, "loss": 1.5222, "step": 4805 }, { "epoch": 0.9975093399750934, "grad_norm": 0.6470880844559511, "learning_rate": 4.000551022001311e-08, "loss": 1.5562, "step": 4806 }, { "epoch": 0.997716894977169, "grad_norm": 0.9084872529285581, "learning_rate": 4.0004630119200843e-08, "loss": 1.5464, "step": 4807 }, { "epoch": 0.9979244499792445, "grad_norm": 1.140572754103038, "learning_rate": 4.0003826547641254e-08, "loss": 1.5235, "step": 4808 }, { "epoch": 0.99813200498132, "grad_norm": 0.7299495991107142, "learning_rate": 4.000309950567598e-08, "loss": 1.4966, "step": 4809 }, { "epoch": 0.9983395599833956, "grad_norm": 0.8724224627053094, "learning_rate": 4.000244899361414e-08, "loss": 1.4967, "step": 4810 }, { "epoch": 0.9985471149854711, "grad_norm": 1.1786718780774377, "learning_rate": 4.000187501173234e-08, "loss": 1.4663, "step": 4811 }, { "epoch": 0.9987546699875467, "grad_norm": 0.8396401578326474, "learning_rate": 4.000137756027459e-08, "loss": 1.46, "step": 4812 }, { "epoch": 0.9989622249896223, "grad_norm": 0.6312751894777425, "learning_rate": 4.000095663945242e-08, "loss": 1.6157, "step": 4813 }, { "epoch": 0.9991697799916978, "grad_norm": 0.7178029286991596, "learning_rate": 4.000061224944478e-08, "loss": 1.4568, "step": 4814 }, { "epoch": 0.9993773349937733, "grad_norm": 0.8409185300333479, "learning_rate": 4.00003443903981e-08, "loss": 1.5537, "step": 4815 }, { "epoch": 0.9995848899958489, "grad_norm": 0.6319662812221615, "learning_rate": 4.0000153062426275e-08, "loss": 1.5672, "step": 4816 }, { "epoch": 0.9997924449979244, "grad_norm": 0.7315460409082969, "learning_rate": 4.000003826561064e-08, "loss": 1.5316, "step": 4817 }, { "epoch": 1.0, "grad_norm": 0.8062079039678556, "learning_rate": 4e-08, "loss": 1.578, "step": 4818 } ], "logging_steps": 1, "max_steps": 4818, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 964, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3769285113479168e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }