{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 361, "global_step": 1444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006925207756232687, "grad_norm": 0.46092769503593445, "learning_rate": 2.0000000000000003e-06, "loss": 1.8461, "step": 1 }, { "epoch": 0.0006925207756232687, "eval_loss": 1.2960177659988403, "eval_runtime": 331.9286, "eval_samples_per_second": 4.266, "eval_steps_per_second": 0.533, "step": 1 }, { "epoch": 0.0013850415512465374, "grad_norm": 0.5322127342224121, "learning_rate": 4.000000000000001e-06, "loss": 1.9426, "step": 2 }, { "epoch": 0.002077562326869806, "grad_norm": 0.43637630343437195, "learning_rate": 6e-06, "loss": 1.7676, "step": 3 }, { "epoch": 0.002770083102493075, "grad_norm": 0.47813108563423157, "learning_rate": 8.000000000000001e-06, "loss": 1.8202, "step": 4 }, { "epoch": 0.0034626038781163434, "grad_norm": 0.4561654329299927, "learning_rate": 1e-05, "loss": 1.7695, "step": 5 }, { "epoch": 0.004155124653739612, "grad_norm": 0.5688905119895935, "learning_rate": 1.2e-05, "loss": 1.797, "step": 6 }, { "epoch": 0.004847645429362881, "grad_norm": 0.4705469012260437, "learning_rate": 1.4e-05, "loss": 1.7815, "step": 7 }, { "epoch": 0.00554016620498615, "grad_norm": 0.5055038332939148, "learning_rate": 1.6000000000000003e-05, "loss": 1.8141, "step": 8 }, { "epoch": 0.006232686980609419, "grad_norm": 0.5161764621734619, "learning_rate": 1.8e-05, "loss": 1.6148, "step": 9 }, { "epoch": 0.006925207756232687, "grad_norm": 0.5202474594116211, "learning_rate": 2e-05, "loss": 1.6932, "step": 10 }, { "epoch": 0.007617728531855956, "grad_norm": 0.4921915829181671, "learning_rate": 1.9999976002208634e-05, "loss": 1.8003, "step": 11 }, { "epoch": 0.008310249307479225, "grad_norm": 0.44480928778648376, "learning_rate": 1.9999904008949705e-05, "loss": 1.7099, "step": 12 }, { "epoch": 0.009002770083102494, "grad_norm": 0.4888850748538971, "learning_rate": 1.9999784020568756e-05, "loss": 1.7404, "step": 13 }, { "epoch": 0.009695290858725761, "grad_norm": 0.4866941571235657, "learning_rate": 1.999961603764167e-05, "loss": 1.6584, "step": 14 }, { "epoch": 0.01038781163434903, "grad_norm": 0.5461321473121643, "learning_rate": 1.9999400060974704e-05, "loss": 1.797, "step": 15 }, { "epoch": 0.0110803324099723, "grad_norm": 0.536161482334137, "learning_rate": 1.9999136091604433e-05, "loss": 1.5394, "step": 16 }, { "epoch": 0.011772853185595568, "grad_norm": 0.6785447001457214, "learning_rate": 1.9998824130797813e-05, "loss": 1.5485, "step": 17 }, { "epoch": 0.012465373961218837, "grad_norm": 0.5797305107116699, "learning_rate": 1.99984641800521e-05, "loss": 1.4998, "step": 18 }, { "epoch": 0.013157894736842105, "grad_norm": 0.6252360343933105, "learning_rate": 1.9998056241094912e-05, "loss": 1.8451, "step": 19 }, { "epoch": 0.013850415512465374, "grad_norm": 0.5463359355926514, "learning_rate": 1.9997600315884166e-05, "loss": 1.6362, "step": 20 }, { "epoch": 0.014542936288088643, "grad_norm": 0.5481324791908264, "learning_rate": 1.9997096406608108e-05, "loss": 1.5417, "step": 21 }, { "epoch": 0.015235457063711912, "grad_norm": 0.4929562211036682, "learning_rate": 1.999654451568528e-05, "loss": 1.7517, "step": 22 }, { "epoch": 0.01592797783933518, "grad_norm": 0.547443151473999, "learning_rate": 1.9995944645764512e-05, "loss": 1.7079, "step": 23 }, { "epoch": 0.01662049861495845, "grad_norm": 0.497779905796051, "learning_rate": 1.9995296799724914e-05, "loss": 1.6336, "step": 24 }, { "epoch": 0.01731301939058172, "grad_norm": 0.5056659579277039, "learning_rate": 1.999460098067586e-05, "loss": 1.5759, "step": 25 }, { "epoch": 0.018005540166204988, "grad_norm": 0.5825318098068237, "learning_rate": 1.999385719195698e-05, "loss": 1.7271, "step": 26 }, { "epoch": 0.018698060941828253, "grad_norm": 0.5759397745132446, "learning_rate": 1.9993065437138126e-05, "loss": 1.6907, "step": 27 }, { "epoch": 0.019390581717451522, "grad_norm": 0.5402478575706482, "learning_rate": 1.9992225720019377e-05, "loss": 1.7526, "step": 28 }, { "epoch": 0.02008310249307479, "grad_norm": 0.5368712544441223, "learning_rate": 1.9991338044630996e-05, "loss": 1.6623, "step": 29 }, { "epoch": 0.02077562326869806, "grad_norm": 0.5437949895858765, "learning_rate": 1.9990402415233436e-05, "loss": 1.7876, "step": 30 }, { "epoch": 0.02146814404432133, "grad_norm": 0.6881880760192871, "learning_rate": 1.9989418836317304e-05, "loss": 1.5072, "step": 31 }, { "epoch": 0.0221606648199446, "grad_norm": 0.5721428990364075, "learning_rate": 1.998838731260335e-05, "loss": 1.5213, "step": 32 }, { "epoch": 0.022853185595567867, "grad_norm": 0.5982231497764587, "learning_rate": 1.9987307849042428e-05, "loss": 1.7325, "step": 33 }, { "epoch": 0.023545706371191136, "grad_norm": 0.6260273456573486, "learning_rate": 1.9986180450815485e-05, "loss": 1.6727, "step": 34 }, { "epoch": 0.024238227146814405, "grad_norm": 0.594240665435791, "learning_rate": 1.9985005123333536e-05, "loss": 1.7139, "step": 35 }, { "epoch": 0.024930747922437674, "grad_norm": 0.530646026134491, "learning_rate": 1.9983781872237634e-05, "loss": 1.6415, "step": 36 }, { "epoch": 0.025623268698060944, "grad_norm": 0.6217408776283264, "learning_rate": 1.9982510703398844e-05, "loss": 1.7185, "step": 37 }, { "epoch": 0.02631578947368421, "grad_norm": 0.537267804145813, "learning_rate": 1.9981191622918217e-05, "loss": 1.53, "step": 38 }, { "epoch": 0.027008310249307478, "grad_norm": 0.5151798129081726, "learning_rate": 1.9979824637126752e-05, "loss": 1.692, "step": 39 }, { "epoch": 0.027700831024930747, "grad_norm": 0.5791589021682739, "learning_rate": 1.997840975258538e-05, "loss": 1.6817, "step": 40 }, { "epoch": 0.028393351800554016, "grad_norm": 0.5138722062110901, "learning_rate": 1.9976946976084916e-05, "loss": 1.6058, "step": 41 }, { "epoch": 0.029085872576177285, "grad_norm": 0.5150821805000305, "learning_rate": 1.9975436314646052e-05, "loss": 1.6615, "step": 42 }, { "epoch": 0.029778393351800554, "grad_norm": 0.5633302927017212, "learning_rate": 1.9973877775519286e-05, "loss": 1.6728, "step": 43 }, { "epoch": 0.030470914127423823, "grad_norm": 0.7087507843971252, "learning_rate": 1.9972271366184922e-05, "loss": 1.3833, "step": 44 }, { "epoch": 0.031163434903047092, "grad_norm": 0.5197272896766663, "learning_rate": 1.9970617094353014e-05, "loss": 1.5722, "step": 45 }, { "epoch": 0.03185595567867036, "grad_norm": 0.5743720531463623, "learning_rate": 1.996891496796334e-05, "loss": 1.7185, "step": 46 }, { "epoch": 0.03254847645429363, "grad_norm": 0.6309240460395813, "learning_rate": 1.9967164995185345e-05, "loss": 1.6368, "step": 47 }, { "epoch": 0.0332409972299169, "grad_norm": 0.5736321806907654, "learning_rate": 1.9965367184418138e-05, "loss": 1.5741, "step": 48 }, { "epoch": 0.03393351800554017, "grad_norm": 0.6668252348899841, "learning_rate": 1.9963521544290402e-05, "loss": 1.6657, "step": 49 }, { "epoch": 0.03462603878116344, "grad_norm": 0.5815922617912292, "learning_rate": 1.9961628083660406e-05, "loss": 1.6007, "step": 50 }, { "epoch": 0.035318559556786706, "grad_norm": 0.5808799266815186, "learning_rate": 1.995968681161592e-05, "loss": 1.7115, "step": 51 }, { "epoch": 0.036011080332409975, "grad_norm": 0.7402616143226624, "learning_rate": 1.9957697737474198e-05, "loss": 1.4816, "step": 52 }, { "epoch": 0.036703601108033244, "grad_norm": 0.5336787104606628, "learning_rate": 1.9955660870781908e-05, "loss": 1.5755, "step": 53 }, { "epoch": 0.037396121883656507, "grad_norm": 0.6077079176902771, "learning_rate": 1.9953576221315116e-05, "loss": 1.6875, "step": 54 }, { "epoch": 0.038088642659279776, "grad_norm": 0.6559765934944153, "learning_rate": 1.9951443799079215e-05, "loss": 1.6986, "step": 55 }, { "epoch": 0.038781163434903045, "grad_norm": 0.7890306711196899, "learning_rate": 1.9949263614308894e-05, "loss": 1.2903, "step": 56 }, { "epoch": 0.039473684210526314, "grad_norm": 0.7925476431846619, "learning_rate": 1.9947035677468078e-05, "loss": 1.7279, "step": 57 }, { "epoch": 0.04016620498614958, "grad_norm": 0.7267352342605591, "learning_rate": 1.994475999924987e-05, "loss": 1.6833, "step": 58 }, { "epoch": 0.04085872576177285, "grad_norm": 0.657148003578186, "learning_rate": 1.9942436590576534e-05, "loss": 1.706, "step": 59 }, { "epoch": 0.04155124653739612, "grad_norm": 0.6420430541038513, "learning_rate": 1.9940065462599394e-05, "loss": 1.5163, "step": 60 }, { "epoch": 0.04224376731301939, "grad_norm": 0.6800001263618469, "learning_rate": 1.9937646626698823e-05, "loss": 1.6628, "step": 61 }, { "epoch": 0.04293628808864266, "grad_norm": 0.7497722506523132, "learning_rate": 1.9935180094484164e-05, "loss": 1.5616, "step": 62 }, { "epoch": 0.04362880886426593, "grad_norm": 0.7148786187171936, "learning_rate": 1.9932665877793678e-05, "loss": 1.585, "step": 63 }, { "epoch": 0.0443213296398892, "grad_norm": 0.6944183707237244, "learning_rate": 1.99301039886945e-05, "loss": 1.5801, "step": 64 }, { "epoch": 0.045013850415512466, "grad_norm": 0.7924667000770569, "learning_rate": 1.9927494439482566e-05, "loss": 1.3699, "step": 65 }, { "epoch": 0.045706371191135735, "grad_norm": 0.7302570343017578, "learning_rate": 1.992483724268255e-05, "loss": 1.644, "step": 66 }, { "epoch": 0.046398891966759004, "grad_norm": 0.7892882227897644, "learning_rate": 1.9922132411047835e-05, "loss": 1.5289, "step": 67 }, { "epoch": 0.04709141274238227, "grad_norm": 0.6914665699005127, "learning_rate": 1.9919379957560413e-05, "loss": 1.4982, "step": 68 }, { "epoch": 0.04778393351800554, "grad_norm": 0.7855063080787659, "learning_rate": 1.9916579895430845e-05, "loss": 1.5667, "step": 69 }, { "epoch": 0.04847645429362881, "grad_norm": 0.8000104427337646, "learning_rate": 1.991373223809819e-05, "loss": 1.7444, "step": 70 }, { "epoch": 0.04916897506925208, "grad_norm": 0.8270304799079895, "learning_rate": 1.991083699922995e-05, "loss": 1.5416, "step": 71 }, { "epoch": 0.04986149584487535, "grad_norm": 0.6860859394073486, "learning_rate": 1.990789419272199e-05, "loss": 1.5364, "step": 72 }, { "epoch": 0.05055401662049862, "grad_norm": 0.7422068119049072, "learning_rate": 1.9904903832698485e-05, "loss": 1.6074, "step": 73 }, { "epoch": 0.05124653739612189, "grad_norm": 0.9346948862075806, "learning_rate": 1.9901865933511834e-05, "loss": 1.3475, "step": 74 }, { "epoch": 0.05193905817174515, "grad_norm": 0.6729177832603455, "learning_rate": 1.989878050974262e-05, "loss": 1.6578, "step": 75 }, { "epoch": 0.05263157894736842, "grad_norm": 0.7014334797859192, "learning_rate": 1.9895647576199507e-05, "loss": 1.4947, "step": 76 }, { "epoch": 0.05332409972299169, "grad_norm": 0.7257154583930969, "learning_rate": 1.9892467147919196e-05, "loss": 1.615, "step": 77 }, { "epoch": 0.054016620498614956, "grad_norm": 0.7274760603904724, "learning_rate": 1.988923924016634e-05, "loss": 1.5596, "step": 78 }, { "epoch": 0.054709141274238225, "grad_norm": 0.8070396184921265, "learning_rate": 1.9885963868433463e-05, "loss": 1.4991, "step": 79 }, { "epoch": 0.055401662049861494, "grad_norm": 0.774266242980957, "learning_rate": 1.988264104844091e-05, "loss": 1.6116, "step": 80 }, { "epoch": 0.05609418282548476, "grad_norm": 0.5994683504104614, "learning_rate": 1.9879270796136746e-05, "loss": 0.9307, "step": 81 }, { "epoch": 0.05678670360110803, "grad_norm": 0.8213633298873901, "learning_rate": 1.987585312769669e-05, "loss": 1.541, "step": 82 }, { "epoch": 0.0574792243767313, "grad_norm": 0.7912551164627075, "learning_rate": 1.987238805952405e-05, "loss": 1.6115, "step": 83 }, { "epoch": 0.05817174515235457, "grad_norm": 0.8942985534667969, "learning_rate": 1.9868875608249613e-05, "loss": 1.4283, "step": 84 }, { "epoch": 0.05886426592797784, "grad_norm": 0.818084716796875, "learning_rate": 1.98653157907316e-05, "loss": 1.4575, "step": 85 }, { "epoch": 0.05955678670360111, "grad_norm": 0.756619393825531, "learning_rate": 1.986170862405556e-05, "loss": 1.5106, "step": 86 }, { "epoch": 0.06024930747922438, "grad_norm": 0.7861054539680481, "learning_rate": 1.9858054125534297e-05, "loss": 1.6412, "step": 87 }, { "epoch": 0.060941828254847646, "grad_norm": 0.767902672290802, "learning_rate": 1.98543523127078e-05, "loss": 1.6698, "step": 88 }, { "epoch": 0.061634349030470915, "grad_norm": 0.8469496369361877, "learning_rate": 1.985060320334312e-05, "loss": 1.6582, "step": 89 }, { "epoch": 0.062326869806094184, "grad_norm": 0.7616478800773621, "learning_rate": 1.984680681543434e-05, "loss": 1.5724, "step": 90 }, { "epoch": 0.06301939058171745, "grad_norm": 0.5673407912254333, "learning_rate": 1.9842963167202435e-05, "loss": 0.9085, "step": 91 }, { "epoch": 0.06371191135734072, "grad_norm": 0.9865372776985168, "learning_rate": 1.9839072277095222e-05, "loss": 1.551, "step": 92 }, { "epoch": 0.06440443213296398, "grad_norm": 0.8187092542648315, "learning_rate": 1.9835134163787257e-05, "loss": 1.5875, "step": 93 }, { "epoch": 0.06509695290858726, "grad_norm": 0.8418170809745789, "learning_rate": 1.9831148846179743e-05, "loss": 1.4789, "step": 94 }, { "epoch": 0.06578947368421052, "grad_norm": 0.8878125548362732, "learning_rate": 1.982711634340044e-05, "loss": 1.4319, "step": 95 }, { "epoch": 0.0664819944598338, "grad_norm": 0.9223076701164246, "learning_rate": 1.9823036674803585e-05, "loss": 1.3074, "step": 96 }, { "epoch": 0.06717451523545706, "grad_norm": 0.7803325057029724, "learning_rate": 1.981890985996979e-05, "loss": 1.6477, "step": 97 }, { "epoch": 0.06786703601108034, "grad_norm": 1.0602279901504517, "learning_rate": 1.981473591870593e-05, "loss": 1.3802, "step": 98 }, { "epoch": 0.0685595567867036, "grad_norm": 0.859002947807312, "learning_rate": 1.981051487104509e-05, "loss": 1.5765, "step": 99 }, { "epoch": 0.06925207756232687, "grad_norm": 0.9333771467208862, "learning_rate": 1.980624673724643e-05, "loss": 1.6703, "step": 100 }, { "epoch": 0.06994459833795014, "grad_norm": 1.0008572340011597, "learning_rate": 1.980193153779511e-05, "loss": 1.5619, "step": 101 }, { "epoch": 0.07063711911357341, "grad_norm": 0.9978685975074768, "learning_rate": 1.9797569293402174e-05, "loss": 1.6118, "step": 102 }, { "epoch": 0.07132963988919667, "grad_norm": 0.8422709107398987, "learning_rate": 1.9793160025004476e-05, "loss": 1.6786, "step": 103 }, { "epoch": 0.07202216066481995, "grad_norm": 0.8284188508987427, "learning_rate": 1.9788703753764554e-05, "loss": 1.4792, "step": 104 }, { "epoch": 0.07271468144044321, "grad_norm": 0.8767748475074768, "learning_rate": 1.9784200501070542e-05, "loss": 1.5439, "step": 105 }, { "epoch": 0.07340720221606649, "grad_norm": 0.9303686618804932, "learning_rate": 1.9779650288536057e-05, "loss": 1.5874, "step": 106 }, { "epoch": 0.07409972299168975, "grad_norm": 0.9373297095298767, "learning_rate": 1.9775053138000123e-05, "loss": 1.6427, "step": 107 }, { "epoch": 0.07479224376731301, "grad_norm": 0.9026633501052856, "learning_rate": 1.977040907152702e-05, "loss": 1.4498, "step": 108 }, { "epoch": 0.07548476454293629, "grad_norm": 0.9151142239570618, "learning_rate": 1.9765718111406217e-05, "loss": 1.3523, "step": 109 }, { "epoch": 0.07617728531855955, "grad_norm": 1.065004587173462, "learning_rate": 1.976098028015226e-05, "loss": 1.419, "step": 110 }, { "epoch": 0.07686980609418283, "grad_norm": 0.870905339717865, "learning_rate": 1.9756195600504637e-05, "loss": 1.589, "step": 111 }, { "epoch": 0.07756232686980609, "grad_norm": 0.9227341413497925, "learning_rate": 1.9751364095427694e-05, "loss": 1.4389, "step": 112 }, { "epoch": 0.07825484764542937, "grad_norm": 0.9902766942977905, "learning_rate": 1.974648578811053e-05, "loss": 1.54, "step": 113 }, { "epoch": 0.07894736842105263, "grad_norm": 1.0076311826705933, "learning_rate": 1.974156070196686e-05, "loss": 1.3828, "step": 114 }, { "epoch": 0.0796398891966759, "grad_norm": 0.8937874436378479, "learning_rate": 1.9736588860634928e-05, "loss": 1.5319, "step": 115 }, { "epoch": 0.08033240997229917, "grad_norm": 1.016831398010254, "learning_rate": 1.973157028797737e-05, "loss": 1.6604, "step": 116 }, { "epoch": 0.08102493074792244, "grad_norm": 0.8994125127792358, "learning_rate": 1.9726505008081113e-05, "loss": 1.532, "step": 117 }, { "epoch": 0.0817174515235457, "grad_norm": 0.8747804760932922, "learning_rate": 1.9721393045257277e-05, "loss": 1.5652, "step": 118 }, { "epoch": 0.08240997229916898, "grad_norm": 0.8889365196228027, "learning_rate": 1.9716234424041016e-05, "loss": 1.6021, "step": 119 }, { "epoch": 0.08310249307479224, "grad_norm": 0.9008545875549316, "learning_rate": 1.9711029169191437e-05, "loss": 1.4765, "step": 120 }, { "epoch": 0.08379501385041552, "grad_norm": 0.9812483191490173, "learning_rate": 1.9705777305691457e-05, "loss": 1.4359, "step": 121 }, { "epoch": 0.08448753462603878, "grad_norm": 0.9572108387947083, "learning_rate": 1.970047885874771e-05, "loss": 1.4399, "step": 122 }, { "epoch": 0.08518005540166206, "grad_norm": 1.038678526878357, "learning_rate": 1.9695133853790397e-05, "loss": 1.5133, "step": 123 }, { "epoch": 0.08587257617728532, "grad_norm": 0.898454487323761, "learning_rate": 1.968974231647318e-05, "loss": 1.4694, "step": 124 }, { "epoch": 0.0865650969529086, "grad_norm": 0.8905704021453857, "learning_rate": 1.9684304272673057e-05, "loss": 1.5121, "step": 125 }, { "epoch": 0.08725761772853186, "grad_norm": 0.9543448090553284, "learning_rate": 1.9678819748490236e-05, "loss": 1.4132, "step": 126 }, { "epoch": 0.08795013850415513, "grad_norm": 1.0110042095184326, "learning_rate": 1.9673288770248012e-05, "loss": 1.4191, "step": 127 }, { "epoch": 0.0886426592797784, "grad_norm": 0.9093568921089172, "learning_rate": 1.9667711364492638e-05, "loss": 1.4947, "step": 128 }, { "epoch": 0.08933518005540166, "grad_norm": 1.012677788734436, "learning_rate": 1.9662087557993193e-05, "loss": 1.4674, "step": 129 }, { "epoch": 0.09002770083102493, "grad_norm": 0.9102503657341003, "learning_rate": 1.965641737774147e-05, "loss": 1.4392, "step": 130 }, { "epoch": 0.0907202216066482, "grad_norm": 0.9984774589538574, "learning_rate": 1.9650700850951825e-05, "loss": 1.6278, "step": 131 }, { "epoch": 0.09141274238227147, "grad_norm": 0.9726704955101013, "learning_rate": 1.9644938005061062e-05, "loss": 1.548, "step": 132 }, { "epoch": 0.09210526315789473, "grad_norm": 1.1588407754898071, "learning_rate": 1.96391288677283e-05, "loss": 1.2959, "step": 133 }, { "epoch": 0.09279778393351801, "grad_norm": 0.9556019306182861, "learning_rate": 1.9633273466834826e-05, "loss": 1.4758, "step": 134 }, { "epoch": 0.09349030470914127, "grad_norm": 1.0594704151153564, "learning_rate": 1.9627371830483982e-05, "loss": 1.0755, "step": 135 }, { "epoch": 0.09418282548476455, "grad_norm": 1.4173860549926758, "learning_rate": 1.9621423987001013e-05, "loss": 1.4406, "step": 136 }, { "epoch": 0.09487534626038781, "grad_norm": 0.9821715950965881, "learning_rate": 1.9615429964932945e-05, "loss": 1.4767, "step": 137 }, { "epoch": 0.09556786703601108, "grad_norm": 1.2425581216812134, "learning_rate": 1.960938979304843e-05, "loss": 1.3854, "step": 138 }, { "epoch": 0.09626038781163435, "grad_norm": 1.0587480068206787, "learning_rate": 1.960330350033763e-05, "loss": 1.6481, "step": 139 }, { "epoch": 0.09695290858725762, "grad_norm": 0.9959594011306763, "learning_rate": 1.959717111601206e-05, "loss": 1.6265, "step": 140 }, { "epoch": 0.09764542936288088, "grad_norm": 0.9131922125816345, "learning_rate": 1.959099266950445e-05, "loss": 1.5011, "step": 141 }, { "epoch": 0.09833795013850416, "grad_norm": 0.9651609659194946, "learning_rate": 1.9584768190468624e-05, "loss": 1.7318, "step": 142 }, { "epoch": 0.09903047091412742, "grad_norm": 1.0025120973587036, "learning_rate": 1.9578497708779326e-05, "loss": 1.4781, "step": 143 }, { "epoch": 0.0997229916897507, "grad_norm": 0.9535677433013916, "learning_rate": 1.95721812545321e-05, "loss": 1.4097, "step": 144 }, { "epoch": 0.10041551246537396, "grad_norm": 0.9194809198379517, "learning_rate": 1.9565818858043136e-05, "loss": 1.6294, "step": 145 }, { "epoch": 0.10110803324099724, "grad_norm": 0.9537028074264526, "learning_rate": 1.9559410549849125e-05, "loss": 1.6033, "step": 146 }, { "epoch": 0.1018005540166205, "grad_norm": 0.8676417469978333, "learning_rate": 1.955295636070712e-05, "loss": 1.4449, "step": 147 }, { "epoch": 0.10249307479224377, "grad_norm": 0.9458538889884949, "learning_rate": 1.9546456321594374e-05, "loss": 1.5544, "step": 148 }, { "epoch": 0.10318559556786704, "grad_norm": 1.1934260129928589, "learning_rate": 1.9539910463708204e-05, "loss": 1.403, "step": 149 }, { "epoch": 0.1038781163434903, "grad_norm": 1.1710929870605469, "learning_rate": 1.9533318818465837e-05, "loss": 1.2662, "step": 150 }, { "epoch": 0.10457063711911357, "grad_norm": 1.031394362449646, "learning_rate": 1.952668141750426e-05, "loss": 1.4116, "step": 151 }, { "epoch": 0.10526315789473684, "grad_norm": 0.9981980919837952, "learning_rate": 1.9519998292680062e-05, "loss": 1.6333, "step": 152 }, { "epoch": 0.10595567867036011, "grad_norm": 1.0679597854614258, "learning_rate": 1.951326947606929e-05, "loss": 1.4244, "step": 153 }, { "epoch": 0.10664819944598337, "grad_norm": 1.1829801797866821, "learning_rate": 1.9506494999967298e-05, "loss": 1.4967, "step": 154 }, { "epoch": 0.10734072022160665, "grad_norm": 1.143169641494751, "learning_rate": 1.9499674896888573e-05, "loss": 1.3145, "step": 155 }, { "epoch": 0.10803324099722991, "grad_norm": 1.1118048429489136, "learning_rate": 1.94928091995666e-05, "loss": 1.5536, "step": 156 }, { "epoch": 0.10872576177285319, "grad_norm": 1.1905399560928345, "learning_rate": 1.948589794095369e-05, "loss": 1.7543, "step": 157 }, { "epoch": 0.10941828254847645, "grad_norm": 1.1353670358657837, "learning_rate": 1.9478941154220833e-05, "loss": 1.6226, "step": 158 }, { "epoch": 0.11011080332409973, "grad_norm": 1.1765538454055786, "learning_rate": 1.947193887275753e-05, "loss": 1.5584, "step": 159 }, { "epoch": 0.11080332409972299, "grad_norm": 1.1196644306182861, "learning_rate": 1.9464891130171647e-05, "loss": 1.6066, "step": 160 }, { "epoch": 0.11149584487534626, "grad_norm": 1.0848753452301025, "learning_rate": 1.945779796028923e-05, "loss": 1.6394, "step": 161 }, { "epoch": 0.11218836565096953, "grad_norm": 1.0297619104385376, "learning_rate": 1.9450659397154353e-05, "loss": 1.487, "step": 162 }, { "epoch": 0.1128808864265928, "grad_norm": 1.0936334133148193, "learning_rate": 1.9443475475028985e-05, "loss": 1.4577, "step": 163 }, { "epoch": 0.11357340720221606, "grad_norm": 1.1814393997192383, "learning_rate": 1.9436246228392762e-05, "loss": 1.3241, "step": 164 }, { "epoch": 0.11426592797783934, "grad_norm": 0.9540955424308777, "learning_rate": 1.942897169194288e-05, "loss": 1.555, "step": 165 }, { "epoch": 0.1149584487534626, "grad_norm": 1.0909806489944458, "learning_rate": 1.94216519005939e-05, "loss": 1.5422, "step": 166 }, { "epoch": 0.11565096952908588, "grad_norm": 1.1541738510131836, "learning_rate": 1.941428688947759e-05, "loss": 1.541, "step": 167 }, { "epoch": 0.11634349030470914, "grad_norm": 1.2583942413330078, "learning_rate": 1.9406876693942747e-05, "loss": 1.3919, "step": 168 }, { "epoch": 0.11703601108033242, "grad_norm": 1.1814335584640503, "learning_rate": 1.9399421349555037e-05, "loss": 1.4799, "step": 169 }, { "epoch": 0.11772853185595568, "grad_norm": 1.0238617658615112, "learning_rate": 1.939192089209682e-05, "loss": 1.462, "step": 170 }, { "epoch": 0.11842105263157894, "grad_norm": 0.9870756268501282, "learning_rate": 1.938437535756698e-05, "loss": 1.5175, "step": 171 }, { "epoch": 0.11911357340720222, "grad_norm": 0.9913883209228516, "learning_rate": 1.9376784782180747e-05, "loss": 1.547, "step": 172 }, { "epoch": 0.11980609418282548, "grad_norm": 1.0706313848495483, "learning_rate": 1.9369149202369532e-05, "loss": 1.5063, "step": 173 }, { "epoch": 0.12049861495844875, "grad_norm": 1.0937633514404297, "learning_rate": 1.9361468654780748e-05, "loss": 1.4345, "step": 174 }, { "epoch": 0.12119113573407202, "grad_norm": 1.341931700706482, "learning_rate": 1.9353743176277624e-05, "loss": 1.5701, "step": 175 }, { "epoch": 0.12188365650969529, "grad_norm": 1.1175850629806519, "learning_rate": 1.9345972803939046e-05, "loss": 1.4767, "step": 176 }, { "epoch": 0.12257617728531855, "grad_norm": 1.1408743858337402, "learning_rate": 1.933815757505937e-05, "loss": 1.4476, "step": 177 }, { "epoch": 0.12326869806094183, "grad_norm": 1.1984326839447021, "learning_rate": 1.9330297527148246e-05, "loss": 1.4915, "step": 178 }, { "epoch": 0.12396121883656509, "grad_norm": 1.1456125974655151, "learning_rate": 1.932239269793043e-05, "loss": 1.5642, "step": 179 }, { "epoch": 0.12465373961218837, "grad_norm": 1.0504119396209717, "learning_rate": 1.9314443125345606e-05, "loss": 1.494, "step": 180 }, { "epoch": 0.12534626038781163, "grad_norm": 1.188533067703247, "learning_rate": 1.9306448847548214e-05, "loss": 1.5529, "step": 181 }, { "epoch": 0.1260387811634349, "grad_norm": 1.1496820449829102, "learning_rate": 1.929840990290726e-05, "loss": 1.4418, "step": 182 }, { "epoch": 0.12673130193905818, "grad_norm": 1.0949493646621704, "learning_rate": 1.9290326330006125e-05, "loss": 1.58, "step": 183 }, { "epoch": 0.12742382271468145, "grad_norm": 1.0846694707870483, "learning_rate": 1.928219816764238e-05, "loss": 1.6893, "step": 184 }, { "epoch": 0.1281163434903047, "grad_norm": 1.086604118347168, "learning_rate": 1.9274025454827626e-05, "loss": 1.4686, "step": 185 }, { "epoch": 0.12880886426592797, "grad_norm": 1.2065285444259644, "learning_rate": 1.9265808230787265e-05, "loss": 1.4176, "step": 186 }, { "epoch": 0.12950138504155126, "grad_norm": 1.1812058687210083, "learning_rate": 1.9257546534960354e-05, "loss": 1.4403, "step": 187 }, { "epoch": 0.13019390581717452, "grad_norm": 1.1747177839279175, "learning_rate": 1.9249240406999366e-05, "loss": 1.551, "step": 188 }, { "epoch": 0.13088642659279778, "grad_norm": 1.2344166040420532, "learning_rate": 1.9240889886770065e-05, "loss": 1.496, "step": 189 }, { "epoch": 0.13157894736842105, "grad_norm": 1.1985383033752441, "learning_rate": 1.9232495014351248e-05, "loss": 1.534, "step": 190 }, { "epoch": 0.13227146814404434, "grad_norm": 1.2938690185546875, "learning_rate": 1.9224055830034597e-05, "loss": 1.3664, "step": 191 }, { "epoch": 0.1329639889196676, "grad_norm": 1.1214767694473267, "learning_rate": 1.921557237432447e-05, "loss": 1.4684, "step": 192 }, { "epoch": 0.13365650969529086, "grad_norm": 1.1654257774353027, "learning_rate": 1.9207044687937705e-05, "loss": 1.5447, "step": 193 }, { "epoch": 0.13434903047091412, "grad_norm": 1.1630734205245972, "learning_rate": 1.919847281180343e-05, "loss": 1.5108, "step": 194 }, { "epoch": 0.13504155124653738, "grad_norm": 1.125630497932434, "learning_rate": 1.918985678706287e-05, "loss": 1.5938, "step": 195 }, { "epoch": 0.13573407202216067, "grad_norm": 1.1549885272979736, "learning_rate": 1.9181196655069126e-05, "loss": 1.4511, "step": 196 }, { "epoch": 0.13642659279778394, "grad_norm": 1.4739876985549927, "learning_rate": 1.917249245738702e-05, "loss": 1.3833, "step": 197 }, { "epoch": 0.1371191135734072, "grad_norm": 1.3530839681625366, "learning_rate": 1.9163744235792845e-05, "loss": 1.4574, "step": 198 }, { "epoch": 0.13781163434903046, "grad_norm": 1.2195689678192139, "learning_rate": 1.915495203227421e-05, "loss": 1.4105, "step": 199 }, { "epoch": 0.13850415512465375, "grad_norm": 1.347271203994751, "learning_rate": 1.9146115889029793e-05, "loss": 1.6371, "step": 200 }, { "epoch": 0.139196675900277, "grad_norm": 1.3149245977401733, "learning_rate": 1.9137235848469196e-05, "loss": 1.3975, "step": 201 }, { "epoch": 0.13988919667590027, "grad_norm": 1.113451361656189, "learning_rate": 1.912831195321268e-05, "loss": 1.5483, "step": 202 }, { "epoch": 0.14058171745152354, "grad_norm": 1.2491871118545532, "learning_rate": 1.9119344246091e-05, "loss": 1.59, "step": 203 }, { "epoch": 0.14127423822714683, "grad_norm": 1.165940284729004, "learning_rate": 1.9110332770145198e-05, "loss": 1.4319, "step": 204 }, { "epoch": 0.1419667590027701, "grad_norm": 1.2933601140975952, "learning_rate": 1.9101277568626374e-05, "loss": 1.4285, "step": 205 }, { "epoch": 0.14265927977839335, "grad_norm": 1.3592076301574707, "learning_rate": 1.9092178684995487e-05, "loss": 1.435, "step": 206 }, { "epoch": 0.1433518005540166, "grad_norm": 1.3157401084899902, "learning_rate": 1.908303616292317e-05, "loss": 1.4506, "step": 207 }, { "epoch": 0.1440443213296399, "grad_norm": 1.179969072341919, "learning_rate": 1.9073850046289484e-05, "loss": 1.5505, "step": 208 }, { "epoch": 0.14473684210526316, "grad_norm": 1.3946419954299927, "learning_rate": 1.9064620379183733e-05, "loss": 1.4977, "step": 209 }, { "epoch": 0.14542936288088643, "grad_norm": 1.2105120420455933, "learning_rate": 1.9055347205904245e-05, "loss": 1.422, "step": 210 }, { "epoch": 0.1461218836565097, "grad_norm": 1.3983213901519775, "learning_rate": 1.9046030570958153e-05, "loss": 1.4853, "step": 211 }, { "epoch": 0.14681440443213298, "grad_norm": 1.2327667474746704, "learning_rate": 1.903667051906119e-05, "loss": 1.5621, "step": 212 }, { "epoch": 0.14750692520775624, "grad_norm": 1.0103579759597778, "learning_rate": 1.9027267095137466e-05, "loss": 0.7994, "step": 213 }, { "epoch": 0.1481994459833795, "grad_norm": 1.1519508361816406, "learning_rate": 1.901782034431927e-05, "loss": 1.4775, "step": 214 }, { "epoch": 0.14889196675900276, "grad_norm": 1.2052768468856812, "learning_rate": 1.9008330311946826e-05, "loss": 1.3932, "step": 215 }, { "epoch": 0.14958448753462603, "grad_norm": 1.217500925064087, "learning_rate": 1.8998797043568102e-05, "loss": 1.5393, "step": 216 }, { "epoch": 0.15027700831024932, "grad_norm": 1.2066594362258911, "learning_rate": 1.8989220584938574e-05, "loss": 1.5545, "step": 217 }, { "epoch": 0.15096952908587258, "grad_norm": 1.2560105323791504, "learning_rate": 1.8979600982021014e-05, "loss": 1.4585, "step": 218 }, { "epoch": 0.15166204986149584, "grad_norm": 1.2688912153244019, "learning_rate": 1.8969938280985264e-05, "loss": 1.3712, "step": 219 }, { "epoch": 0.1523545706371191, "grad_norm": 1.2653838396072388, "learning_rate": 1.896023252820802e-05, "loss": 1.5732, "step": 220 }, { "epoch": 0.1530470914127424, "grad_norm": 1.3516199588775635, "learning_rate": 1.8950483770272617e-05, "loss": 1.3206, "step": 221 }, { "epoch": 0.15373961218836565, "grad_norm": 1.3281147480010986, "learning_rate": 1.8940692053968773e-05, "loss": 1.3235, "step": 222 }, { "epoch": 0.15443213296398892, "grad_norm": 1.3650656938552856, "learning_rate": 1.893085742629241e-05, "loss": 1.3977, "step": 223 }, { "epoch": 0.15512465373961218, "grad_norm": 1.270662784576416, "learning_rate": 1.89209799344454e-05, "loss": 1.5181, "step": 224 }, { "epoch": 0.15581717451523547, "grad_norm": 1.3656113147735596, "learning_rate": 1.891105962583533e-05, "loss": 1.3082, "step": 225 }, { "epoch": 0.15650969529085873, "grad_norm": 1.2502739429473877, "learning_rate": 1.8901096548075305e-05, "loss": 1.5326, "step": 226 }, { "epoch": 0.157202216066482, "grad_norm": 1.3605083227157593, "learning_rate": 1.88910907489837e-05, "loss": 1.3527, "step": 227 }, { "epoch": 0.15789473684210525, "grad_norm": 1.2121204137802124, "learning_rate": 1.8881042276583924e-05, "loss": 1.3969, "step": 228 }, { "epoch": 0.15858725761772854, "grad_norm": 1.2529863119125366, "learning_rate": 1.8870951179104214e-05, "loss": 1.4062, "step": 229 }, { "epoch": 0.1592797783933518, "grad_norm": 1.1819360256195068, "learning_rate": 1.8860817504977374e-05, "loss": 1.3974, "step": 230 }, { "epoch": 0.15997229916897507, "grad_norm": 1.234582543373108, "learning_rate": 1.8850641302840565e-05, "loss": 1.4027, "step": 231 }, { "epoch": 0.16066481994459833, "grad_norm": 1.3239039182662964, "learning_rate": 1.8840422621535067e-05, "loss": 1.3196, "step": 232 }, { "epoch": 0.16135734072022162, "grad_norm": 1.1581454277038574, "learning_rate": 1.8830161510106027e-05, "loss": 1.4674, "step": 233 }, { "epoch": 0.16204986149584488, "grad_norm": 1.3907638788223267, "learning_rate": 1.881985801780225e-05, "loss": 1.391, "step": 234 }, { "epoch": 0.16274238227146814, "grad_norm": 1.2587774991989136, "learning_rate": 1.880951219407596e-05, "loss": 1.4388, "step": 235 }, { "epoch": 0.1634349030470914, "grad_norm": 1.3093003034591675, "learning_rate": 1.8799124088582523e-05, "loss": 1.6846, "step": 236 }, { "epoch": 0.16412742382271467, "grad_norm": 1.4729034900665283, "learning_rate": 1.878869375118027e-05, "loss": 1.2827, "step": 237 }, { "epoch": 0.16481994459833796, "grad_norm": 1.171746015548706, "learning_rate": 1.8778221231930204e-05, "loss": 1.517, "step": 238 }, { "epoch": 0.16551246537396122, "grad_norm": 1.384203314781189, "learning_rate": 1.8767706581095796e-05, "loss": 1.5171, "step": 239 }, { "epoch": 0.16620498614958448, "grad_norm": 1.3806113004684448, "learning_rate": 1.8757149849142724e-05, "loss": 1.4459, "step": 240 }, { "epoch": 0.16689750692520774, "grad_norm": 1.2819597721099854, "learning_rate": 1.874655108673864e-05, "loss": 1.5718, "step": 241 }, { "epoch": 0.16759002770083103, "grad_norm": 1.3466342687606812, "learning_rate": 1.8735910344752925e-05, "loss": 1.4098, "step": 242 }, { "epoch": 0.1682825484764543, "grad_norm": 1.301113247871399, "learning_rate": 1.872522767425643e-05, "loss": 1.1911, "step": 243 }, { "epoch": 0.16897506925207756, "grad_norm": 1.115594506263733, "learning_rate": 1.871450312652126e-05, "loss": 1.4129, "step": 244 }, { "epoch": 0.16966759002770082, "grad_norm": 1.236174464225769, "learning_rate": 1.870373675302051e-05, "loss": 1.3763, "step": 245 }, { "epoch": 0.1703601108033241, "grad_norm": 1.6457040309906006, "learning_rate": 1.8692928605428016e-05, "loss": 1.3063, "step": 246 }, { "epoch": 0.17105263157894737, "grad_norm": 1.3574951887130737, "learning_rate": 1.868207873561811e-05, "loss": 1.3223, "step": 247 }, { "epoch": 0.17174515235457063, "grad_norm": 1.3911534547805786, "learning_rate": 1.8671187195665373e-05, "loss": 1.3415, "step": 248 }, { "epoch": 0.1724376731301939, "grad_norm": 1.2706600427627563, "learning_rate": 1.866025403784439e-05, "loss": 1.4396, "step": 249 }, { "epoch": 0.1731301939058172, "grad_norm": 1.2637407779693604, "learning_rate": 1.8649279314629484e-05, "loss": 1.4115, "step": 250 }, { "epoch": 0.17382271468144045, "grad_norm": 1.3504443168640137, "learning_rate": 1.8638263078694483e-05, "loss": 1.333, "step": 251 }, { "epoch": 0.1745152354570637, "grad_norm": 1.1280931234359741, "learning_rate": 1.862720538291245e-05, "loss": 1.454, "step": 252 }, { "epoch": 0.17520775623268697, "grad_norm": 1.309478521347046, "learning_rate": 1.8616106280355443e-05, "loss": 1.455, "step": 253 }, { "epoch": 0.17590027700831026, "grad_norm": 1.4461839199066162, "learning_rate": 1.8604965824294253e-05, "loss": 1.1718, "step": 254 }, { "epoch": 0.17659279778393353, "grad_norm": 1.3986719846725464, "learning_rate": 1.859378406819814e-05, "loss": 1.4925, "step": 255 }, { "epoch": 0.1772853185595568, "grad_norm": 1.329654574394226, "learning_rate": 1.8582561065734602e-05, "loss": 1.4181, "step": 256 }, { "epoch": 0.17797783933518005, "grad_norm": 1.472766637802124, "learning_rate": 1.8571296870769093e-05, "loss": 1.2677, "step": 257 }, { "epoch": 0.1786703601108033, "grad_norm": 1.3047882318496704, "learning_rate": 1.8559991537364767e-05, "loss": 1.4402, "step": 258 }, { "epoch": 0.1793628808864266, "grad_norm": 1.4063674211502075, "learning_rate": 1.854864511978224e-05, "loss": 1.2174, "step": 259 }, { "epoch": 0.18005540166204986, "grad_norm": 1.3601303100585938, "learning_rate": 1.8537257672479293e-05, "loss": 1.5546, "step": 260 }, { "epoch": 0.18074792243767313, "grad_norm": 1.6353052854537964, "learning_rate": 1.8525829250110653e-05, "loss": 1.2166, "step": 261 }, { "epoch": 0.1814404432132964, "grad_norm": 1.4279634952545166, "learning_rate": 1.8514359907527693e-05, "loss": 1.4062, "step": 262 }, { "epoch": 0.18213296398891968, "grad_norm": 1.654395580291748, "learning_rate": 1.8502849699778193e-05, "loss": 1.2227, "step": 263 }, { "epoch": 0.18282548476454294, "grad_norm": 1.4442977905273438, "learning_rate": 1.8491298682106066e-05, "loss": 1.533, "step": 264 }, { "epoch": 0.1835180055401662, "grad_norm": 1.3365898132324219, "learning_rate": 1.8479706909951095e-05, "loss": 1.4334, "step": 265 }, { "epoch": 0.18421052631578946, "grad_norm": 1.3646080493927002, "learning_rate": 1.8468074438948664e-05, "loss": 1.2597, "step": 266 }, { "epoch": 0.18490304709141275, "grad_norm": 1.2578566074371338, "learning_rate": 1.84564013249295e-05, "loss": 1.5369, "step": 267 }, { "epoch": 0.18559556786703602, "grad_norm": 1.4007184505462646, "learning_rate": 1.8444687623919388e-05, "loss": 1.3263, "step": 268 }, { "epoch": 0.18628808864265928, "grad_norm": 1.362870216369629, "learning_rate": 1.8432933392138922e-05, "loss": 1.5152, "step": 269 }, { "epoch": 0.18698060941828254, "grad_norm": 1.277584433555603, "learning_rate": 1.842113868600322e-05, "loss": 1.5021, "step": 270 }, { "epoch": 0.18767313019390583, "grad_norm": 1.2652945518493652, "learning_rate": 1.8409303562121664e-05, "loss": 1.4268, "step": 271 }, { "epoch": 0.1883656509695291, "grad_norm": 1.5869050025939941, "learning_rate": 1.8397428077297622e-05, "loss": 1.4879, "step": 272 }, { "epoch": 0.18905817174515235, "grad_norm": 1.4051944017410278, "learning_rate": 1.838551228852817e-05, "loss": 1.4672, "step": 273 }, { "epoch": 0.18975069252077562, "grad_norm": 1.375993251800537, "learning_rate": 1.837355625300383e-05, "loss": 1.4808, "step": 274 }, { "epoch": 0.1904432132963989, "grad_norm": 1.6987355947494507, "learning_rate": 1.83615600281083e-05, "loss": 1.5659, "step": 275 }, { "epoch": 0.19113573407202217, "grad_norm": 1.5170179605484009, "learning_rate": 1.834952367141816e-05, "loss": 1.5005, "step": 276 }, { "epoch": 0.19182825484764543, "grad_norm": 1.5223069190979004, "learning_rate": 1.8337447240702596e-05, "loss": 1.4851, "step": 277 }, { "epoch": 0.1925207756232687, "grad_norm": 1.4131394624710083, "learning_rate": 1.8325330793923146e-05, "loss": 1.4095, "step": 278 }, { "epoch": 0.19321329639889195, "grad_norm": 1.3926132917404175, "learning_rate": 1.8313174389233403e-05, "loss": 1.3965, "step": 279 }, { "epoch": 0.19390581717451524, "grad_norm": 1.392206072807312, "learning_rate": 1.8300978084978736e-05, "loss": 1.4758, "step": 280 }, { "epoch": 0.1945983379501385, "grad_norm": 1.290574550628662, "learning_rate": 1.8288741939696023e-05, "loss": 1.4714, "step": 281 }, { "epoch": 0.19529085872576177, "grad_norm": 1.2657057046890259, "learning_rate": 1.8276466012113358e-05, "loss": 1.3736, "step": 282 }, { "epoch": 0.19598337950138503, "grad_norm": 1.5142226219177246, "learning_rate": 1.826415036114976e-05, "loss": 1.3636, "step": 283 }, { "epoch": 0.19667590027700832, "grad_norm": 1.4049171209335327, "learning_rate": 1.8251795045914922e-05, "loss": 1.4736, "step": 284 }, { "epoch": 0.19736842105263158, "grad_norm": 1.409709095954895, "learning_rate": 1.8239400125708902e-05, "loss": 1.5229, "step": 285 }, { "epoch": 0.19806094182825484, "grad_norm": 1.5270507335662842, "learning_rate": 1.8226965660021836e-05, "loss": 1.4371, "step": 286 }, { "epoch": 0.1987534626038781, "grad_norm": 1.3285179138183594, "learning_rate": 1.8214491708533667e-05, "loss": 1.4363, "step": 287 }, { "epoch": 0.1994459833795014, "grad_norm": 1.5918437242507935, "learning_rate": 1.8201978331113855e-05, "loss": 1.4965, "step": 288 }, { "epoch": 0.20013850415512466, "grad_norm": 1.3763656616210938, "learning_rate": 1.8189425587821083e-05, "loss": 1.6337, "step": 289 }, { "epoch": 0.20083102493074792, "grad_norm": 1.6876428127288818, "learning_rate": 1.817683353890297e-05, "loss": 1.1922, "step": 290 }, { "epoch": 0.20152354570637118, "grad_norm": 1.462172031402588, "learning_rate": 1.8164202244795795e-05, "loss": 1.4912, "step": 291 }, { "epoch": 0.20221606648199447, "grad_norm": 1.4951701164245605, "learning_rate": 1.8151531766124186e-05, "loss": 1.5338, "step": 292 }, { "epoch": 0.20290858725761773, "grad_norm": 1.4076398611068726, "learning_rate": 1.8138822163700847e-05, "loss": 1.382, "step": 293 }, { "epoch": 0.203601108033241, "grad_norm": 1.4428248405456543, "learning_rate": 1.8126073498526254e-05, "loss": 1.3539, "step": 294 }, { "epoch": 0.20429362880886426, "grad_norm": 1.4771924018859863, "learning_rate": 1.8113285831788364e-05, "loss": 1.4596, "step": 295 }, { "epoch": 0.20498614958448755, "grad_norm": 1.6791538000106812, "learning_rate": 1.8100459224862336e-05, "loss": 1.1887, "step": 296 }, { "epoch": 0.2056786703601108, "grad_norm": 1.3377101421356201, "learning_rate": 1.8087593739310214e-05, "loss": 1.5108, "step": 297 }, { "epoch": 0.20637119113573407, "grad_norm": 1.5619332790374756, "learning_rate": 1.8074689436880643e-05, "loss": 1.3056, "step": 298 }, { "epoch": 0.20706371191135733, "grad_norm": 1.4617923498153687, "learning_rate": 1.806174637950858e-05, "loss": 1.5482, "step": 299 }, { "epoch": 0.2077562326869806, "grad_norm": 1.582040786743164, "learning_rate": 1.804876462931498e-05, "loss": 1.4631, "step": 300 }, { "epoch": 0.2084487534626039, "grad_norm": 1.5981621742248535, "learning_rate": 1.803574424860651e-05, "loss": 1.4452, "step": 301 }, { "epoch": 0.20914127423822715, "grad_norm": 1.3639321327209473, "learning_rate": 1.8022685299875245e-05, "loss": 1.5022, "step": 302 }, { "epoch": 0.2098337950138504, "grad_norm": 1.5008395910263062, "learning_rate": 1.800958784579837e-05, "loss": 1.4582, "step": 303 }, { "epoch": 0.21052631578947367, "grad_norm": 1.4245600700378418, "learning_rate": 1.799645194923788e-05, "loss": 1.4573, "step": 304 }, { "epoch": 0.21121883656509696, "grad_norm": 1.5339776277542114, "learning_rate": 1.7983277673240277e-05, "loss": 1.5179, "step": 305 }, { "epoch": 0.21191135734072022, "grad_norm": 1.4085662364959717, "learning_rate": 1.7970065081036266e-05, "loss": 1.4609, "step": 306 }, { "epoch": 0.2126038781163435, "grad_norm": 1.540628433227539, "learning_rate": 1.795681423604045e-05, "loss": 1.4168, "step": 307 }, { "epoch": 0.21329639889196675, "grad_norm": 1.3457483053207397, "learning_rate": 1.7943525201851038e-05, "loss": 1.3792, "step": 308 }, { "epoch": 0.21398891966759004, "grad_norm": 1.6046233177185059, "learning_rate": 1.7930198042249523e-05, "loss": 1.4604, "step": 309 }, { "epoch": 0.2146814404432133, "grad_norm": 1.6902879476547241, "learning_rate": 1.7916832821200375e-05, "loss": 1.3465, "step": 310 }, { "epoch": 0.21537396121883656, "grad_norm": 1.4801162481307983, "learning_rate": 1.7903429602850765e-05, "loss": 1.4711, "step": 311 }, { "epoch": 0.21606648199445982, "grad_norm": 1.7086902856826782, "learning_rate": 1.7889988451530208e-05, "loss": 1.3191, "step": 312 }, { "epoch": 0.21675900277008311, "grad_norm": 1.6342074871063232, "learning_rate": 1.7876509431750303e-05, "loss": 1.3109, "step": 313 }, { "epoch": 0.21745152354570638, "grad_norm": 1.7456109523773193, "learning_rate": 1.7862992608204384e-05, "loss": 1.5732, "step": 314 }, { "epoch": 0.21814404432132964, "grad_norm": 1.527922511100769, "learning_rate": 1.7849438045767233e-05, "loss": 1.2966, "step": 315 }, { "epoch": 0.2188365650969529, "grad_norm": 1.6418479681015015, "learning_rate": 1.783584580949477e-05, "loss": 1.4165, "step": 316 }, { "epoch": 0.2195290858725762, "grad_norm": 1.5722501277923584, "learning_rate": 1.782221596462372e-05, "loss": 1.4292, "step": 317 }, { "epoch": 0.22022160664819945, "grad_norm": 1.8803280591964722, "learning_rate": 1.7808548576571314e-05, "loss": 1.1589, "step": 318 }, { "epoch": 0.22091412742382271, "grad_norm": 1.510886549949646, "learning_rate": 1.7794843710934982e-05, "loss": 1.3436, "step": 319 }, { "epoch": 0.22160664819944598, "grad_norm": 1.5350871086120605, "learning_rate": 1.7781101433492026e-05, "loss": 1.5063, "step": 320 }, { "epoch": 0.22229916897506924, "grad_norm": 1.538533329963684, "learning_rate": 1.7767321810199305e-05, "loss": 1.5124, "step": 321 }, { "epoch": 0.22299168975069253, "grad_norm": 1.6667261123657227, "learning_rate": 1.7753504907192923e-05, "loss": 1.318, "step": 322 }, { "epoch": 0.2236842105263158, "grad_norm": 1.5465984344482422, "learning_rate": 1.7739650790787915e-05, "loss": 1.4244, "step": 323 }, { "epoch": 0.22437673130193905, "grad_norm": 1.4366633892059326, "learning_rate": 1.7725759527477923e-05, "loss": 1.5743, "step": 324 }, { "epoch": 0.22506925207756232, "grad_norm": 1.5777736902236938, "learning_rate": 1.771183118393486e-05, "loss": 1.4462, "step": 325 }, { "epoch": 0.2257617728531856, "grad_norm": 1.6491996049880981, "learning_rate": 1.769786582700864e-05, "loss": 1.3709, "step": 326 }, { "epoch": 0.22645429362880887, "grad_norm": 1.3817274570465088, "learning_rate": 1.7683863523726798e-05, "loss": 1.4792, "step": 327 }, { "epoch": 0.22714681440443213, "grad_norm": 1.506965160369873, "learning_rate": 1.7669824341294203e-05, "loss": 1.3656, "step": 328 }, { "epoch": 0.2278393351800554, "grad_norm": 1.699218988418579, "learning_rate": 1.7655748347092735e-05, "loss": 1.4267, "step": 329 }, { "epoch": 0.22853185595567868, "grad_norm": 1.546741247177124, "learning_rate": 1.7641635608680942e-05, "loss": 1.5038, "step": 330 }, { "epoch": 0.22922437673130194, "grad_norm": 1.7047183513641357, "learning_rate": 1.7627486193793744e-05, "loss": 1.3509, "step": 331 }, { "epoch": 0.2299168975069252, "grad_norm": 1.6500473022460938, "learning_rate": 1.7613300170342073e-05, "loss": 1.3246, "step": 332 }, { "epoch": 0.23060941828254847, "grad_norm": 1.4320755004882812, "learning_rate": 1.7599077606412577e-05, "loss": 1.416, "step": 333 }, { "epoch": 0.23130193905817176, "grad_norm": 1.5516586303710938, "learning_rate": 1.7584818570267287e-05, "loss": 1.4455, "step": 334 }, { "epoch": 0.23199445983379502, "grad_norm": 1.444295883178711, "learning_rate": 1.757052313034327e-05, "loss": 1.4894, "step": 335 }, { "epoch": 0.23268698060941828, "grad_norm": 1.6120071411132812, "learning_rate": 1.755619135525233e-05, "loss": 1.3861, "step": 336 }, { "epoch": 0.23337950138504154, "grad_norm": 1.4913694858551025, "learning_rate": 1.754182331378065e-05, "loss": 1.4899, "step": 337 }, { "epoch": 0.23407202216066483, "grad_norm": 1.5728522539138794, "learning_rate": 1.7527419074888483e-05, "loss": 1.458, "step": 338 }, { "epoch": 0.2347645429362881, "grad_norm": 1.6220054626464844, "learning_rate": 1.7512978707709817e-05, "loss": 1.4327, "step": 339 }, { "epoch": 0.23545706371191136, "grad_norm": 1.4640010595321655, "learning_rate": 1.749850228155203e-05, "loss": 1.3738, "step": 340 }, { "epoch": 0.23614958448753462, "grad_norm": 1.6346479654312134, "learning_rate": 1.7483989865895582e-05, "loss": 1.4064, "step": 341 }, { "epoch": 0.23684210526315788, "grad_norm": 1.6318504810333252, "learning_rate": 1.7469441530393652e-05, "loss": 1.3148, "step": 342 }, { "epoch": 0.23753462603878117, "grad_norm": 1.7678024768829346, "learning_rate": 1.7454857344871823e-05, "loss": 1.2937, "step": 343 }, { "epoch": 0.23822714681440443, "grad_norm": 1.9176974296569824, "learning_rate": 1.7440237379327745e-05, "loss": 1.3801, "step": 344 }, { "epoch": 0.2389196675900277, "grad_norm": 1.8107141256332397, "learning_rate": 1.7425581703930793e-05, "loss": 1.2044, "step": 345 }, { "epoch": 0.23961218836565096, "grad_norm": 1.5848854780197144, "learning_rate": 1.7410890389021737e-05, "loss": 1.4562, "step": 346 }, { "epoch": 0.24030470914127425, "grad_norm": 1.6646206378936768, "learning_rate": 1.7396163505112398e-05, "loss": 1.279, "step": 347 }, { "epoch": 0.2409972299168975, "grad_norm": 1.407603144645691, "learning_rate": 1.7381401122885316e-05, "loss": 1.2988, "step": 348 }, { "epoch": 0.24168975069252077, "grad_norm": 1.5010101795196533, "learning_rate": 1.73666033131934e-05, "loss": 1.3755, "step": 349 }, { "epoch": 0.24238227146814403, "grad_norm": 1.5880669355392456, "learning_rate": 1.7351770147059604e-05, "loss": 1.5101, "step": 350 }, { "epoch": 0.24307479224376732, "grad_norm": 1.6278942823410034, "learning_rate": 1.733690169567657e-05, "loss": 1.3891, "step": 351 }, { "epoch": 0.24376731301939059, "grad_norm": 1.6076022386550903, "learning_rate": 1.7321998030406303e-05, "loss": 1.4076, "step": 352 }, { "epoch": 0.24445983379501385, "grad_norm": 1.4889235496520996, "learning_rate": 1.7307059222779806e-05, "loss": 1.3706, "step": 353 }, { "epoch": 0.2451523545706371, "grad_norm": 1.619982361793518, "learning_rate": 1.729208534449676e-05, "loss": 1.3843, "step": 354 }, { "epoch": 0.2458448753462604, "grad_norm": 1.5546656847000122, "learning_rate": 1.7277076467425163e-05, "loss": 1.4328, "step": 355 }, { "epoch": 0.24653739612188366, "grad_norm": 1.69061279296875, "learning_rate": 1.7262032663601003e-05, "loss": 1.3942, "step": 356 }, { "epoch": 0.24722991689750692, "grad_norm": 1.8314852714538574, "learning_rate": 1.7246954005227884e-05, "loss": 1.3588, "step": 357 }, { "epoch": 0.24792243767313019, "grad_norm": 1.7705515623092651, "learning_rate": 1.723184056467671e-05, "loss": 1.4111, "step": 358 }, { "epoch": 0.24861495844875348, "grad_norm": 1.7024461030960083, "learning_rate": 1.721669241448532e-05, "loss": 1.3465, "step": 359 }, { "epoch": 0.24930747922437674, "grad_norm": 1.7152822017669678, "learning_rate": 1.7201509627358143e-05, "loss": 1.3865, "step": 360 }, { "epoch": 0.25, "grad_norm": 1.9307787418365479, "learning_rate": 1.718629227616585e-05, "loss": 1.3747, "step": 361 }, { "epoch": 0.25, "eval_loss": 1.0831600427627563, "eval_runtime": 335.7463, "eval_samples_per_second": 4.217, "eval_steps_per_second": 0.527, "step": 361 }, { "epoch": 0.25069252077562326, "grad_norm": 2.011570692062378, "learning_rate": 1.7171040433945006e-05, "loss": 1.219, "step": 362 }, { "epoch": 0.2513850415512465, "grad_norm": 1.763128399848938, "learning_rate": 1.7155754173897718e-05, "loss": 1.4044, "step": 363 }, { "epoch": 0.2520775623268698, "grad_norm": 1.6575963497161865, "learning_rate": 1.7140433569391275e-05, "loss": 1.5222, "step": 364 }, { "epoch": 0.25277008310249305, "grad_norm": 1.6661745309829712, "learning_rate": 1.7125078693957817e-05, "loss": 1.5345, "step": 365 }, { "epoch": 0.25346260387811637, "grad_norm": 1.8055832386016846, "learning_rate": 1.710968962129396e-05, "loss": 1.5074, "step": 366 }, { "epoch": 0.25415512465373963, "grad_norm": 1.827330231666565, "learning_rate": 1.709426642526046e-05, "loss": 1.4927, "step": 367 }, { "epoch": 0.2548476454293629, "grad_norm": 1.907296895980835, "learning_rate": 1.7078809179881847e-05, "loss": 1.2675, "step": 368 }, { "epoch": 0.25554016620498615, "grad_norm": 1.5925551652908325, "learning_rate": 1.706331795934606e-05, "loss": 1.369, "step": 369 }, { "epoch": 0.2562326869806094, "grad_norm": 1.6697511672973633, "learning_rate": 1.704779283800412e-05, "loss": 1.4416, "step": 370 }, { "epoch": 0.2569252077562327, "grad_norm": 1.766982913017273, "learning_rate": 1.7032233890369755e-05, "loss": 1.4247, "step": 371 }, { "epoch": 0.25761772853185594, "grad_norm": 1.8494454622268677, "learning_rate": 1.701664119111904e-05, "loss": 1.4531, "step": 372 }, { "epoch": 0.2583102493074792, "grad_norm": 1.8136225938796997, "learning_rate": 1.700101481509004e-05, "loss": 1.4062, "step": 373 }, { "epoch": 0.2590027700831025, "grad_norm": 1.662751317024231, "learning_rate": 1.6985354837282462e-05, "loss": 1.4713, "step": 374 }, { "epoch": 0.2596952908587258, "grad_norm": 1.670058012008667, "learning_rate": 1.6969661332857278e-05, "loss": 1.5039, "step": 375 }, { "epoch": 0.26038781163434904, "grad_norm": 1.7041593790054321, "learning_rate": 1.6953934377136375e-05, "loss": 1.5144, "step": 376 }, { "epoch": 0.2610803324099723, "grad_norm": 1.9452157020568848, "learning_rate": 1.6938174045602203e-05, "loss": 1.5408, "step": 377 }, { "epoch": 0.26177285318559557, "grad_norm": 1.7609469890594482, "learning_rate": 1.6922380413897382e-05, "loss": 1.4979, "step": 378 }, { "epoch": 0.26246537396121883, "grad_norm": 1.777033805847168, "learning_rate": 1.6906553557824372e-05, "loss": 1.3503, "step": 379 }, { "epoch": 0.2631578947368421, "grad_norm": 1.6396095752716064, "learning_rate": 1.689069355334509e-05, "loss": 1.3918, "step": 380 }, { "epoch": 0.26385041551246535, "grad_norm": 1.517849087715149, "learning_rate": 1.6874800476580553e-05, "loss": 1.4, "step": 381 }, { "epoch": 0.26454293628808867, "grad_norm": 1.8570725917816162, "learning_rate": 1.6858874403810507e-05, "loss": 1.2621, "step": 382 }, { "epoch": 0.26523545706371193, "grad_norm": 1.9514487981796265, "learning_rate": 1.684291541147307e-05, "loss": 1.4256, "step": 383 }, { "epoch": 0.2659279778393352, "grad_norm": 1.9198554754257202, "learning_rate": 1.682692357616435e-05, "loss": 1.5005, "step": 384 }, { "epoch": 0.26662049861495846, "grad_norm": 1.7078975439071655, "learning_rate": 1.6810898974638098e-05, "loss": 1.3862, "step": 385 }, { "epoch": 0.2673130193905817, "grad_norm": 1.663574457168579, "learning_rate": 1.679484168380532e-05, "loss": 1.3513, "step": 386 }, { "epoch": 0.268005540166205, "grad_norm": 1.7911577224731445, "learning_rate": 1.677875178073392e-05, "loss": 1.4409, "step": 387 }, { "epoch": 0.26869806094182824, "grad_norm": 1.7094714641571045, "learning_rate": 1.676262934264832e-05, "loss": 1.5266, "step": 388 }, { "epoch": 0.2693905817174515, "grad_norm": 1.6942095756530762, "learning_rate": 1.674647444692911e-05, "loss": 1.4287, "step": 389 }, { "epoch": 0.27008310249307477, "grad_norm": 1.7591415643692017, "learning_rate": 1.6730287171112652e-05, "loss": 1.3419, "step": 390 }, { "epoch": 0.2707756232686981, "grad_norm": 1.739908218383789, "learning_rate": 1.6714067592890713e-05, "loss": 1.3593, "step": 391 }, { "epoch": 0.27146814404432135, "grad_norm": 1.729467749595642, "learning_rate": 1.669781579011011e-05, "loss": 1.4044, "step": 392 }, { "epoch": 0.2721606648199446, "grad_norm": 1.7394282817840576, "learning_rate": 1.6681531840772314e-05, "loss": 1.5558, "step": 393 }, { "epoch": 0.27285318559556787, "grad_norm": 1.867372989654541, "learning_rate": 1.666521582303309e-05, "loss": 1.4419, "step": 394 }, { "epoch": 0.27354570637119113, "grad_norm": 1.7546651363372803, "learning_rate": 1.664886781520212e-05, "loss": 1.4957, "step": 395 }, { "epoch": 0.2742382271468144, "grad_norm": 2.4754140377044678, "learning_rate": 1.6632487895742612e-05, "loss": 1.3339, "step": 396 }, { "epoch": 0.27493074792243766, "grad_norm": 1.6385406255722046, "learning_rate": 1.661607614327095e-05, "loss": 1.3994, "step": 397 }, { "epoch": 0.2756232686980609, "grad_norm": 1.951462984085083, "learning_rate": 1.6599632636556292e-05, "loss": 1.2803, "step": 398 }, { "epoch": 0.27631578947368424, "grad_norm": 1.989600419998169, "learning_rate": 1.6583157454520214e-05, "loss": 1.359, "step": 399 }, { "epoch": 0.2770083102493075, "grad_norm": 1.6627402305603027, "learning_rate": 1.6566650676236307e-05, "loss": 1.4155, "step": 400 }, { "epoch": 0.27770083102493076, "grad_norm": 1.7846615314483643, "learning_rate": 1.6550112380929814e-05, "loss": 1.6159, "step": 401 }, { "epoch": 0.278393351800554, "grad_norm": 1.8167567253112793, "learning_rate": 1.653354264797725e-05, "loss": 1.411, "step": 402 }, { "epoch": 0.2790858725761773, "grad_norm": 2.0551259517669678, "learning_rate": 1.651694155690601e-05, "loss": 1.0842, "step": 403 }, { "epoch": 0.27977839335180055, "grad_norm": 1.8748019933700562, "learning_rate": 1.6500309187394005e-05, "loss": 1.3191, "step": 404 }, { "epoch": 0.2804709141274238, "grad_norm": 1.9822982549667358, "learning_rate": 1.6483645619269256e-05, "loss": 1.3953, "step": 405 }, { "epoch": 0.28116343490304707, "grad_norm": 1.8638688325881958, "learning_rate": 1.6466950932509532e-05, "loss": 1.3538, "step": 406 }, { "epoch": 0.28185595567867033, "grad_norm": 1.7385188341140747, "learning_rate": 1.645022520724195e-05, "loss": 1.4492, "step": 407 }, { "epoch": 0.28254847645429365, "grad_norm": 1.7316495180130005, "learning_rate": 1.643346852374261e-05, "loss": 1.3456, "step": 408 }, { "epoch": 0.2832409972299169, "grad_norm": 1.8079771995544434, "learning_rate": 1.641668096243619e-05, "loss": 1.3072, "step": 409 }, { "epoch": 0.2839335180055402, "grad_norm": 2.2126691341400146, "learning_rate": 1.6399862603895563e-05, "loss": 1.327, "step": 410 }, { "epoch": 0.28462603878116344, "grad_norm": 1.7359458208084106, "learning_rate": 1.6383013528841424e-05, "loss": 1.5197, "step": 411 }, { "epoch": 0.2853185595567867, "grad_norm": 1.96955406665802, "learning_rate": 1.6366133818141893e-05, "loss": 1.3577, "step": 412 }, { "epoch": 0.28601108033240996, "grad_norm": 1.8084375858306885, "learning_rate": 1.6349223552812125e-05, "loss": 1.3055, "step": 413 }, { "epoch": 0.2867036011080332, "grad_norm": 1.7716041803359985, "learning_rate": 1.633228281401392e-05, "loss": 1.3429, "step": 414 }, { "epoch": 0.2873961218836565, "grad_norm": 2.1030023097991943, "learning_rate": 1.631531168305534e-05, "loss": 1.142, "step": 415 }, { "epoch": 0.2880886426592798, "grad_norm": 1.6409399509429932, "learning_rate": 1.6298310241390326e-05, "loss": 1.5097, "step": 416 }, { "epoch": 0.28878116343490307, "grad_norm": 1.7756257057189941, "learning_rate": 1.6281278570618276e-05, "loss": 1.3902, "step": 417 }, { "epoch": 0.2894736842105263, "grad_norm": 1.753067970275879, "learning_rate": 1.6264216752483697e-05, "loss": 1.358, "step": 418 }, { "epoch": 0.2901662049861496, "grad_norm": 2.262544631958008, "learning_rate": 1.6247124868875777e-05, "loss": 1.4347, "step": 419 }, { "epoch": 0.29085872576177285, "grad_norm": 1.791831135749817, "learning_rate": 1.6230003001828e-05, "loss": 1.4917, "step": 420 }, { "epoch": 0.2915512465373961, "grad_norm": 1.7962827682495117, "learning_rate": 1.6212851233517772e-05, "loss": 1.3176, "step": 421 }, { "epoch": 0.2922437673130194, "grad_norm": 1.8246115446090698, "learning_rate": 1.6195669646266003e-05, "loss": 1.2801, "step": 422 }, { "epoch": 0.29293628808864264, "grad_norm": 1.9331282377243042, "learning_rate": 1.6178458322536722e-05, "loss": 1.3653, "step": 423 }, { "epoch": 0.29362880886426596, "grad_norm": 1.9322010278701782, "learning_rate": 1.616121734493668e-05, "loss": 1.4005, "step": 424 }, { "epoch": 0.2943213296398892, "grad_norm": 1.946174144744873, "learning_rate": 1.614394679621495e-05, "loss": 1.5496, "step": 425 }, { "epoch": 0.2950138504155125, "grad_norm": 1.8700590133666992, "learning_rate": 1.6126646759262548e-05, "loss": 1.3886, "step": 426 }, { "epoch": 0.29570637119113574, "grad_norm": 2.0393857955932617, "learning_rate": 1.6109317317111996e-05, "loss": 1.2218, "step": 427 }, { "epoch": 0.296398891966759, "grad_norm": 1.9611023664474487, "learning_rate": 1.609195855293697e-05, "loss": 1.3157, "step": 428 }, { "epoch": 0.29709141274238227, "grad_norm": 2.472890853881836, "learning_rate": 1.6074570550051864e-05, "loss": 1.3484, "step": 429 }, { "epoch": 0.29778393351800553, "grad_norm": 1.8426192998886108, "learning_rate": 1.6057153391911422e-05, "loss": 1.3931, "step": 430 }, { "epoch": 0.2984764542936288, "grad_norm": 1.7728618383407593, "learning_rate": 1.6039707162110296e-05, "loss": 1.3515, "step": 431 }, { "epoch": 0.29916897506925205, "grad_norm": 2.0701711177825928, "learning_rate": 1.6022231944382693e-05, "loss": 1.3197, "step": 432 }, { "epoch": 0.29986149584487537, "grad_norm": 1.7944912910461426, "learning_rate": 1.6004727822601932e-05, "loss": 1.4434, "step": 433 }, { "epoch": 0.30055401662049863, "grad_norm": 1.810572862625122, "learning_rate": 1.598719488078007e-05, "loss": 1.331, "step": 434 }, { "epoch": 0.3012465373961219, "grad_norm": 1.7466157674789429, "learning_rate": 1.596963320306748e-05, "loss": 1.3863, "step": 435 }, { "epoch": 0.30193905817174516, "grad_norm": 1.9109450578689575, "learning_rate": 1.5952042873752463e-05, "loss": 1.3476, "step": 436 }, { "epoch": 0.3026315789473684, "grad_norm": 1.788562297821045, "learning_rate": 1.593442397726082e-05, "loss": 1.3668, "step": 437 }, { "epoch": 0.3033240997229917, "grad_norm": 1.8199845552444458, "learning_rate": 1.5916776598155478e-05, "loss": 1.3688, "step": 438 }, { "epoch": 0.30401662049861494, "grad_norm": 1.8265742063522339, "learning_rate": 1.5899100821136065e-05, "loss": 1.4673, "step": 439 }, { "epoch": 0.3047091412742382, "grad_norm": 1.729217290878296, "learning_rate": 1.5881396731038493e-05, "loss": 1.4452, "step": 440 }, { "epoch": 0.3054016620498615, "grad_norm": 1.8088419437408447, "learning_rate": 1.5863664412834582e-05, "loss": 1.5256, "step": 441 }, { "epoch": 0.3060941828254848, "grad_norm": 1.9562954902648926, "learning_rate": 1.584590395163162e-05, "loss": 1.3788, "step": 442 }, { "epoch": 0.30678670360110805, "grad_norm": 2.257169008255005, "learning_rate": 1.5828115432671984e-05, "loss": 1.2451, "step": 443 }, { "epoch": 0.3074792243767313, "grad_norm": 1.9341715574264526, "learning_rate": 1.5810298941332696e-05, "loss": 1.4096, "step": 444 }, { "epoch": 0.30817174515235457, "grad_norm": 2.0554494857788086, "learning_rate": 1.579245456312506e-05, "loss": 1.552, "step": 445 }, { "epoch": 0.30886426592797783, "grad_norm": 2.0175342559814453, "learning_rate": 1.5774582383694196e-05, "loss": 1.4782, "step": 446 }, { "epoch": 0.3095567867036011, "grad_norm": 1.9586889743804932, "learning_rate": 1.5756682488818674e-05, "loss": 1.401, "step": 447 }, { "epoch": 0.31024930747922436, "grad_norm": 2.00122332572937, "learning_rate": 1.5738754964410084e-05, "loss": 1.3873, "step": 448 }, { "epoch": 0.3109418282548476, "grad_norm": 1.9168496131896973, "learning_rate": 1.5720799896512625e-05, "loss": 1.329, "step": 449 }, { "epoch": 0.31163434903047094, "grad_norm": 2.23854923248291, "learning_rate": 1.5702817371302684e-05, "loss": 1.3508, "step": 450 }, { "epoch": 0.3123268698060942, "grad_norm": 1.7631722688674927, "learning_rate": 1.5684807475088453e-05, "loss": 1.4677, "step": 451 }, { "epoch": 0.31301939058171746, "grad_norm": 2.0044777393341064, "learning_rate": 1.5666770294309467e-05, "loss": 1.3986, "step": 452 }, { "epoch": 0.3137119113573407, "grad_norm": 1.8325257301330566, "learning_rate": 1.5648705915536227e-05, "loss": 1.2592, "step": 453 }, { "epoch": 0.314404432132964, "grad_norm": 2.203279495239258, "learning_rate": 1.5630614425469776e-05, "loss": 1.1724, "step": 454 }, { "epoch": 0.31509695290858725, "grad_norm": 1.7246652841567993, "learning_rate": 1.561249591094127e-05, "loss": 1.4664, "step": 455 }, { "epoch": 0.3157894736842105, "grad_norm": 2.057136058807373, "learning_rate": 1.5594350458911586e-05, "loss": 1.3707, "step": 456 }, { "epoch": 0.31648199445983377, "grad_norm": 1.8668776750564575, "learning_rate": 1.5576178156470863e-05, "loss": 1.4212, "step": 457 }, { "epoch": 0.3171745152354571, "grad_norm": 1.90840744972229, "learning_rate": 1.5557979090838136e-05, "loss": 1.529, "step": 458 }, { "epoch": 0.31786703601108035, "grad_norm": 2.1340227127075195, "learning_rate": 1.553975334936088e-05, "loss": 1.4705, "step": 459 }, { "epoch": 0.3185595567867036, "grad_norm": 2.315753698348999, "learning_rate": 1.55215010195146e-05, "loss": 1.2551, "step": 460 }, { "epoch": 0.3192520775623269, "grad_norm": 2.071791172027588, "learning_rate": 1.5503222188902412e-05, "loss": 1.2973, "step": 461 }, { "epoch": 0.31994459833795014, "grad_norm": 1.9834339618682861, "learning_rate": 1.5484916945254642e-05, "loss": 1.351, "step": 462 }, { "epoch": 0.3206371191135734, "grad_norm": 1.92372465133667, "learning_rate": 1.5466585376428365e-05, "loss": 1.3197, "step": 463 }, { "epoch": 0.32132963988919666, "grad_norm": 2.1652302742004395, "learning_rate": 1.5448227570407012e-05, "loss": 1.3793, "step": 464 }, { "epoch": 0.3220221606648199, "grad_norm": 2.0220046043395996, "learning_rate": 1.5429843615299953e-05, "loss": 1.3064, "step": 465 }, { "epoch": 0.32271468144044324, "grad_norm": 2.301950454711914, "learning_rate": 1.5411433599342038e-05, "loss": 1.2608, "step": 466 }, { "epoch": 0.3234072022160665, "grad_norm": 2.135833501815796, "learning_rate": 1.539299761089322e-05, "loss": 1.1929, "step": 467 }, { "epoch": 0.32409972299168976, "grad_norm": 1.8044958114624023, "learning_rate": 1.5374535738438105e-05, "loss": 1.3328, "step": 468 }, { "epoch": 0.324792243767313, "grad_norm": 1.8159520626068115, "learning_rate": 1.5356048070585513e-05, "loss": 1.2574, "step": 469 }, { "epoch": 0.3254847645429363, "grad_norm": 1.9937934875488281, "learning_rate": 1.5337534696068088e-05, "loss": 1.2826, "step": 470 }, { "epoch": 0.32617728531855955, "grad_norm": 2.1979124546051025, "learning_rate": 1.5318995703741854e-05, "loss": 1.4514, "step": 471 }, { "epoch": 0.3268698060941828, "grad_norm": 2.0025925636291504, "learning_rate": 1.5300431182585777e-05, "loss": 1.499, "step": 472 }, { "epoch": 0.3275623268698061, "grad_norm": 2.007138252258301, "learning_rate": 1.528184122170137e-05, "loss": 1.2578, "step": 473 }, { "epoch": 0.32825484764542934, "grad_norm": 1.976704478263855, "learning_rate": 1.5263225910312222e-05, "loss": 1.5246, "step": 474 }, { "epoch": 0.32894736842105265, "grad_norm": 2.6022000312805176, "learning_rate": 1.524458533776361e-05, "loss": 1.1411, "step": 475 }, { "epoch": 0.3296398891966759, "grad_norm": 2.1338884830474854, "learning_rate": 1.5225919593522049e-05, "loss": 1.3387, "step": 476 }, { "epoch": 0.3303324099722992, "grad_norm": 2.0747604370117188, "learning_rate": 1.5207228767174865e-05, "loss": 1.3682, "step": 477 }, { "epoch": 0.33102493074792244, "grad_norm": 2.2153992652893066, "learning_rate": 1.5188512948429765e-05, "loss": 1.2949, "step": 478 }, { "epoch": 0.3317174515235457, "grad_norm": 2.1263861656188965, "learning_rate": 1.516977222711442e-05, "loss": 1.3209, "step": 479 }, { "epoch": 0.33240997229916897, "grad_norm": 2.2222840785980225, "learning_rate": 1.5151006693176005e-05, "loss": 1.1555, "step": 480 }, { "epoch": 0.3331024930747922, "grad_norm": 2.1737382411956787, "learning_rate": 1.5132216436680798e-05, "loss": 1.2706, "step": 481 }, { "epoch": 0.3337950138504155, "grad_norm": 1.967319369316101, "learning_rate": 1.5113401547813732e-05, "loss": 1.3886, "step": 482 }, { "epoch": 0.3344875346260388, "grad_norm": 2.301297664642334, "learning_rate": 1.5094562116877955e-05, "loss": 1.2934, "step": 483 }, { "epoch": 0.33518005540166207, "grad_norm": 1.921366810798645, "learning_rate": 1.5075698234294424e-05, "loss": 1.4122, "step": 484 }, { "epoch": 0.33587257617728533, "grad_norm": 2.278940439224243, "learning_rate": 1.5056809990601434e-05, "loss": 1.3407, "step": 485 }, { "epoch": 0.3365650969529086, "grad_norm": 2.016507148742676, "learning_rate": 1.5037897476454219e-05, "loss": 1.4208, "step": 486 }, { "epoch": 0.33725761772853186, "grad_norm": 1.9419777393341064, "learning_rate": 1.5018960782624486e-05, "loss": 1.3269, "step": 487 }, { "epoch": 0.3379501385041551, "grad_norm": 1.8907411098480225, "learning_rate": 1.5000000000000002e-05, "loss": 1.493, "step": 488 }, { "epoch": 0.3386426592797784, "grad_norm": 2.395200252532959, "learning_rate": 1.4981015219584149e-05, "loss": 1.2829, "step": 489 }, { "epoch": 0.33933518005540164, "grad_norm": 2.1097700595855713, "learning_rate": 1.496200653249549e-05, "loss": 1.4754, "step": 490 }, { "epoch": 0.3400277008310249, "grad_norm": 2.089921474456787, "learning_rate": 1.4942974029967319e-05, "loss": 1.389, "step": 491 }, { "epoch": 0.3407202216066482, "grad_norm": 2.3994808197021484, "learning_rate": 1.492391780334725e-05, "loss": 1.2998, "step": 492 }, { "epoch": 0.3414127423822715, "grad_norm": 2.2223410606384277, "learning_rate": 1.4904837944096744e-05, "loss": 1.3659, "step": 493 }, { "epoch": 0.34210526315789475, "grad_norm": 2.277879238128662, "learning_rate": 1.4885734543790707e-05, "loss": 1.3224, "step": 494 }, { "epoch": 0.342797783933518, "grad_norm": 2.726764440536499, "learning_rate": 1.4866607694117012e-05, "loss": 1.5039, "step": 495 }, { "epoch": 0.34349030470914127, "grad_norm": 2.2179274559020996, "learning_rate": 1.4847457486876097e-05, "loss": 1.3766, "step": 496 }, { "epoch": 0.34418282548476453, "grad_norm": 2.170088052749634, "learning_rate": 1.4828284013980492e-05, "loss": 1.5275, "step": 497 }, { "epoch": 0.3448753462603878, "grad_norm": 2.057279586791992, "learning_rate": 1.4809087367454402e-05, "loss": 1.409, "step": 498 }, { "epoch": 0.34556786703601106, "grad_norm": 2.0357258319854736, "learning_rate": 1.478986763943325e-05, "loss": 1.4436, "step": 499 }, { "epoch": 0.3462603878116344, "grad_norm": 2.172924518585205, "learning_rate": 1.4770624922163233e-05, "loss": 1.2419, "step": 500 }, { "epoch": 0.34695290858725764, "grad_norm": 2.027270793914795, "learning_rate": 1.4751359308000904e-05, "loss": 1.3761, "step": 501 }, { "epoch": 0.3476454293628809, "grad_norm": 2.094010591506958, "learning_rate": 1.4732070889412693e-05, "loss": 1.5432, "step": 502 }, { "epoch": 0.34833795013850416, "grad_norm": 1.9574612379074097, "learning_rate": 1.4712759758974496e-05, "loss": 1.3299, "step": 503 }, { "epoch": 0.3490304709141274, "grad_norm": 2.2416937351226807, "learning_rate": 1.4693426009371203e-05, "loss": 1.2703, "step": 504 }, { "epoch": 0.3497229916897507, "grad_norm": 2.2920377254486084, "learning_rate": 1.4674069733396277e-05, "loss": 1.359, "step": 505 }, { "epoch": 0.35041551246537395, "grad_norm": 2.218531847000122, "learning_rate": 1.4654691023951289e-05, "loss": 1.3286, "step": 506 }, { "epoch": 0.3511080332409972, "grad_norm": 2.079054117202759, "learning_rate": 1.4635289974045485e-05, "loss": 1.4356, "step": 507 }, { "epoch": 0.3518005540166205, "grad_norm": 1.972861886024475, "learning_rate": 1.4615866676795334e-05, "loss": 1.4091, "step": 508 }, { "epoch": 0.3524930747922438, "grad_norm": 2.4713618755340576, "learning_rate": 1.4596421225424084e-05, "loss": 1.28, "step": 509 }, { "epoch": 0.35318559556786705, "grad_norm": 1.9506804943084717, "learning_rate": 1.4576953713261313e-05, "loss": 1.3192, "step": 510 }, { "epoch": 0.3538781163434903, "grad_norm": 2.0880720615386963, "learning_rate": 1.4557464233742478e-05, "loss": 1.4332, "step": 511 }, { "epoch": 0.3545706371191136, "grad_norm": 2.2580926418304443, "learning_rate": 1.4537952880408472e-05, "loss": 1.3437, "step": 512 }, { "epoch": 0.35526315789473684, "grad_norm": 2.160332441329956, "learning_rate": 1.4518419746905174e-05, "loss": 1.2442, "step": 513 }, { "epoch": 0.3559556786703601, "grad_norm": 2.0894155502319336, "learning_rate": 1.4498864926982996e-05, "loss": 1.4602, "step": 514 }, { "epoch": 0.35664819944598336, "grad_norm": 2.108365058898926, "learning_rate": 1.4479288514496434e-05, "loss": 1.4086, "step": 515 }, { "epoch": 0.3573407202216066, "grad_norm": 1.9162973165512085, "learning_rate": 1.4459690603403623e-05, "loss": 1.3188, "step": 516 }, { "epoch": 0.35803324099722994, "grad_norm": 1.92423677444458, "learning_rate": 1.4440071287765876e-05, "loss": 1.3285, "step": 517 }, { "epoch": 0.3587257617728532, "grad_norm": 2.039736270904541, "learning_rate": 1.4420430661747245e-05, "loss": 1.4598, "step": 518 }, { "epoch": 0.35941828254847646, "grad_norm": 2.099224090576172, "learning_rate": 1.4400768819614056e-05, "loss": 1.3224, "step": 519 }, { "epoch": 0.3601108033240997, "grad_norm": 2.0194785594940186, "learning_rate": 1.4381085855734468e-05, "loss": 1.308, "step": 520 }, { "epoch": 0.360803324099723, "grad_norm": 1.9957362413406372, "learning_rate": 1.4361381864578014e-05, "loss": 1.3694, "step": 521 }, { "epoch": 0.36149584487534625, "grad_norm": 2.1399877071380615, "learning_rate": 1.4341656940715147e-05, "loss": 1.4561, "step": 522 }, { "epoch": 0.3621883656509695, "grad_norm": 2.405832052230835, "learning_rate": 1.432191117881679e-05, "loss": 1.3531, "step": 523 }, { "epoch": 0.3628808864265928, "grad_norm": 2.3182177543640137, "learning_rate": 1.4302144673653875e-05, "loss": 1.1962, "step": 524 }, { "epoch": 0.3635734072022161, "grad_norm": 2.29636287689209, "learning_rate": 1.4282357520096896e-05, "loss": 1.2889, "step": 525 }, { "epoch": 0.36426592797783935, "grad_norm": 2.095970630645752, "learning_rate": 1.426254981311545e-05, "loss": 1.3959, "step": 526 }, { "epoch": 0.3649584487534626, "grad_norm": 2.20633864402771, "learning_rate": 1.424272164777778e-05, "loss": 1.3896, "step": 527 }, { "epoch": 0.3656509695290859, "grad_norm": 2.5923547744750977, "learning_rate": 1.4222873119250325e-05, "loss": 1.2329, "step": 528 }, { "epoch": 0.36634349030470914, "grad_norm": 2.6066620349884033, "learning_rate": 1.4203004322797252e-05, "loss": 1.167, "step": 529 }, { "epoch": 0.3670360110803324, "grad_norm": 2.101332187652588, "learning_rate": 1.4183115353780001e-05, "loss": 1.3173, "step": 530 }, { "epoch": 0.36772853185595566, "grad_norm": 2.3599350452423096, "learning_rate": 1.4163206307656849e-05, "loss": 1.3849, "step": 531 }, { "epoch": 0.3684210526315789, "grad_norm": 2.4479289054870605, "learning_rate": 1.4143277279982415e-05, "loss": 1.3258, "step": 532 }, { "epoch": 0.3691135734072022, "grad_norm": 2.5400850772857666, "learning_rate": 1.4123328366407235e-05, "loss": 1.2734, "step": 533 }, { "epoch": 0.3698060941828255, "grad_norm": 2.1413142681121826, "learning_rate": 1.4103359662677276e-05, "loss": 1.3371, "step": 534 }, { "epoch": 0.37049861495844877, "grad_norm": 2.246459484100342, "learning_rate": 1.4083371264633496e-05, "loss": 1.3819, "step": 535 }, { "epoch": 0.37119113573407203, "grad_norm": 2.5155673027038574, "learning_rate": 1.406336326821138e-05, "loss": 1.3142, "step": 536 }, { "epoch": 0.3718836565096953, "grad_norm": 2.42018461227417, "learning_rate": 1.4043335769440471e-05, "loss": 1.5278, "step": 537 }, { "epoch": 0.37257617728531855, "grad_norm": 2.8900959491729736, "learning_rate": 1.4023288864443915e-05, "loss": 1.142, "step": 538 }, { "epoch": 0.3732686980609418, "grad_norm": 2.2948379516601562, "learning_rate": 1.4003222649438004e-05, "loss": 1.296, "step": 539 }, { "epoch": 0.3739612188365651, "grad_norm": 2.6425232887268066, "learning_rate": 1.3983137220731702e-05, "loss": 1.3842, "step": 540 }, { "epoch": 0.37465373961218834, "grad_norm": 2.036116123199463, "learning_rate": 1.3963032674726197e-05, "loss": 1.3431, "step": 541 }, { "epoch": 0.37534626038781166, "grad_norm": 2.152360200881958, "learning_rate": 1.3942909107914431e-05, "loss": 1.3037, "step": 542 }, { "epoch": 0.3760387811634349, "grad_norm": 2.285806894302368, "learning_rate": 1.392276661688063e-05, "loss": 1.2652, "step": 543 }, { "epoch": 0.3767313019390582, "grad_norm": 2.133744239807129, "learning_rate": 1.390260529829986e-05, "loss": 1.2983, "step": 544 }, { "epoch": 0.37742382271468145, "grad_norm": 2.4105823040008545, "learning_rate": 1.388242524893754e-05, "loss": 1.3611, "step": 545 }, { "epoch": 0.3781163434903047, "grad_norm": 2.275736093521118, "learning_rate": 1.3862226565648996e-05, "loss": 1.2985, "step": 546 }, { "epoch": 0.37880886426592797, "grad_norm": 2.5612637996673584, "learning_rate": 1.3842009345378977e-05, "loss": 1.3993, "step": 547 }, { "epoch": 0.37950138504155123, "grad_norm": 2.3111212253570557, "learning_rate": 1.3821773685161224e-05, "loss": 1.2687, "step": 548 }, { "epoch": 0.3801939058171745, "grad_norm": 2.2761833667755127, "learning_rate": 1.3801519682117957e-05, "loss": 1.3877, "step": 549 }, { "epoch": 0.3808864265927978, "grad_norm": 2.110255002975464, "learning_rate": 1.3781247433459447e-05, "loss": 1.2858, "step": 550 }, { "epoch": 0.3815789473684211, "grad_norm": 2.6079838275909424, "learning_rate": 1.3760957036483532e-05, "loss": 1.3634, "step": 551 }, { "epoch": 0.38227146814404434, "grad_norm": 2.3239288330078125, "learning_rate": 1.3740648588575156e-05, "loss": 1.234, "step": 552 }, { "epoch": 0.3829639889196676, "grad_norm": 2.26419997215271, "learning_rate": 1.3720322187205897e-05, "loss": 1.1981, "step": 553 }, { "epoch": 0.38365650969529086, "grad_norm": 2.4017229080200195, "learning_rate": 1.3699977929933503e-05, "loss": 1.3152, "step": 554 }, { "epoch": 0.3843490304709141, "grad_norm": 2.3292219638824463, "learning_rate": 1.3679615914401423e-05, "loss": 1.4073, "step": 555 }, { "epoch": 0.3850415512465374, "grad_norm": 2.4745140075683594, "learning_rate": 1.3659236238338339e-05, "loss": 1.1486, "step": 556 }, { "epoch": 0.38573407202216065, "grad_norm": 2.755042314529419, "learning_rate": 1.3638838999557687e-05, "loss": 1.3127, "step": 557 }, { "epoch": 0.3864265927977839, "grad_norm": 2.610299587249756, "learning_rate": 1.361842429595721e-05, "loss": 1.3348, "step": 558 }, { "epoch": 0.3871191135734072, "grad_norm": 2.1434555053710938, "learning_rate": 1.3597992225518466e-05, "loss": 1.3572, "step": 559 }, { "epoch": 0.3878116343490305, "grad_norm": 2.3972840309143066, "learning_rate": 1.3577542886306367e-05, "loss": 1.3792, "step": 560 }, { "epoch": 0.38850415512465375, "grad_norm": 2.316579818725586, "learning_rate": 1.355707637646871e-05, "loss": 1.4216, "step": 561 }, { "epoch": 0.389196675900277, "grad_norm": 2.352252721786499, "learning_rate": 1.3536592794235696e-05, "loss": 1.3232, "step": 562 }, { "epoch": 0.3898891966759003, "grad_norm": 2.1979241371154785, "learning_rate": 1.3516092237919479e-05, "loss": 1.2895, "step": 563 }, { "epoch": 0.39058171745152354, "grad_norm": 2.2471816539764404, "learning_rate": 1.3495574805913669e-05, "loss": 1.3374, "step": 564 }, { "epoch": 0.3912742382271468, "grad_norm": 2.787330150604248, "learning_rate": 1.3475040596692876e-05, "loss": 1.1684, "step": 565 }, { "epoch": 0.39196675900277006, "grad_norm": 2.628101348876953, "learning_rate": 1.3454489708812237e-05, "loss": 1.3537, "step": 566 }, { "epoch": 0.3926592797783934, "grad_norm": 2.726956605911255, "learning_rate": 1.3433922240906936e-05, "loss": 1.3914, "step": 567 }, { "epoch": 0.39335180055401664, "grad_norm": 2.6245710849761963, "learning_rate": 1.3413338291691726e-05, "loss": 1.3597, "step": 568 }, { "epoch": 0.3940443213296399, "grad_norm": 2.3888232707977295, "learning_rate": 1.3392737959960482e-05, "loss": 1.3567, "step": 569 }, { "epoch": 0.39473684210526316, "grad_norm": 2.635704755783081, "learning_rate": 1.3372121344585694e-05, "loss": 1.2514, "step": 570 }, { "epoch": 0.3954293628808864, "grad_norm": 2.6729440689086914, "learning_rate": 1.3351488544518003e-05, "loss": 1.1759, "step": 571 }, { "epoch": 0.3961218836565097, "grad_norm": 2.542128801345825, "learning_rate": 1.3330839658785739e-05, "loss": 1.27, "step": 572 }, { "epoch": 0.39681440443213295, "grad_norm": 2.387706756591797, "learning_rate": 1.3310174786494432e-05, "loss": 1.2951, "step": 573 }, { "epoch": 0.3975069252077562, "grad_norm": 2.5825939178466797, "learning_rate": 1.3289494026826337e-05, "loss": 1.3551, "step": 574 }, { "epoch": 0.3981994459833795, "grad_norm": 2.307957649230957, "learning_rate": 1.3268797479039977e-05, "loss": 1.3592, "step": 575 }, { "epoch": 0.3988919667590028, "grad_norm": 2.071687936782837, "learning_rate": 1.3248085242469629e-05, "loss": 1.28, "step": 576 }, { "epoch": 0.39958448753462605, "grad_norm": 2.5913336277008057, "learning_rate": 1.3227357416524878e-05, "loss": 1.1898, "step": 577 }, { "epoch": 0.4002770083102493, "grad_norm": 2.27807354927063, "learning_rate": 1.3206614100690139e-05, "loss": 1.3734, "step": 578 }, { "epoch": 0.4009695290858726, "grad_norm": 2.303732395172119, "learning_rate": 1.318585539452416e-05, "loss": 1.3414, "step": 579 }, { "epoch": 0.40166204986149584, "grad_norm": 2.587827682495117, "learning_rate": 1.3165081397659563e-05, "loss": 1.4122, "step": 580 }, { "epoch": 0.4023545706371191, "grad_norm": 2.292515516281128, "learning_rate": 1.3144292209802358e-05, "loss": 1.3282, "step": 581 }, { "epoch": 0.40304709141274236, "grad_norm": 2.2421929836273193, "learning_rate": 1.3123487930731464e-05, "loss": 1.4146, "step": 582 }, { "epoch": 0.4037396121883656, "grad_norm": 2.186304807662964, "learning_rate": 1.3102668660298229e-05, "loss": 1.3864, "step": 583 }, { "epoch": 0.40443213296398894, "grad_norm": 2.5028858184814453, "learning_rate": 1.3081834498425952e-05, "loss": 1.1858, "step": 584 }, { "epoch": 0.4051246537396122, "grad_norm": 2.309316635131836, "learning_rate": 1.3060985545109408e-05, "loss": 1.359, "step": 585 }, { "epoch": 0.40581717451523547, "grad_norm": 2.236069917678833, "learning_rate": 1.3040121900414371e-05, "loss": 1.3318, "step": 586 }, { "epoch": 0.40650969529085873, "grad_norm": 2.1627039909362793, "learning_rate": 1.3019243664477108e-05, "loss": 1.3227, "step": 587 }, { "epoch": 0.407202216066482, "grad_norm": 2.374979257583618, "learning_rate": 1.2998350937503939e-05, "loss": 1.2892, "step": 588 }, { "epoch": 0.40789473684210525, "grad_norm": 2.3773934841156006, "learning_rate": 1.2977443819770717e-05, "loss": 1.3139, "step": 589 }, { "epoch": 0.4085872576177285, "grad_norm": 2.3296051025390625, "learning_rate": 1.2956522411622377e-05, "loss": 1.267, "step": 590 }, { "epoch": 0.4092797783933518, "grad_norm": 2.764315605163574, "learning_rate": 1.2935586813472433e-05, "loss": 1.3891, "step": 591 }, { "epoch": 0.4099722991689751, "grad_norm": 2.6064682006835938, "learning_rate": 1.2914637125802514e-05, "loss": 1.2025, "step": 592 }, { "epoch": 0.41066481994459836, "grad_norm": 2.60998272895813, "learning_rate": 1.2893673449161859e-05, "loss": 1.3943, "step": 593 }, { "epoch": 0.4113573407202216, "grad_norm": 2.4021079540252686, "learning_rate": 1.287269588416686e-05, "loss": 1.2099, "step": 594 }, { "epoch": 0.4120498614958449, "grad_norm": 2.560509443283081, "learning_rate": 1.2851704531500564e-05, "loss": 1.3387, "step": 595 }, { "epoch": 0.41274238227146814, "grad_norm": 2.1831934452056885, "learning_rate": 1.2830699491912186e-05, "loss": 1.4038, "step": 596 }, { "epoch": 0.4134349030470914, "grad_norm": 2.457613945007324, "learning_rate": 1.2809680866216647e-05, "loss": 1.4022, "step": 597 }, { "epoch": 0.41412742382271467, "grad_norm": 2.413032054901123, "learning_rate": 1.2788648755294056e-05, "loss": 1.3812, "step": 598 }, { "epoch": 0.41481994459833793, "grad_norm": 3.021824359893799, "learning_rate": 1.2767603260089257e-05, "loss": 1.2631, "step": 599 }, { "epoch": 0.4155124653739612, "grad_norm": 2.2408480644226074, "learning_rate": 1.2746544481611336e-05, "loss": 1.3919, "step": 600 }, { "epoch": 0.4162049861495845, "grad_norm": 2.4218242168426514, "learning_rate": 1.2725472520933122e-05, "loss": 1.3804, "step": 601 }, { "epoch": 0.4168975069252078, "grad_norm": 2.305410146713257, "learning_rate": 1.270438747919072e-05, "loss": 1.3913, "step": 602 }, { "epoch": 0.41759002770083103, "grad_norm": 2.362793207168579, "learning_rate": 1.2683289457583018e-05, "loss": 1.3815, "step": 603 }, { "epoch": 0.4182825484764543, "grad_norm": 2.615732431411743, "learning_rate": 1.2662178557371198e-05, "loss": 1.449, "step": 604 }, { "epoch": 0.41897506925207756, "grad_norm": 2.481135606765747, "learning_rate": 1.264105487987826e-05, "loss": 1.2778, "step": 605 }, { "epoch": 0.4196675900277008, "grad_norm": 2.73952054977417, "learning_rate": 1.261991852648852e-05, "loss": 1.334, "step": 606 }, { "epoch": 0.4203601108033241, "grad_norm": 2.4660911560058594, "learning_rate": 1.2598769598647136e-05, "loss": 1.2903, "step": 607 }, { "epoch": 0.42105263157894735, "grad_norm": 2.5404601097106934, "learning_rate": 1.2577608197859627e-05, "loss": 1.274, "step": 608 }, { "epoch": 0.42174515235457066, "grad_norm": 2.2028279304504395, "learning_rate": 1.2556434425691363e-05, "loss": 1.3181, "step": 609 }, { "epoch": 0.4224376731301939, "grad_norm": 2.5085532665252686, "learning_rate": 1.2535248383767102e-05, "loss": 1.3525, "step": 610 }, { "epoch": 0.4231301939058172, "grad_norm": 2.7390897274017334, "learning_rate": 1.2514050173770484e-05, "loss": 1.394, "step": 611 }, { "epoch": 0.42382271468144045, "grad_norm": 2.349844455718994, "learning_rate": 1.2492839897443554e-05, "loss": 1.3215, "step": 612 }, { "epoch": 0.4245152354570637, "grad_norm": 2.703737735748291, "learning_rate": 1.247161765658627e-05, "loss": 1.4081, "step": 613 }, { "epoch": 0.425207756232687, "grad_norm": 2.4210963249206543, "learning_rate": 1.2450383553056011e-05, "loss": 1.182, "step": 614 }, { "epoch": 0.42590027700831024, "grad_norm": 2.9356367588043213, "learning_rate": 1.2429137688767095e-05, "loss": 1.2014, "step": 615 }, { "epoch": 0.4265927977839335, "grad_norm": 1.582685947418213, "learning_rate": 1.2407880165690289e-05, "loss": 0.7954, "step": 616 }, { "epoch": 0.42728531855955676, "grad_norm": 2.6504549980163574, "learning_rate": 1.238661108585231e-05, "loss": 1.385, "step": 617 }, { "epoch": 0.4279778393351801, "grad_norm": 2.2224066257476807, "learning_rate": 1.2365330551335348e-05, "loss": 1.3786, "step": 618 }, { "epoch": 0.42867036011080334, "grad_norm": 2.686243772506714, "learning_rate": 1.234403866427657e-05, "loss": 1.2678, "step": 619 }, { "epoch": 0.4293628808864266, "grad_norm": 2.7656173706054688, "learning_rate": 1.2322735526867624e-05, "loss": 1.2544, "step": 620 }, { "epoch": 0.43005540166204986, "grad_norm": 2.2233879566192627, "learning_rate": 1.2301421241354166e-05, "loss": 1.354, "step": 621 }, { "epoch": 0.4307479224376731, "grad_norm": 2.6432528495788574, "learning_rate": 1.2280095910035343e-05, "loss": 1.3564, "step": 622 }, { "epoch": 0.4314404432132964, "grad_norm": 2.741586685180664, "learning_rate": 1.2258759635263333e-05, "loss": 1.2194, "step": 623 }, { "epoch": 0.43213296398891965, "grad_norm": 2.5118141174316406, "learning_rate": 1.2237412519442828e-05, "loss": 1.2546, "step": 624 }, { "epoch": 0.4328254847645429, "grad_norm": 2.843817710876465, "learning_rate": 1.2216054665030552e-05, "loss": 1.2101, "step": 625 }, { "epoch": 0.43351800554016623, "grad_norm": 2.5602662563323975, "learning_rate": 1.2194686174534771e-05, "loss": 1.3588, "step": 626 }, { "epoch": 0.4342105263157895, "grad_norm": 2.5971269607543945, "learning_rate": 1.2173307150514809e-05, "loss": 1.3227, "step": 627 }, { "epoch": 0.43490304709141275, "grad_norm": 2.9508907794952393, "learning_rate": 1.2151917695580523e-05, "loss": 1.2234, "step": 628 }, { "epoch": 0.435595567867036, "grad_norm": 2.6319260597229004, "learning_rate": 1.2130517912391861e-05, "loss": 1.36, "step": 629 }, { "epoch": 0.4362880886426593, "grad_norm": 2.494126081466675, "learning_rate": 1.2109107903658326e-05, "loss": 1.3959, "step": 630 }, { "epoch": 0.43698060941828254, "grad_norm": 3.109811782836914, "learning_rate": 1.2087687772138501e-05, "loss": 1.0508, "step": 631 }, { "epoch": 0.4376731301939058, "grad_norm": 2.5065367221832275, "learning_rate": 1.2066257620639557e-05, "loss": 1.3455, "step": 632 }, { "epoch": 0.43836565096952906, "grad_norm": 2.493720769882202, "learning_rate": 1.2044817552016752e-05, "loss": 1.3614, "step": 633 }, { "epoch": 0.4390581717451524, "grad_norm": 2.4560728073120117, "learning_rate": 1.2023367669172947e-05, "loss": 1.4117, "step": 634 }, { "epoch": 0.43975069252077564, "grad_norm": 2.2620697021484375, "learning_rate": 1.2001908075058108e-05, "loss": 1.3028, "step": 635 }, { "epoch": 0.4404432132963989, "grad_norm": 2.5376715660095215, "learning_rate": 1.1980438872668803e-05, "loss": 1.1942, "step": 636 }, { "epoch": 0.44113573407202217, "grad_norm": 1.7369788885116577, "learning_rate": 1.1958960165047716e-05, "loss": 0.7623, "step": 637 }, { "epoch": 0.44182825484764543, "grad_norm": 2.8980188369750977, "learning_rate": 1.1937472055283168e-05, "loss": 1.3815, "step": 638 }, { "epoch": 0.4425207756232687, "grad_norm": 2.5378687381744385, "learning_rate": 1.1915974646508581e-05, "loss": 1.457, "step": 639 }, { "epoch": 0.44321329639889195, "grad_norm": 2.880448818206787, "learning_rate": 1.189446804190203e-05, "loss": 1.0968, "step": 640 }, { "epoch": 0.4439058171745152, "grad_norm": 2.571089506149292, "learning_rate": 1.1872952344685713e-05, "loss": 1.2438, "step": 641 }, { "epoch": 0.4445983379501385, "grad_norm": 2.754330635070801, "learning_rate": 1.1851427658125474e-05, "loss": 1.2729, "step": 642 }, { "epoch": 0.4452908587257618, "grad_norm": 2.6155052185058594, "learning_rate": 1.1829894085530297e-05, "loss": 1.2282, "step": 643 }, { "epoch": 0.44598337950138506, "grad_norm": 2.518204689025879, "learning_rate": 1.180835173025183e-05, "loss": 1.3603, "step": 644 }, { "epoch": 0.4466759002770083, "grad_norm": 2.3861498832702637, "learning_rate": 1.1786800695683847e-05, "loss": 1.3179, "step": 645 }, { "epoch": 0.4473684210526316, "grad_norm": 2.5852103233337402, "learning_rate": 1.1765241085261802e-05, "loss": 1.3163, "step": 646 }, { "epoch": 0.44806094182825484, "grad_norm": 2.024843215942383, "learning_rate": 1.1743673002462303e-05, "loss": 0.7131, "step": 647 }, { "epoch": 0.4487534626038781, "grad_norm": 2.430532932281494, "learning_rate": 1.172209655080262e-05, "loss": 1.2961, "step": 648 }, { "epoch": 0.44944598337950137, "grad_norm": 2.568700075149536, "learning_rate": 1.1700511833840186e-05, "loss": 1.3669, "step": 649 }, { "epoch": 0.45013850415512463, "grad_norm": 2.587846517562866, "learning_rate": 1.1678918955172112e-05, "loss": 1.2835, "step": 650 }, { "epoch": 0.45083102493074795, "grad_norm": 3.3329403400421143, "learning_rate": 1.1657318018434673e-05, "loss": 1.1834, "step": 651 }, { "epoch": 0.4515235457063712, "grad_norm": 3.202256917953491, "learning_rate": 1.163570912730283e-05, "loss": 1.3079, "step": 652 }, { "epoch": 0.45221606648199447, "grad_norm": 2.8685662746429443, "learning_rate": 1.1614092385489708e-05, "loss": 1.4066, "step": 653 }, { "epoch": 0.45290858725761773, "grad_norm": 2.6699678897857666, "learning_rate": 1.1592467896746122e-05, "loss": 1.302, "step": 654 }, { "epoch": 0.453601108033241, "grad_norm": 2.71836519241333, "learning_rate": 1.157083576486007e-05, "loss": 1.3545, "step": 655 }, { "epoch": 0.45429362880886426, "grad_norm": 2.4176559448242188, "learning_rate": 1.1549196093656223e-05, "loss": 1.2774, "step": 656 }, { "epoch": 0.4549861495844875, "grad_norm": 2.4645159244537354, "learning_rate": 1.1527548986995449e-05, "loss": 1.3403, "step": 657 }, { "epoch": 0.4556786703601108, "grad_norm": 2.757352352142334, "learning_rate": 1.1505894548774294e-05, "loss": 1.3216, "step": 658 }, { "epoch": 0.45637119113573404, "grad_norm": 2.649862051010132, "learning_rate": 1.1484232882924501e-05, "loss": 1.2886, "step": 659 }, { "epoch": 0.45706371191135736, "grad_norm": 2.594386100769043, "learning_rate": 1.1462564093412493e-05, "loss": 1.2525, "step": 660 }, { "epoch": 0.4577562326869806, "grad_norm": 2.5127627849578857, "learning_rate": 1.144088828423889e-05, "loss": 1.25, "step": 661 }, { "epoch": 0.4584487534626039, "grad_norm": 2.3333370685577393, "learning_rate": 1.1419205559437998e-05, "loss": 1.3152, "step": 662 }, { "epoch": 0.45914127423822715, "grad_norm": 2.4757964611053467, "learning_rate": 1.1397516023077326e-05, "loss": 1.3548, "step": 663 }, { "epoch": 0.4598337950138504, "grad_norm": 2.6083545684814453, "learning_rate": 1.1375819779257058e-05, "loss": 1.5082, "step": 664 }, { "epoch": 0.4605263157894737, "grad_norm": 2.9225447177886963, "learning_rate": 1.1354116932109586e-05, "loss": 1.3173, "step": 665 }, { "epoch": 0.46121883656509693, "grad_norm": 2.9521665573120117, "learning_rate": 1.1332407585798992e-05, "loss": 1.3909, "step": 666 }, { "epoch": 0.4619113573407202, "grad_norm": 2.658435583114624, "learning_rate": 1.1310691844520543e-05, "loss": 1.3327, "step": 667 }, { "epoch": 0.4626038781163435, "grad_norm": 2.4518954753875732, "learning_rate": 1.1288969812500209e-05, "loss": 1.3068, "step": 668 }, { "epoch": 0.4632963988919668, "grad_norm": 3.0496392250061035, "learning_rate": 1.1267241593994145e-05, "loss": 1.2705, "step": 669 }, { "epoch": 0.46398891966759004, "grad_norm": 2.8972740173339844, "learning_rate": 1.1245507293288204e-05, "loss": 1.2617, "step": 670 }, { "epoch": 0.4646814404432133, "grad_norm": 2.567683219909668, "learning_rate": 1.1223767014697428e-05, "loss": 1.2594, "step": 671 }, { "epoch": 0.46537396121883656, "grad_norm": 2.789400577545166, "learning_rate": 1.1202020862565555e-05, "loss": 1.4519, "step": 672 }, { "epoch": 0.4660664819944598, "grad_norm": 2.6831300258636475, "learning_rate": 1.11802689412645e-05, "loss": 1.2585, "step": 673 }, { "epoch": 0.4667590027700831, "grad_norm": 2.578671455383301, "learning_rate": 1.1158511355193888e-05, "loss": 1.2332, "step": 674 }, { "epoch": 0.46745152354570635, "grad_norm": 2.688199996948242, "learning_rate": 1.1136748208780512e-05, "loss": 1.318, "step": 675 }, { "epoch": 0.46814404432132967, "grad_norm": 2.3270740509033203, "learning_rate": 1.1114979606477867e-05, "loss": 1.3231, "step": 676 }, { "epoch": 0.46883656509695293, "grad_norm": 2.6363329887390137, "learning_rate": 1.1093205652765627e-05, "loss": 1.2892, "step": 677 }, { "epoch": 0.4695290858725762, "grad_norm": 3.152540445327759, "learning_rate": 1.1071426452149152e-05, "loss": 1.2353, "step": 678 }, { "epoch": 0.47022160664819945, "grad_norm": 2.9541869163513184, "learning_rate": 1.1049642109158982e-05, "loss": 1.4063, "step": 679 }, { "epoch": 0.4709141274238227, "grad_norm": 2.647003412246704, "learning_rate": 1.1027852728350343e-05, "loss": 1.2904, "step": 680 }, { "epoch": 0.471606648199446, "grad_norm": 2.9237818717956543, "learning_rate": 1.1006058414302637e-05, "loss": 1.159, "step": 681 }, { "epoch": 0.47229916897506924, "grad_norm": 3.276366710662842, "learning_rate": 1.0984259271618947e-05, "loss": 1.2979, "step": 682 }, { "epoch": 0.4729916897506925, "grad_norm": 3.14654541015625, "learning_rate": 1.0962455404925524e-05, "loss": 1.2838, "step": 683 }, { "epoch": 0.47368421052631576, "grad_norm": 3.0861613750457764, "learning_rate": 1.09406469188713e-05, "loss": 1.2665, "step": 684 }, { "epoch": 0.4743767313019391, "grad_norm": 2.8374383449554443, "learning_rate": 1.0918833918127377e-05, "loss": 1.3168, "step": 685 }, { "epoch": 0.47506925207756234, "grad_norm": 3.03313946723938, "learning_rate": 1.0897016507386513e-05, "loss": 1.3287, "step": 686 }, { "epoch": 0.4757617728531856, "grad_norm": 2.811178207397461, "learning_rate": 1.0875194791362656e-05, "loss": 1.1427, "step": 687 }, { "epoch": 0.47645429362880887, "grad_norm": 2.7737855911254883, "learning_rate": 1.0853368874790392e-05, "loss": 1.3189, "step": 688 }, { "epoch": 0.47714681440443213, "grad_norm": 2.610959053039551, "learning_rate": 1.0831538862424488e-05, "loss": 1.3865, "step": 689 }, { "epoch": 0.4778393351800554, "grad_norm": 1.9542778730392456, "learning_rate": 1.0809704859039357e-05, "loss": 0.8104, "step": 690 }, { "epoch": 0.47853185595567865, "grad_norm": 2.8905603885650635, "learning_rate": 1.0787866969428569e-05, "loss": 1.3594, "step": 691 }, { "epoch": 0.4792243767313019, "grad_norm": 3.1295323371887207, "learning_rate": 1.0766025298404346e-05, "loss": 1.3643, "step": 692 }, { "epoch": 0.47991689750692523, "grad_norm": 2.8047592639923096, "learning_rate": 1.0744179950797069e-05, "loss": 1.2713, "step": 693 }, { "epoch": 0.4806094182825485, "grad_norm": 2.5008580684661865, "learning_rate": 1.0722331031454749e-05, "loss": 1.3686, "step": 694 }, { "epoch": 0.48130193905817176, "grad_norm": 3.0125064849853516, "learning_rate": 1.0700478645242552e-05, "loss": 1.2412, "step": 695 }, { "epoch": 0.481994459833795, "grad_norm": 2.661039352416992, "learning_rate": 1.0678622897042279e-05, "loss": 1.3159, "step": 696 }, { "epoch": 0.4826869806094183, "grad_norm": 2.9381444454193115, "learning_rate": 1.0656763891751865e-05, "loss": 1.341, "step": 697 }, { "epoch": 0.48337950138504154, "grad_norm": 2.9836440086364746, "learning_rate": 1.063490173428488e-05, "loss": 1.3917, "step": 698 }, { "epoch": 0.4840720221606648, "grad_norm": 3.4271857738494873, "learning_rate": 1.0613036529570022e-05, "loss": 1.1885, "step": 699 }, { "epoch": 0.48476454293628807, "grad_norm": 2.6977245807647705, "learning_rate": 1.0591168382550617e-05, "loss": 1.453, "step": 700 }, { "epoch": 0.48545706371191133, "grad_norm": 3.1043028831481934, "learning_rate": 1.0569297398184111e-05, "loss": 1.2817, "step": 701 }, { "epoch": 0.48614958448753465, "grad_norm": 2.9746053218841553, "learning_rate": 1.0547423681441567e-05, "loss": 1.2756, "step": 702 }, { "epoch": 0.4868421052631579, "grad_norm": 2.829667091369629, "learning_rate": 1.052554733730716e-05, "loss": 1.3638, "step": 703 }, { "epoch": 0.48753462603878117, "grad_norm": 3.197615385055542, "learning_rate": 1.050366847077769e-05, "loss": 1.2731, "step": 704 }, { "epoch": 0.48822714681440443, "grad_norm": 2.7651829719543457, "learning_rate": 1.0481787186862036e-05, "loss": 1.1986, "step": 705 }, { "epoch": 0.4889196675900277, "grad_norm": 2.8822498321533203, "learning_rate": 1.0459903590580706e-05, "loss": 1.1886, "step": 706 }, { "epoch": 0.48961218836565096, "grad_norm": 2.9656972885131836, "learning_rate": 1.0438017786965295e-05, "loss": 1.2598, "step": 707 }, { "epoch": 0.4903047091412742, "grad_norm": 3.422245979309082, "learning_rate": 1.0416129881057987e-05, "loss": 0.9257, "step": 708 }, { "epoch": 0.4909972299168975, "grad_norm": 1.814936637878418, "learning_rate": 1.0394239977911069e-05, "loss": 0.733, "step": 709 }, { "epoch": 0.4916897506925208, "grad_norm": 3.0122458934783936, "learning_rate": 1.03723481825864e-05, "loss": 1.2547, "step": 710 }, { "epoch": 0.49238227146814406, "grad_norm": 3.0159964561462402, "learning_rate": 1.0350454600154932e-05, "loss": 1.2414, "step": 711 }, { "epoch": 0.4930747922437673, "grad_norm": 2.8420827388763428, "learning_rate": 1.0328559335696188e-05, "loss": 1.1437, "step": 712 }, { "epoch": 0.4937673130193906, "grad_norm": 3.323718547821045, "learning_rate": 1.0306662494297766e-05, "loss": 1.1508, "step": 713 }, { "epoch": 0.49445983379501385, "grad_norm": 2.852436065673828, "learning_rate": 1.028476418105483e-05, "loss": 1.2692, "step": 714 }, { "epoch": 0.4951523545706371, "grad_norm": 3.037775993347168, "learning_rate": 1.0262864501069619e-05, "loss": 1.2536, "step": 715 }, { "epoch": 0.49584487534626037, "grad_norm": 3.1405160427093506, "learning_rate": 1.0240963559450909e-05, "loss": 1.2877, "step": 716 }, { "epoch": 0.49653739612188363, "grad_norm": 2.7466113567352295, "learning_rate": 1.0219061461313557e-05, "loss": 1.2839, "step": 717 }, { "epoch": 0.49722991689750695, "grad_norm": 2.971862554550171, "learning_rate": 1.0197158311777957e-05, "loss": 1.4034, "step": 718 }, { "epoch": 0.4979224376731302, "grad_norm": 2.713968515396118, "learning_rate": 1.017525421596955e-05, "loss": 1.3094, "step": 719 }, { "epoch": 0.4986149584487535, "grad_norm": 2.7105119228363037, "learning_rate": 1.015334927901832e-05, "loss": 1.2357, "step": 720 }, { "epoch": 0.49930747922437674, "grad_norm": 2.978726387023926, "learning_rate": 1.0131443606058291e-05, "loss": 1.3151, "step": 721 }, { "epoch": 0.5, "grad_norm": 2.9404547214508057, "learning_rate": 1.0109537302227012e-05, "loss": 1.1534, "step": 722 }, { "epoch": 0.5, "eval_loss": 1.0960729122161865, "eval_runtime": 338.4976, "eval_samples_per_second": 4.183, "eval_steps_per_second": 0.523, "step": 722 }, { "epoch": 0.5006925207756233, "grad_norm": 3.121488571166992, "learning_rate": 1.0087630472665071e-05, "loss": 1.3357, "step": 723 }, { "epoch": 0.5013850415512465, "grad_norm": 2.7625746726989746, "learning_rate": 1.0065723222515566e-05, "loss": 1.3157, "step": 724 }, { "epoch": 0.5020775623268698, "grad_norm": 2.6933093070983887, "learning_rate": 1.0043815656923627e-05, "loss": 1.2687, "step": 725 }, { "epoch": 0.502770083102493, "grad_norm": 3.369460344314575, "learning_rate": 1.0021907881035891e-05, "loss": 1.1404, "step": 726 }, { "epoch": 0.5034626038781164, "grad_norm": 3.114061117172241, "learning_rate": 1e-05, "loss": 1.3558, "step": 727 }, { "epoch": 0.5041551246537396, "grad_norm": 3.136636972427368, "learning_rate": 9.97809211896411e-06, "loss": 1.2918, "step": 728 }, { "epoch": 0.5048476454293629, "grad_norm": 2.9409475326538086, "learning_rate": 9.956184343076378e-06, "loss": 1.2745, "step": 729 }, { "epoch": 0.5055401662049861, "grad_norm": 3.3385579586029053, "learning_rate": 9.934276777484436e-06, "loss": 1.3101, "step": 730 }, { "epoch": 0.5062326869806094, "grad_norm": 3.108218193054199, "learning_rate": 9.91236952733493e-06, "loss": 1.3295, "step": 731 }, { "epoch": 0.5069252077562327, "grad_norm": 2.7342264652252197, "learning_rate": 9.89046269777299e-06, "loss": 1.4148, "step": 732 }, { "epoch": 0.5076177285318559, "grad_norm": 2.875964879989624, "learning_rate": 9.868556393941714e-06, "loss": 1.1803, "step": 733 }, { "epoch": 0.5083102493074793, "grad_norm": 3.1713013648986816, "learning_rate": 9.846650720981682e-06, "loss": 1.368, "step": 734 }, { "epoch": 0.5090027700831025, "grad_norm": 3.592292070388794, "learning_rate": 9.824745784030451e-06, "loss": 1.1784, "step": 735 }, { "epoch": 0.5096952908587258, "grad_norm": 2.91928768157959, "learning_rate": 9.802841688222043e-06, "loss": 1.3635, "step": 736 }, { "epoch": 0.510387811634349, "grad_norm": 3.0569591522216797, "learning_rate": 9.780938538686444e-06, "loss": 1.185, "step": 737 }, { "epoch": 0.5110803324099723, "grad_norm": 3.4262917041778564, "learning_rate": 9.759036440549093e-06, "loss": 1.2004, "step": 738 }, { "epoch": 0.5117728531855956, "grad_norm": 2.7363193035125732, "learning_rate": 9.737135498930385e-06, "loss": 1.1571, "step": 739 }, { "epoch": 0.5124653739612188, "grad_norm": 3.3447537422180176, "learning_rate": 9.715235818945171e-06, "loss": 1.309, "step": 740 }, { "epoch": 0.5131578947368421, "grad_norm": 1.9384816884994507, "learning_rate": 9.693337505702238e-06, "loss": 0.8006, "step": 741 }, { "epoch": 0.5138504155124654, "grad_norm": 3.3189609050750732, "learning_rate": 9.671440664303813e-06, "loss": 1.1473, "step": 742 }, { "epoch": 0.5145429362880887, "grad_norm": 3.228372097015381, "learning_rate": 9.64954539984507e-06, "loss": 1.2468, "step": 743 }, { "epoch": 0.5152354570637119, "grad_norm": 2.7112698554992676, "learning_rate": 9.627651817413605e-06, "loss": 1.394, "step": 744 }, { "epoch": 0.5159279778393352, "grad_norm": 3.023843288421631, "learning_rate": 9.605760022088935e-06, "loss": 1.3856, "step": 745 }, { "epoch": 0.5166204986149584, "grad_norm": 2.7612435817718506, "learning_rate": 9.583870118942014e-06, "loss": 1.2712, "step": 746 }, { "epoch": 0.5173130193905817, "grad_norm": 2.827341318130493, "learning_rate": 9.561982213034707e-06, "loss": 1.2756, "step": 747 }, { "epoch": 0.518005540166205, "grad_norm": 2.7056517601013184, "learning_rate": 9.540096409419295e-06, "loss": 1.4065, "step": 748 }, { "epoch": 0.5186980609418282, "grad_norm": 2.6263115406036377, "learning_rate": 9.518212813137966e-06, "loss": 1.282, "step": 749 }, { "epoch": 0.5193905817174516, "grad_norm": 3.2539684772491455, "learning_rate": 9.496331529222313e-06, "loss": 1.4131, "step": 750 }, { "epoch": 0.5200831024930748, "grad_norm": 2.7921719551086426, "learning_rate": 9.47445266269284e-06, "loss": 1.25, "step": 751 }, { "epoch": 0.5207756232686981, "grad_norm": 2.722177267074585, "learning_rate": 9.452576318558437e-06, "loss": 1.2422, "step": 752 }, { "epoch": 0.5214681440443213, "grad_norm": 3.5722434520721436, "learning_rate": 9.430702601815892e-06, "loss": 1.356, "step": 753 }, { "epoch": 0.5221606648199446, "grad_norm": 2.7773613929748535, "learning_rate": 9.408831617449385e-06, "loss": 1.2914, "step": 754 }, { "epoch": 0.5228531855955678, "grad_norm": 4.336549758911133, "learning_rate": 9.38696347042998e-06, "loss": 1.1878, "step": 755 }, { "epoch": 0.5235457063711911, "grad_norm": 2.752716541290283, "learning_rate": 9.365098265715124e-06, "loss": 1.2605, "step": 756 }, { "epoch": 0.5242382271468145, "grad_norm": 2.604163885116577, "learning_rate": 9.34323610824814e-06, "loss": 1.3817, "step": 757 }, { "epoch": 0.5249307479224377, "grad_norm": 3.2916760444641113, "learning_rate": 9.321377102957723e-06, "loss": 1.1103, "step": 758 }, { "epoch": 0.525623268698061, "grad_norm": 3.1670544147491455, "learning_rate": 9.29952135475745e-06, "loss": 1.328, "step": 759 }, { "epoch": 0.5263157894736842, "grad_norm": 2.944361925125122, "learning_rate": 9.277668968545253e-06, "loss": 1.3703, "step": 760 }, { "epoch": 0.5270083102493075, "grad_norm": 2.8937935829162598, "learning_rate": 9.255820049202933e-06, "loss": 1.3303, "step": 761 }, { "epoch": 0.5277008310249307, "grad_norm": 3.042147636413574, "learning_rate": 9.233974701595654e-06, "loss": 1.2626, "step": 762 }, { "epoch": 0.528393351800554, "grad_norm": 3.470090627670288, "learning_rate": 9.212133030571438e-06, "loss": 1.4438, "step": 763 }, { "epoch": 0.5290858725761773, "grad_norm": 2.5336360931396484, "learning_rate": 9.190295140960649e-06, "loss": 1.2959, "step": 764 }, { "epoch": 0.5297783933518005, "grad_norm": 2.6885414123535156, "learning_rate": 9.168461137575515e-06, "loss": 1.2422, "step": 765 }, { "epoch": 0.5304709141274239, "grad_norm": 3.321873664855957, "learning_rate": 9.146631125209608e-06, "loss": 1.1752, "step": 766 }, { "epoch": 0.5311634349030471, "grad_norm": 2.9666600227355957, "learning_rate": 9.124805208637349e-06, "loss": 1.3353, "step": 767 }, { "epoch": 0.5318559556786704, "grad_norm": 2.8593125343322754, "learning_rate": 9.102983492613489e-06, "loss": 1.2074, "step": 768 }, { "epoch": 0.5325484764542936, "grad_norm": 2.73651385307312, "learning_rate": 9.081166081872627e-06, "loss": 1.2614, "step": 769 }, { "epoch": 0.5332409972299169, "grad_norm": 2.727332592010498, "learning_rate": 9.059353081128702e-06, "loss": 1.2188, "step": 770 }, { "epoch": 0.5339335180055401, "grad_norm": 2.6417131423950195, "learning_rate": 9.037544595074479e-06, "loss": 1.2823, "step": 771 }, { "epoch": 0.5346260387811634, "grad_norm": 2.800295352935791, "learning_rate": 9.015740728381055e-06, "loss": 1.2197, "step": 772 }, { "epoch": 0.5353185595567868, "grad_norm": 3.024468421936035, "learning_rate": 8.993941585697363e-06, "loss": 1.2803, "step": 773 }, { "epoch": 0.53601108033241, "grad_norm": 3.050037384033203, "learning_rate": 8.972147271649662e-06, "loss": 1.2804, "step": 774 }, { "epoch": 0.5367036011080333, "grad_norm": 3.3923215866088867, "learning_rate": 8.950357890841021e-06, "loss": 1.3596, "step": 775 }, { "epoch": 0.5373961218836565, "grad_norm": 2.8710789680480957, "learning_rate": 8.928573547850852e-06, "loss": 1.3116, "step": 776 }, { "epoch": 0.5380886426592798, "grad_norm": 2.874081611633301, "learning_rate": 8.906794347234373e-06, "loss": 1.3111, "step": 777 }, { "epoch": 0.538781163434903, "grad_norm": 2.6735353469848633, "learning_rate": 8.885020393522136e-06, "loss": 1.2969, "step": 778 }, { "epoch": 0.5394736842105263, "grad_norm": 3.593169689178467, "learning_rate": 8.863251791219491e-06, "loss": 1.2788, "step": 779 }, { "epoch": 0.5401662049861495, "grad_norm": 3.3737025260925293, "learning_rate": 8.841488644806115e-06, "loss": 1.2895, "step": 780 }, { "epoch": 0.5408587257617729, "grad_norm": 2.32466721534729, "learning_rate": 8.819731058735502e-06, "loss": 0.7021, "step": 781 }, { "epoch": 0.5415512465373962, "grad_norm": 3.052380323410034, "learning_rate": 8.797979137434452e-06, "loss": 1.1673, "step": 782 }, { "epoch": 0.5422437673130194, "grad_norm": 3.2256016731262207, "learning_rate": 8.776232985302573e-06, "loss": 1.282, "step": 783 }, { "epoch": 0.5429362880886427, "grad_norm": 3.534031629562378, "learning_rate": 8.754492706711798e-06, "loss": 1.2677, "step": 784 }, { "epoch": 0.5436288088642659, "grad_norm": 3.229583263397217, "learning_rate": 8.732758406005855e-06, "loss": 1.2163, "step": 785 }, { "epoch": 0.5443213296398892, "grad_norm": 2.711229085922241, "learning_rate": 8.711030187499795e-06, "loss": 1.2238, "step": 786 }, { "epoch": 0.5450138504155124, "grad_norm": 3.2641870975494385, "learning_rate": 8.68930815547946e-06, "loss": 1.2413, "step": 787 }, { "epoch": 0.5457063711911357, "grad_norm": 3.3665711879730225, "learning_rate": 8.66759241420101e-06, "loss": 1.3301, "step": 788 }, { "epoch": 0.546398891966759, "grad_norm": 3.5201163291931152, "learning_rate": 8.645883067890416e-06, "loss": 1.4187, "step": 789 }, { "epoch": 0.5470914127423823, "grad_norm": 3.2204978466033936, "learning_rate": 8.624180220742945e-06, "loss": 1.0933, "step": 790 }, { "epoch": 0.5477839335180056, "grad_norm": 3.3322551250457764, "learning_rate": 8.602483976922678e-06, "loss": 1.2983, "step": 791 }, { "epoch": 0.5484764542936288, "grad_norm": 2.787132740020752, "learning_rate": 8.580794440562003e-06, "loss": 1.3391, "step": 792 }, { "epoch": 0.5491689750692521, "grad_norm": 2.921457290649414, "learning_rate": 8.559111715761115e-06, "loss": 1.3271, "step": 793 }, { "epoch": 0.5498614958448753, "grad_norm": 2.8040547370910645, "learning_rate": 8.53743590658751e-06, "loss": 1.3346, "step": 794 }, { "epoch": 0.5505540166204986, "grad_norm": 3.099111557006836, "learning_rate": 8.515767117075502e-06, "loss": 1.3953, "step": 795 }, { "epoch": 0.5512465373961218, "grad_norm": 3.060025453567505, "learning_rate": 8.494105451225706e-06, "loss": 1.3549, "step": 796 }, { "epoch": 0.5519390581717452, "grad_norm": 2.8175106048583984, "learning_rate": 8.472451013004554e-06, "loss": 1.386, "step": 797 }, { "epoch": 0.5526315789473685, "grad_norm": 3.159795045852661, "learning_rate": 8.45080390634378e-06, "loss": 1.2336, "step": 798 }, { "epoch": 0.5533240997229917, "grad_norm": 3.1138765811920166, "learning_rate": 8.429164235139931e-06, "loss": 1.3276, "step": 799 }, { "epoch": 0.554016620498615, "grad_norm": 3.215301513671875, "learning_rate": 8.407532103253878e-06, "loss": 1.1742, "step": 800 }, { "epoch": 0.5547091412742382, "grad_norm": 3.050508737564087, "learning_rate": 8.385907614510297e-06, "loss": 1.33, "step": 801 }, { "epoch": 0.5554016620498615, "grad_norm": 3.1976349353790283, "learning_rate": 8.364290872697175e-06, "loss": 1.3126, "step": 802 }, { "epoch": 0.5560941828254847, "grad_norm": 2.6836695671081543, "learning_rate": 8.342681981565329e-06, "loss": 1.3309, "step": 803 }, { "epoch": 0.556786703601108, "grad_norm": 3.17729115486145, "learning_rate": 8.321081044827894e-06, "loss": 1.3075, "step": 804 }, { "epoch": 0.5574792243767313, "grad_norm": 3.135615587234497, "learning_rate": 8.299488166159817e-06, "loss": 1.2755, "step": 805 }, { "epoch": 0.5581717451523546, "grad_norm": 3.6281352043151855, "learning_rate": 8.277903449197383e-06, "loss": 1.1755, "step": 806 }, { "epoch": 0.5588642659279779, "grad_norm": 3.361931085586548, "learning_rate": 8.256326997537695e-06, "loss": 1.4074, "step": 807 }, { "epoch": 0.5595567867036011, "grad_norm": 2.785465717315674, "learning_rate": 8.2347589147382e-06, "loss": 1.2723, "step": 808 }, { "epoch": 0.5602493074792244, "grad_norm": 3.313589572906494, "learning_rate": 8.213199304316156e-06, "loss": 1.1785, "step": 809 }, { "epoch": 0.5609418282548476, "grad_norm": 3.2528610229492188, "learning_rate": 8.191648269748173e-06, "loss": 1.1545, "step": 810 }, { "epoch": 0.5616343490304709, "grad_norm": 3.762922763824463, "learning_rate": 8.170105914469701e-06, "loss": 1.3754, "step": 811 }, { "epoch": 0.5623268698060941, "grad_norm": 3.4045610427856445, "learning_rate": 8.14857234187453e-06, "loss": 1.173, "step": 812 }, { "epoch": 0.5630193905817175, "grad_norm": 3.2984979152679443, "learning_rate": 8.127047655314289e-06, "loss": 1.2817, "step": 813 }, { "epoch": 0.5637119113573407, "grad_norm": 3.1819562911987305, "learning_rate": 8.105531958097973e-06, "loss": 1.1583, "step": 814 }, { "epoch": 0.564404432132964, "grad_norm": 3.3363871574401855, "learning_rate": 8.08402535349142e-06, "loss": 1.2589, "step": 815 }, { "epoch": 0.5650969529085873, "grad_norm": 3.924525022506714, "learning_rate": 8.062527944716837e-06, "loss": 1.2983, "step": 816 }, { "epoch": 0.5657894736842105, "grad_norm": 3.191929340362549, "learning_rate": 8.041039834952287e-06, "loss": 1.1706, "step": 817 }, { "epoch": 0.5664819944598338, "grad_norm": 3.0657148361206055, "learning_rate": 8.019561127331202e-06, "loss": 1.325, "step": 818 }, { "epoch": 0.567174515235457, "grad_norm": 3.4930810928344727, "learning_rate": 7.998091924941897e-06, "loss": 1.3118, "step": 819 }, { "epoch": 0.5678670360110804, "grad_norm": 3.066667079925537, "learning_rate": 7.976632330827056e-06, "loss": 1.2784, "step": 820 }, { "epoch": 0.5685595567867036, "grad_norm": 3.7067830562591553, "learning_rate": 7.95518244798325e-06, "loss": 1.269, "step": 821 }, { "epoch": 0.5692520775623269, "grad_norm": 2.3034355640411377, "learning_rate": 7.933742379360446e-06, "loss": 0.7615, "step": 822 }, { "epoch": 0.5699445983379502, "grad_norm": 3.537672996520996, "learning_rate": 7.912312227861504e-06, "loss": 1.0391, "step": 823 }, { "epoch": 0.5706371191135734, "grad_norm": 3.186122179031372, "learning_rate": 7.890892096341677e-06, "loss": 1.2197, "step": 824 }, { "epoch": 0.5713296398891967, "grad_norm": 3.3523175716400146, "learning_rate": 7.86948208760814e-06, "loss": 1.3319, "step": 825 }, { "epoch": 0.5720221606648199, "grad_norm": 3.0430562496185303, "learning_rate": 7.848082304419478e-06, "loss": 1.2969, "step": 826 }, { "epoch": 0.5727146814404432, "grad_norm": 3.475856304168701, "learning_rate": 7.826692849485198e-06, "loss": 1.2284, "step": 827 }, { "epoch": 0.5734072022160664, "grad_norm": 3.8289895057678223, "learning_rate": 7.805313825465232e-06, "loss": 1.1896, "step": 828 }, { "epoch": 0.5740997229916898, "grad_norm": 3.8057913780212402, "learning_rate": 7.783945334969452e-06, "loss": 1.2574, "step": 829 }, { "epoch": 0.574792243767313, "grad_norm": 3.7936198711395264, "learning_rate": 7.762587480557175e-06, "loss": 1.2529, "step": 830 }, { "epoch": 0.5754847645429363, "grad_norm": 3.666149377822876, "learning_rate": 7.74124036473667e-06, "loss": 1.3672, "step": 831 }, { "epoch": 0.5761772853185596, "grad_norm": 3.881035089492798, "learning_rate": 7.719904089964658e-06, "loss": 1.119, "step": 832 }, { "epoch": 0.5768698060941828, "grad_norm": 3.367007255554199, "learning_rate": 7.698578758645838e-06, "loss": 1.4159, "step": 833 }, { "epoch": 0.5775623268698061, "grad_norm": 3.0489461421966553, "learning_rate": 7.67726447313238e-06, "loss": 1.3289, "step": 834 }, { "epoch": 0.5782548476454293, "grad_norm": 3.567683696746826, "learning_rate": 7.655961335723432e-06, "loss": 1.0198, "step": 835 }, { "epoch": 0.5789473684210527, "grad_norm": 3.55120849609375, "learning_rate": 7.634669448664655e-06, "loss": 1.3868, "step": 836 }, { "epoch": 0.5796398891966759, "grad_norm": 3.100919246673584, "learning_rate": 7.61338891414769e-06, "loss": 1.3426, "step": 837 }, { "epoch": 0.5803324099722992, "grad_norm": 2.899195432662964, "learning_rate": 7.5921198343097145e-06, "loss": 1.2532, "step": 838 }, { "epoch": 0.5810249307479224, "grad_norm": 3.1293692588806152, "learning_rate": 7.570862311232909e-06, "loss": 1.2963, "step": 839 }, { "epoch": 0.5817174515235457, "grad_norm": 3.7493510246276855, "learning_rate": 7.549616446943992e-06, "loss": 1.2741, "step": 840 }, { "epoch": 0.582409972299169, "grad_norm": 3.714538335800171, "learning_rate": 7.528382343413734e-06, "loss": 1.3811, "step": 841 }, { "epoch": 0.5831024930747922, "grad_norm": 3.606914520263672, "learning_rate": 7.507160102556451e-06, "loss": 1.2921, "step": 842 }, { "epoch": 0.5837950138504155, "grad_norm": 3.9541878700256348, "learning_rate": 7.485949826229518e-06, "loss": 1.1239, "step": 843 }, { "epoch": 0.5844875346260388, "grad_norm": 3.4801037311553955, "learning_rate": 7.464751616232902e-06, "loss": 1.32, "step": 844 }, { "epoch": 0.5851800554016621, "grad_norm": 3.1288328170776367, "learning_rate": 7.443565574308638e-06, "loss": 1.208, "step": 845 }, { "epoch": 0.5858725761772853, "grad_norm": 3.5429086685180664, "learning_rate": 7.422391802140376e-06, "loss": 1.1675, "step": 846 }, { "epoch": 0.5865650969529086, "grad_norm": 3.190911293029785, "learning_rate": 7.401230401352867e-06, "loss": 1.2133, "step": 847 }, { "epoch": 0.5872576177285319, "grad_norm": 3.729435920715332, "learning_rate": 7.380081473511484e-06, "loss": 1.1991, "step": 848 }, { "epoch": 0.5879501385041551, "grad_norm": 3.054455280303955, "learning_rate": 7.358945120121743e-06, "loss": 1.1601, "step": 849 }, { "epoch": 0.5886426592797784, "grad_norm": 3.179671287536621, "learning_rate": 7.337821442628805e-06, "loss": 1.3278, "step": 850 }, { "epoch": 0.5893351800554016, "grad_norm": 3.9718713760375977, "learning_rate": 7.316710542416983e-06, "loss": 1.2899, "step": 851 }, { "epoch": 0.590027700831025, "grad_norm": 3.6802737712860107, "learning_rate": 7.295612520809281e-06, "loss": 1.3197, "step": 852 }, { "epoch": 0.5907202216066482, "grad_norm": 4.423649311065674, "learning_rate": 7.274527479066883e-06, "loss": 1.4049, "step": 853 }, { "epoch": 0.5914127423822715, "grad_norm": 3.5196075439453125, "learning_rate": 7.253455518388668e-06, "loss": 1.2989, "step": 854 }, { "epoch": 0.5921052631578947, "grad_norm": 3.266993284225464, "learning_rate": 7.232396739910746e-06, "loss": 1.2234, "step": 855 }, { "epoch": 0.592797783933518, "grad_norm": 3.2850778102874756, "learning_rate": 7.211351244705947e-06, "loss": 1.2039, "step": 856 }, { "epoch": 0.5934903047091413, "grad_norm": 3.7952563762664795, "learning_rate": 7.190319133783359e-06, "loss": 1.2871, "step": 857 }, { "epoch": 0.5941828254847645, "grad_norm": 3.411527633666992, "learning_rate": 7.169300508087815e-06, "loss": 1.3137, "step": 858 }, { "epoch": 0.5948753462603878, "grad_norm": 3.390535593032837, "learning_rate": 7.148295468499438e-06, "loss": 1.3028, "step": 859 }, { "epoch": 0.5955678670360111, "grad_norm": 3.034454107284546, "learning_rate": 7.127304115833141e-06, "loss": 1.1951, "step": 860 }, { "epoch": 0.5962603878116344, "grad_norm": 3.163402557373047, "learning_rate": 7.106326550838145e-06, "loss": 1.4047, "step": 861 }, { "epoch": 0.5969529085872576, "grad_norm": 3.626631498336792, "learning_rate": 7.08536287419749e-06, "loss": 1.1003, "step": 862 }, { "epoch": 0.5976454293628809, "grad_norm": 3.2359683513641357, "learning_rate": 7.0644131865275675e-06, "loss": 1.3011, "step": 863 }, { "epoch": 0.5983379501385041, "grad_norm": 2.9554173946380615, "learning_rate": 7.043477588377623e-06, "loss": 1.31, "step": 864 }, { "epoch": 0.5990304709141274, "grad_norm": 3.3498265743255615, "learning_rate": 7.022556180229285e-06, "loss": 1.1903, "step": 865 }, { "epoch": 0.5997229916897507, "grad_norm": 2.4006574153900146, "learning_rate": 7.001649062496065e-06, "loss": 0.8033, "step": 866 }, { "epoch": 0.600415512465374, "grad_norm": 3.5354299545288086, "learning_rate": 6.980756335522893e-06, "loss": 1.2626, "step": 867 }, { "epoch": 0.6011080332409973, "grad_norm": 2.9956421852111816, "learning_rate": 6.959878099585634e-06, "loss": 1.2707, "step": 868 }, { "epoch": 0.6018005540166205, "grad_norm": 3.3528342247009277, "learning_rate": 6.939014454890594e-06, "loss": 0.978, "step": 869 }, { "epoch": 0.6024930747922438, "grad_norm": 3.61551833152771, "learning_rate": 6.918165501574051e-06, "loss": 1.327, "step": 870 }, { "epoch": 0.603185595567867, "grad_norm": 3.31195068359375, "learning_rate": 6.897331339701776e-06, "loss": 1.1225, "step": 871 }, { "epoch": 0.6038781163434903, "grad_norm": 2.8019473552703857, "learning_rate": 6.876512069268541e-06, "loss": 1.2743, "step": 872 }, { "epoch": 0.6045706371191135, "grad_norm": 3.4144673347473145, "learning_rate": 6.855707790197644e-06, "loss": 1.2898, "step": 873 }, { "epoch": 0.6052631578947368, "grad_norm": 3.687757730484009, "learning_rate": 6.834918602340439e-06, "loss": 1.1372, "step": 874 }, { "epoch": 0.6059556786703602, "grad_norm": 3.283721923828125, "learning_rate": 6.814144605475842e-06, "loss": 1.3031, "step": 875 }, { "epoch": 0.6066481994459834, "grad_norm": 3.4214913845062256, "learning_rate": 6.793385899309866e-06, "loss": 1.1862, "step": 876 }, { "epoch": 0.6073407202216067, "grad_norm": 3.3713910579681396, "learning_rate": 6.772642583475126e-06, "loss": 1.281, "step": 877 }, { "epoch": 0.6080332409972299, "grad_norm": 3.9484262466430664, "learning_rate": 6.751914757530375e-06, "loss": 1.2102, "step": 878 }, { "epoch": 0.6087257617728532, "grad_norm": 3.3405299186706543, "learning_rate": 6.731202520960025e-06, "loss": 1.3257, "step": 879 }, { "epoch": 0.6094182825484764, "grad_norm": 3.453195810317993, "learning_rate": 6.7105059731736645e-06, "loss": 1.2954, "step": 880 }, { "epoch": 0.6101108033240997, "grad_norm": 3.2071444988250732, "learning_rate": 6.689825213505572e-06, "loss": 1.2726, "step": 881 }, { "epoch": 0.610803324099723, "grad_norm": 3.140890121459961, "learning_rate": 6.669160341214265e-06, "loss": 1.3308, "step": 882 }, { "epoch": 0.6114958448753463, "grad_norm": 3.415692090988159, "learning_rate": 6.648511455482003e-06, "loss": 1.1971, "step": 883 }, { "epoch": 0.6121883656509696, "grad_norm": 3.596778392791748, "learning_rate": 6.627878655414311e-06, "loss": 1.3473, "step": 884 }, { "epoch": 0.6128808864265928, "grad_norm": 3.2050392627716064, "learning_rate": 6.607262040039519e-06, "loss": 1.2164, "step": 885 }, { "epoch": 0.6135734072022161, "grad_norm": 3.888380765914917, "learning_rate": 6.586661708308273e-06, "loss": 1.2232, "step": 886 }, { "epoch": 0.6142659279778393, "grad_norm": 2.849752902984619, "learning_rate": 6.56607775909307e-06, "loss": 1.3409, "step": 887 }, { "epoch": 0.6149584487534626, "grad_norm": 2.974795341491699, "learning_rate": 6.5455102911877665e-06, "loss": 1.3312, "step": 888 }, { "epoch": 0.6156509695290858, "grad_norm": 3.666857957839966, "learning_rate": 6.524959403307125e-06, "loss": 1.1317, "step": 889 }, { "epoch": 0.6163434903047091, "grad_norm": 3.64629864692688, "learning_rate": 6.504425194086334e-06, "loss": 1.4063, "step": 890 }, { "epoch": 0.6170360110803325, "grad_norm": 3.5016064643859863, "learning_rate": 6.483907762080526e-06, "loss": 1.2191, "step": 891 }, { "epoch": 0.6177285318559557, "grad_norm": 3.065629243850708, "learning_rate": 6.4634072057643045e-06, "loss": 1.181, "step": 892 }, { "epoch": 0.618421052631579, "grad_norm": 4.297072887420654, "learning_rate": 6.442923623531294e-06, "loss": 1.1937, "step": 893 }, { "epoch": 0.6191135734072022, "grad_norm": 3.8993167877197266, "learning_rate": 6.422457113693633e-06, "loss": 1.1729, "step": 894 }, { "epoch": 0.6198060941828255, "grad_norm": 3.1856400966644287, "learning_rate": 6.402007774481536e-06, "loss": 1.2613, "step": 895 }, { "epoch": 0.6204986149584487, "grad_norm": 3.68772554397583, "learning_rate": 6.381575704042792e-06, "loss": 1.1512, "step": 896 }, { "epoch": 0.621191135734072, "grad_norm": 3.291123151779175, "learning_rate": 6.361161000442313e-06, "loss": 1.2249, "step": 897 }, { "epoch": 0.6218836565096952, "grad_norm": 3.7242109775543213, "learning_rate": 6.340763761661665e-06, "loss": 1.1158, "step": 898 }, { "epoch": 0.6225761772853186, "grad_norm": 3.25076961517334, "learning_rate": 6.320384085598581e-06, "loss": 1.2239, "step": 899 }, { "epoch": 0.6232686980609419, "grad_norm": 3.5171546936035156, "learning_rate": 6.3000220700664985e-06, "loss": 1.2539, "step": 900 }, { "epoch": 0.6239612188365651, "grad_norm": 3.7872190475463867, "learning_rate": 6.2796778127941025e-06, "loss": 1.3232, "step": 901 }, { "epoch": 0.6246537396121884, "grad_norm": 3.50396466255188, "learning_rate": 6.259351411424849e-06, "loss": 1.2813, "step": 902 }, { "epoch": 0.6253462603878116, "grad_norm": 3.1483330726623535, "learning_rate": 6.239042963516471e-06, "loss": 1.2651, "step": 903 }, { "epoch": 0.6260387811634349, "grad_norm": 3.2591357231140137, "learning_rate": 6.218752566540555e-06, "loss": 1.1838, "step": 904 }, { "epoch": 0.6267313019390581, "grad_norm": 3.9565536975860596, "learning_rate": 6.1984803178820456e-06, "loss": 1.0281, "step": 905 }, { "epoch": 0.6274238227146814, "grad_norm": 3.1557421684265137, "learning_rate": 6.17822631483878e-06, "loss": 1.2428, "step": 906 }, { "epoch": 0.6281163434903048, "grad_norm": 3.933734655380249, "learning_rate": 6.157990654621025e-06, "loss": 1.31, "step": 907 }, { "epoch": 0.628808864265928, "grad_norm": 3.057314395904541, "learning_rate": 6.137773434351009e-06, "loss": 1.2082, "step": 908 }, { "epoch": 0.6295013850415513, "grad_norm": 2.9611635208129883, "learning_rate": 6.11757475106246e-06, "loss": 1.2675, "step": 909 }, { "epoch": 0.6301939058171745, "grad_norm": 3.71175217628479, "learning_rate": 6.097394701700146e-06, "loss": 1.3373, "step": 910 }, { "epoch": 0.6308864265927978, "grad_norm": 3.4437453746795654, "learning_rate": 6.077233383119372e-06, "loss": 1.3137, "step": 911 }, { "epoch": 0.631578947368421, "grad_norm": 3.245744466781616, "learning_rate": 6.057090892085571e-06, "loss": 1.1768, "step": 912 }, { "epoch": 0.6322714681440443, "grad_norm": 3.2878289222717285, "learning_rate": 6.036967325273807e-06, "loss": 1.344, "step": 913 }, { "epoch": 0.6329639889196675, "grad_norm": 3.1663408279418945, "learning_rate": 6.016862779268301e-06, "loss": 1.2041, "step": 914 }, { "epoch": 0.6336565096952909, "grad_norm": 3.6326308250427246, "learning_rate": 5.996777350561997e-06, "loss": 1.3128, "step": 915 }, { "epoch": 0.6343490304709142, "grad_norm": 3.5937588214874268, "learning_rate": 5.976711135556086e-06, "loss": 1.2553, "step": 916 }, { "epoch": 0.6350415512465374, "grad_norm": 3.206855535507202, "learning_rate": 5.956664230559532e-06, "loss": 1.2699, "step": 917 }, { "epoch": 0.6357340720221607, "grad_norm": 3.8456692695617676, "learning_rate": 5.936636731788621e-06, "loss": 1.2254, "step": 918 }, { "epoch": 0.6364265927977839, "grad_norm": 4.080446720123291, "learning_rate": 5.916628735366505e-06, "loss": 1.314, "step": 919 }, { "epoch": 0.6371191135734072, "grad_norm": 3.2942323684692383, "learning_rate": 5.896640337322725e-06, "loss": 1.2532, "step": 920 }, { "epoch": 0.6378116343490304, "grad_norm": 3.4992527961730957, "learning_rate": 5.876671633592769e-06, "loss": 1.4546, "step": 921 }, { "epoch": 0.6385041551246537, "grad_norm": 3.231581687927246, "learning_rate": 5.8567227200175865e-06, "loss": 1.1727, "step": 922 }, { "epoch": 0.639196675900277, "grad_norm": 3.426119565963745, "learning_rate": 5.836793692343154e-06, "loss": 1.322, "step": 923 }, { "epoch": 0.6398891966759003, "grad_norm": 3.254599094390869, "learning_rate": 5.816884646219997e-06, "loss": 1.3904, "step": 924 }, { "epoch": 0.6405817174515236, "grad_norm": 3.066844940185547, "learning_rate": 5.7969956772027535e-06, "loss": 1.3591, "step": 925 }, { "epoch": 0.6412742382271468, "grad_norm": 3.5856401920318604, "learning_rate": 5.7771268807496794e-06, "loss": 1.0619, "step": 926 }, { "epoch": 0.6419667590027701, "grad_norm": 3.263921022415161, "learning_rate": 5.757278352222218e-06, "loss": 1.1964, "step": 927 }, { "epoch": 0.6426592797783933, "grad_norm": 3.6725008487701416, "learning_rate": 5.737450186884555e-06, "loss": 1.1908, "step": 928 }, { "epoch": 0.6433518005540166, "grad_norm": 3.5194790363311768, "learning_rate": 5.71764247990311e-06, "loss": 1.296, "step": 929 }, { "epoch": 0.6440443213296398, "grad_norm": 4.3180413246154785, "learning_rate": 5.6978553263461265e-06, "loss": 1.1636, "step": 930 }, { "epoch": 0.6447368421052632, "grad_norm": 3.6422464847564697, "learning_rate": 5.678088821183212e-06, "loss": 1.221, "step": 931 }, { "epoch": 0.6454293628808865, "grad_norm": 3.5116426944732666, "learning_rate": 5.6583430592848565e-06, "loss": 1.1897, "step": 932 }, { "epoch": 0.6461218836565097, "grad_norm": 3.304812431335449, "learning_rate": 5.638618135421986e-06, "loss": 1.2334, "step": 933 }, { "epoch": 0.646814404432133, "grad_norm": 3.4453015327453613, "learning_rate": 5.618914144265532e-06, "loss": 1.347, "step": 934 }, { "epoch": 0.6475069252077562, "grad_norm": 4.002638816833496, "learning_rate": 5.599231180385945e-06, "loss": 1.337, "step": 935 }, { "epoch": 0.6481994459833795, "grad_norm": 4.314698696136475, "learning_rate": 5.579569338252758e-06, "loss": 1.1725, "step": 936 }, { "epoch": 0.6488919667590027, "grad_norm": 3.4308438301086426, "learning_rate": 5.559928712234126e-06, "loss": 1.3876, "step": 937 }, { "epoch": 0.649584487534626, "grad_norm": 3.4807419776916504, "learning_rate": 5.5403093965963806e-06, "loss": 1.1882, "step": 938 }, { "epoch": 0.6502770083102493, "grad_norm": 3.439375638961792, "learning_rate": 5.520711485503569e-06, "loss": 1.2106, "step": 939 }, { "epoch": 0.6509695290858726, "grad_norm": 3.515831708908081, "learning_rate": 5.501135073017008e-06, "loss": 1.2495, "step": 940 }, { "epoch": 0.6516620498614959, "grad_norm": 4.047608852386475, "learning_rate": 5.481580253094828e-06, "loss": 1.2475, "step": 941 }, { "epoch": 0.6523545706371191, "grad_norm": 3.4397003650665283, "learning_rate": 5.4620471195915304e-06, "loss": 1.259, "step": 942 }, { "epoch": 0.6530470914127424, "grad_norm": 3.3549153804779053, "learning_rate": 5.4425357662575255e-06, "loss": 1.2415, "step": 943 }, { "epoch": 0.6537396121883656, "grad_norm": 3.8778305053710938, "learning_rate": 5.42304628673869e-06, "loss": 1.2269, "step": 944 }, { "epoch": 0.6544321329639889, "grad_norm": 3.3690638542175293, "learning_rate": 5.403578774575919e-06, "loss": 1.295, "step": 945 }, { "epoch": 0.6551246537396122, "grad_norm": 3.7962167263031006, "learning_rate": 5.384133323204666e-06, "loss": 1.2703, "step": 946 }, { "epoch": 0.6558171745152355, "grad_norm": 3.735410451889038, "learning_rate": 5.36471002595452e-06, "loss": 1.2357, "step": 947 }, { "epoch": 0.6565096952908587, "grad_norm": 3.9717748165130615, "learning_rate": 5.345308976048715e-06, "loss": 1.17, "step": 948 }, { "epoch": 0.657202216066482, "grad_norm": 3.596519708633423, "learning_rate": 5.325930266603724e-06, "loss": 1.3412, "step": 949 }, { "epoch": 0.6578947368421053, "grad_norm": 3.551438331604004, "learning_rate": 5.306573990628796e-06, "loss": 1.1691, "step": 950 }, { "epoch": 0.6585872576177285, "grad_norm": 4.270081043243408, "learning_rate": 5.287240241025509e-06, "loss": 1.358, "step": 951 }, { "epoch": 0.6592797783933518, "grad_norm": 3.3043808937072754, "learning_rate": 5.267929110587308e-06, "loss": 1.2799, "step": 952 }, { "epoch": 0.659972299168975, "grad_norm": 3.735628604888916, "learning_rate": 5.248640691999099e-06, "loss": 1.1946, "step": 953 }, { "epoch": 0.6606648199445984, "grad_norm": 3.4741759300231934, "learning_rate": 5.229375077836769e-06, "loss": 1.3411, "step": 954 }, { "epoch": 0.6613573407202216, "grad_norm": 3.6146764755249023, "learning_rate": 5.210132360566756e-06, "loss": 1.1197, "step": 955 }, { "epoch": 0.6620498614958449, "grad_norm": 3.68607497215271, "learning_rate": 5.190912632545599e-06, "loss": 1.3933, "step": 956 }, { "epoch": 0.6627423822714681, "grad_norm": 3.1479551792144775, "learning_rate": 5.171715986019509e-06, "loss": 1.3279, "step": 957 }, { "epoch": 0.6634349030470914, "grad_norm": 3.4101524353027344, "learning_rate": 5.152542513123906e-06, "loss": 1.354, "step": 958 }, { "epoch": 0.6641274238227147, "grad_norm": 3.9616758823394775, "learning_rate": 5.13339230588299e-06, "loss": 1.2024, "step": 959 }, { "epoch": 0.6648199445983379, "grad_norm": 3.6341795921325684, "learning_rate": 5.114265456209297e-06, "loss": 1.0029, "step": 960 }, { "epoch": 0.6655124653739612, "grad_norm": 3.9162726402282715, "learning_rate": 5.095162055903257e-06, "loss": 1.1473, "step": 961 }, { "epoch": 0.6662049861495845, "grad_norm": 3.362164258956909, "learning_rate": 5.076082196652754e-06, "loss": 1.3375, "step": 962 }, { "epoch": 0.6668975069252078, "grad_norm": 3.8291680812835693, "learning_rate": 5.057025970032683e-06, "loss": 1.3385, "step": 963 }, { "epoch": 0.667590027700831, "grad_norm": 3.777529239654541, "learning_rate": 5.037993467504515e-06, "loss": 1.3639, "step": 964 }, { "epoch": 0.6682825484764543, "grad_norm": 3.950564384460449, "learning_rate": 5.01898478041585e-06, "loss": 1.2299, "step": 965 }, { "epoch": 0.6689750692520776, "grad_norm": 4.049495697021484, "learning_rate": 5.000000000000003e-06, "loss": 1.2618, "step": 966 }, { "epoch": 0.6696675900277008, "grad_norm": 3.7501418590545654, "learning_rate": 4.98103921737552e-06, "loss": 1.2776, "step": 967 }, { "epoch": 0.6703601108033241, "grad_norm": 3.297684669494629, "learning_rate": 4.962102523545782e-06, "loss": 1.2447, "step": 968 }, { "epoch": 0.6710526315789473, "grad_norm": 3.3546786308288574, "learning_rate": 4.9431900093985655e-06, "loss": 1.201, "step": 969 }, { "epoch": 0.6717451523545707, "grad_norm": 4.063179969787598, "learning_rate": 4.92430176570558e-06, "loss": 1.257, "step": 970 }, { "epoch": 0.6724376731301939, "grad_norm": 3.4167563915252686, "learning_rate": 4.905437883122044e-06, "loss": 1.2553, "step": 971 }, { "epoch": 0.6731301939058172, "grad_norm": 4.087886810302734, "learning_rate": 4.88659845218627e-06, "loss": 1.4056, "step": 972 }, { "epoch": 0.6738227146814404, "grad_norm": 3.643519163131714, "learning_rate": 4.867783563319206e-06, "loss": 1.1488, "step": 973 }, { "epoch": 0.6745152354570637, "grad_norm": 3.689683675765991, "learning_rate": 4.848993306823997e-06, "loss": 1.1962, "step": 974 }, { "epoch": 0.675207756232687, "grad_norm": 3.7413206100463867, "learning_rate": 4.8302277728855826e-06, "loss": 1.1533, "step": 975 }, { "epoch": 0.6759002770083102, "grad_norm": 2.7265217304229736, "learning_rate": 4.811487051570235e-06, "loss": 0.7533, "step": 976 }, { "epoch": 0.6765927977839336, "grad_norm": 3.708652973175049, "learning_rate": 4.792771232825137e-06, "loss": 1.3488, "step": 977 }, { "epoch": 0.6772853185595568, "grad_norm": 3.641941547393799, "learning_rate": 4.774080406477954e-06, "loss": 1.2899, "step": 978 }, { "epoch": 0.6779778393351801, "grad_norm": 3.654796838760376, "learning_rate": 4.755414662236392e-06, "loss": 1.268, "step": 979 }, { "epoch": 0.6786703601108033, "grad_norm": 3.293848752975464, "learning_rate": 4.736774089687781e-06, "loss": 1.253, "step": 980 }, { "epoch": 0.6793628808864266, "grad_norm": 3.528782367706299, "learning_rate": 4.718158778298635e-06, "loss": 1.1901, "step": 981 }, { "epoch": 0.6800554016620498, "grad_norm": 4.274374485015869, "learning_rate": 4.699568817414224e-06, "loss": 1.2461, "step": 982 }, { "epoch": 0.6807479224376731, "grad_norm": 3.8092503547668457, "learning_rate": 4.681004296258151e-06, "loss": 1.2979, "step": 983 }, { "epoch": 0.6814404432132964, "grad_norm": 3.6626265048980713, "learning_rate": 4.662465303931912e-06, "loss": 1.2795, "step": 984 }, { "epoch": 0.6821329639889196, "grad_norm": 3.7979207038879395, "learning_rate": 4.643951929414493e-06, "loss": 1.2645, "step": 985 }, { "epoch": 0.682825484764543, "grad_norm": 3.516573429107666, "learning_rate": 4.625464261561902e-06, "loss": 1.1531, "step": 986 }, { "epoch": 0.6835180055401662, "grad_norm": 3.2881338596343994, "learning_rate": 4.607002389106777e-06, "loss": 1.263, "step": 987 }, { "epoch": 0.6842105263157895, "grad_norm": 3.467747688293457, "learning_rate": 4.588566400657965e-06, "loss": 1.2436, "step": 988 }, { "epoch": 0.6849030470914127, "grad_norm": 3.760425090789795, "learning_rate": 4.570156384700053e-06, "loss": 1.324, "step": 989 }, { "epoch": 0.685595567867036, "grad_norm": 3.4610824584960938, "learning_rate": 4.551772429592987e-06, "loss": 1.1936, "step": 990 }, { "epoch": 0.6862880886426593, "grad_norm": 3.2501208782196045, "learning_rate": 4.533414623571637e-06, "loss": 1.2801, "step": 991 }, { "epoch": 0.6869806094182825, "grad_norm": 3.383007526397705, "learning_rate": 4.515083054745363e-06, "loss": 1.2488, "step": 992 }, { "epoch": 0.6876731301939059, "grad_norm": 3.851450204849243, "learning_rate": 4.496777811097588e-06, "loss": 1.1017, "step": 993 }, { "epoch": 0.6883656509695291, "grad_norm": 3.6036598682403564, "learning_rate": 4.4784989804854055e-06, "loss": 1.2557, "step": 994 }, { "epoch": 0.6890581717451524, "grad_norm": 3.6002819538116455, "learning_rate": 4.460246650639125e-06, "loss": 1.2561, "step": 995 }, { "epoch": 0.6897506925207756, "grad_norm": 3.8944315910339355, "learning_rate": 4.4420209091618675e-06, "loss": 1.1098, "step": 996 }, { "epoch": 0.6904432132963989, "grad_norm": 3.2508392333984375, "learning_rate": 4.423821843529139e-06, "loss": 1.1746, "step": 997 }, { "epoch": 0.6911357340720221, "grad_norm": 3.485062837600708, "learning_rate": 4.405649541088419e-06, "loss": 1.2441, "step": 998 }, { "epoch": 0.6918282548476454, "grad_norm": 4.058342456817627, "learning_rate": 4.38750408905873e-06, "loss": 1.0997, "step": 999 }, { "epoch": 0.6925207756232687, "grad_norm": 3.4277503490448, "learning_rate": 4.369385574530227e-06, "loss": 1.3599, "step": 1000 }, { "epoch": 0.693213296398892, "grad_norm": 4.020823955535889, "learning_rate": 4.351294084463776e-06, "loss": 1.0891, "step": 1001 }, { "epoch": 0.6939058171745153, "grad_norm": 3.4967589378356934, "learning_rate": 4.3332297056905385e-06, "loss": 1.292, "step": 1002 }, { "epoch": 0.6945983379501385, "grad_norm": 4.008986949920654, "learning_rate": 4.315192524911551e-06, "loss": 1.2815, "step": 1003 }, { "epoch": 0.6952908587257618, "grad_norm": 3.5449891090393066, "learning_rate": 4.2971826286973175e-06, "loss": 1.2599, "step": 1004 }, { "epoch": 0.695983379501385, "grad_norm": 3.296858310699463, "learning_rate": 4.279200103487381e-06, "loss": 1.1863, "step": 1005 }, { "epoch": 0.6966759002770083, "grad_norm": 3.776249408721924, "learning_rate": 4.261245035589917e-06, "loss": 1.0698, "step": 1006 }, { "epoch": 0.6973684210526315, "grad_norm": 3.578810214996338, "learning_rate": 4.2433175111813305e-06, "loss": 1.2139, "step": 1007 }, { "epoch": 0.6980609418282548, "grad_norm": 3.9288926124572754, "learning_rate": 4.225417616305809e-06, "loss": 1.321, "step": 1008 }, { "epoch": 0.6987534626038782, "grad_norm": 3.7822978496551514, "learning_rate": 4.207545436874941e-06, "loss": 1.212, "step": 1009 }, { "epoch": 0.6994459833795014, "grad_norm": 3.7697253227233887, "learning_rate": 4.189701058667301e-06, "loss": 1.2954, "step": 1010 }, { "epoch": 0.7001385041551247, "grad_norm": 3.6546411514282227, "learning_rate": 4.171884567328021e-06, "loss": 1.1255, "step": 1011 }, { "epoch": 0.7008310249307479, "grad_norm": 3.8267335891723633, "learning_rate": 4.15409604836838e-06, "loss": 1.2291, "step": 1012 }, { "epoch": 0.7015235457063712, "grad_norm": 3.5930862426757812, "learning_rate": 4.136335587165421e-06, "loss": 1.3605, "step": 1013 }, { "epoch": 0.7022160664819944, "grad_norm": 3.3282723426818848, "learning_rate": 4.118603268961509e-06, "loss": 1.2689, "step": 1014 }, { "epoch": 0.7029085872576177, "grad_norm": 3.3957300186157227, "learning_rate": 4.100899178863938e-06, "loss": 1.3074, "step": 1015 }, { "epoch": 0.703601108033241, "grad_norm": 4.040927410125732, "learning_rate": 4.083223401844523e-06, "loss": 1.2394, "step": 1016 }, { "epoch": 0.7042936288088643, "grad_norm": 3.9132001399993896, "learning_rate": 4.065576022739181e-06, "loss": 1.2725, "step": 1017 }, { "epoch": 0.7049861495844876, "grad_norm": 4.065426826477051, "learning_rate": 4.047957126247542e-06, "loss": 1.078, "step": 1018 }, { "epoch": 0.7056786703601108, "grad_norm": 3.6188275814056396, "learning_rate": 4.030366796932521e-06, "loss": 1.2997, "step": 1019 }, { "epoch": 0.7063711911357341, "grad_norm": 3.8960142135620117, "learning_rate": 4.0128051192199325e-06, "loss": 1.2801, "step": 1020 }, { "epoch": 0.7070637119113573, "grad_norm": 3.781181812286377, "learning_rate": 3.99527217739807e-06, "loss": 1.1776, "step": 1021 }, { "epoch": 0.7077562326869806, "grad_norm": 3.5254998207092285, "learning_rate": 3.97776805561731e-06, "loss": 1.3147, "step": 1022 }, { "epoch": 0.7084487534626038, "grad_norm": 3.9870445728302, "learning_rate": 3.960292837889707e-06, "loss": 1.2304, "step": 1023 }, { "epoch": 0.7091412742382271, "grad_norm": 3.8298988342285156, "learning_rate": 3.942846608088583e-06, "loss": 1.3364, "step": 1024 }, { "epoch": 0.7098337950138505, "grad_norm": 4.035478591918945, "learning_rate": 3.925429449948134e-06, "loss": 1.2999, "step": 1025 }, { "epoch": 0.7105263157894737, "grad_norm": 3.4640512466430664, "learning_rate": 3.908041447063034e-06, "loss": 1.309, "step": 1026 }, { "epoch": 0.711218836565097, "grad_norm": 3.686994791030884, "learning_rate": 3.8906826828880085e-06, "loss": 1.2533, "step": 1027 }, { "epoch": 0.7119113573407202, "grad_norm": 3.6794652938842773, "learning_rate": 3.8733532407374555e-06, "loss": 1.3765, "step": 1028 }, { "epoch": 0.7126038781163435, "grad_norm": 3.3836612701416016, "learning_rate": 3.856053203785049e-06, "loss": 1.2114, "step": 1029 }, { "epoch": 0.7132963988919667, "grad_norm": 3.39042329788208, "learning_rate": 3.838782655063326e-06, "loss": 1.2425, "step": 1030 }, { "epoch": 0.71398891966759, "grad_norm": 3.2604761123657227, "learning_rate": 3.82154167746328e-06, "loss": 1.3216, "step": 1031 }, { "epoch": 0.7146814404432132, "grad_norm": 3.7054238319396973, "learning_rate": 3.804330353733998e-06, "loss": 1.1362, "step": 1032 }, { "epoch": 0.7153739612188366, "grad_norm": 3.187572479248047, "learning_rate": 3.7871487664822326e-06, "loss": 1.2687, "step": 1033 }, { "epoch": 0.7160664819944599, "grad_norm": 2.562303304672241, "learning_rate": 3.769996998172002e-06, "loss": 0.7703, "step": 1034 }, { "epoch": 0.7167590027700831, "grad_norm": 3.7910683155059814, "learning_rate": 3.7528751311242273e-06, "loss": 1.1611, "step": 1035 }, { "epoch": 0.7174515235457064, "grad_norm": 3.786738157272339, "learning_rate": 3.735783247516305e-06, "loss": 1.2285, "step": 1036 }, { "epoch": 0.7181440443213296, "grad_norm": 3.5773112773895264, "learning_rate": 3.718721429381724e-06, "loss": 1.2091, "step": 1037 }, { "epoch": 0.7188365650969529, "grad_norm": 4.090264320373535, "learning_rate": 3.7016897586096778e-06, "loss": 1.2265, "step": 1038 }, { "epoch": 0.7195290858725761, "grad_norm": 4.514917850494385, "learning_rate": 3.6846883169446624e-06, "loss": 1.3236, "step": 1039 }, { "epoch": 0.7202216066481995, "grad_norm": 4.169983386993408, "learning_rate": 3.667717185986085e-06, "loss": 1.2036, "step": 1040 }, { "epoch": 0.7209141274238227, "grad_norm": 2.4571571350097656, "learning_rate": 3.6507764471878804e-06, "loss": 0.6646, "step": 1041 }, { "epoch": 0.721606648199446, "grad_norm": 3.5560426712036133, "learning_rate": 3.63386618185811e-06, "loss": 1.196, "step": 1042 }, { "epoch": 0.7222991689750693, "grad_norm": 3.649376153945923, "learning_rate": 3.616986471158579e-06, "loss": 1.2761, "step": 1043 }, { "epoch": 0.7229916897506925, "grad_norm": 3.541705846786499, "learning_rate": 3.6001373961044385e-06, "loss": 1.2419, "step": 1044 }, { "epoch": 0.7236842105263158, "grad_norm": 3.4428951740264893, "learning_rate": 3.583319037563816e-06, "loss": 1.2972, "step": 1045 }, { "epoch": 0.724376731301939, "grad_norm": 3.983161687850952, "learning_rate": 3.5665314762573933e-06, "loss": 1.2115, "step": 1046 }, { "epoch": 0.7250692520775623, "grad_norm": 3.9475114345550537, "learning_rate": 3.5497747927580495e-06, "loss": 1.1102, "step": 1047 }, { "epoch": 0.7257617728531855, "grad_norm": 3.721270799636841, "learning_rate": 3.5330490674904737e-06, "loss": 1.2487, "step": 1048 }, { "epoch": 0.7264542936288089, "grad_norm": 3.9590468406677246, "learning_rate": 3.516354380730749e-06, "loss": 1.1853, "step": 1049 }, { "epoch": 0.7271468144044322, "grad_norm": 3.30857515335083, "learning_rate": 3.499690812605997e-06, "loss": 1.0928, "step": 1050 }, { "epoch": 0.7278393351800554, "grad_norm": 3.6948113441467285, "learning_rate": 3.4830584430939896e-06, "loss": 1.3328, "step": 1051 }, { "epoch": 0.7285318559556787, "grad_norm": 3.596212148666382, "learning_rate": 3.4664573520227564e-06, "loss": 1.2525, "step": 1052 }, { "epoch": 0.7292243767313019, "grad_norm": 3.829021692276001, "learning_rate": 3.449887619070188e-06, "loss": 1.2454, "step": 1053 }, { "epoch": 0.7299168975069252, "grad_norm": 3.797069787979126, "learning_rate": 3.433349323763696e-06, "loss": 1.2858, "step": 1054 }, { "epoch": 0.7306094182825484, "grad_norm": 3.8871891498565674, "learning_rate": 3.4168425454797884e-06, "loss": 1.1735, "step": 1055 }, { "epoch": 0.7313019390581718, "grad_norm": 3.7732465267181396, "learning_rate": 3.4003673634437084e-06, "loss": 1.2215, "step": 1056 }, { "epoch": 0.731994459833795, "grad_norm": 3.984323024749756, "learning_rate": 3.3839238567290523e-06, "loss": 1.1836, "step": 1057 }, { "epoch": 0.7326869806094183, "grad_norm": 3.9881927967071533, "learning_rate": 3.367512104257391e-06, "loss": 1.1729, "step": 1058 }, { "epoch": 0.7333795013850416, "grad_norm": 3.331230640411377, "learning_rate": 3.351132184797884e-06, "loss": 1.1262, "step": 1059 }, { "epoch": 0.7340720221606648, "grad_norm": 3.5761208534240723, "learning_rate": 3.334784176966912e-06, "loss": 1.2797, "step": 1060 }, { "epoch": 0.7347645429362881, "grad_norm": 3.5858423709869385, "learning_rate": 3.318468159227689e-06, "loss": 1.3824, "step": 1061 }, { "epoch": 0.7354570637119113, "grad_norm": 3.6287832260131836, "learning_rate": 3.3021842098898938e-06, "loss": 1.3382, "step": 1062 }, { "epoch": 0.7361495844875346, "grad_norm": 3.945394277572632, "learning_rate": 3.2859324071092902e-06, "loss": 1.2343, "step": 1063 }, { "epoch": 0.7368421052631579, "grad_norm": 3.6410295963287354, "learning_rate": 3.2697128288873536e-06, "loss": 1.1632, "step": 1064 }, { "epoch": 0.7375346260387812, "grad_norm": 4.534976482391357, "learning_rate": 3.253525553070893e-06, "loss": 1.2305, "step": 1065 }, { "epoch": 0.7382271468144044, "grad_norm": 4.022517204284668, "learning_rate": 3.2373706573516795e-06, "loss": 1.3596, "step": 1066 }, { "epoch": 0.7389196675900277, "grad_norm": 3.2530150413513184, "learning_rate": 3.2212482192660876e-06, "loss": 1.235, "step": 1067 }, { "epoch": 0.739612188365651, "grad_norm": 3.64341402053833, "learning_rate": 3.2051583161946865e-06, "loss": 1.1267, "step": 1068 }, { "epoch": 0.7403047091412742, "grad_norm": 4.010177135467529, "learning_rate": 3.189101025361905e-06, "loss": 1.0941, "step": 1069 }, { "epoch": 0.7409972299168975, "grad_norm": 3.3516669273376465, "learning_rate": 3.1730764238356517e-06, "loss": 1.2421, "step": 1070 }, { "epoch": 0.7416897506925207, "grad_norm": 3.6677613258361816, "learning_rate": 3.1570845885269365e-06, "loss": 1.3087, "step": 1071 }, { "epoch": 0.7423822714681441, "grad_norm": 3.4726169109344482, "learning_rate": 3.141125596189494e-06, "loss": 1.2735, "step": 1072 }, { "epoch": 0.7430747922437673, "grad_norm": 4.0901079177856445, "learning_rate": 3.125199523419449e-06, "loss": 1.2798, "step": 1073 }, { "epoch": 0.7437673130193906, "grad_norm": 3.457503318786621, "learning_rate": 3.109306446654912e-06, "loss": 1.2239, "step": 1074 }, { "epoch": 0.7444598337950139, "grad_norm": 4.589022159576416, "learning_rate": 3.0934464421756306e-06, "loss": 1.2632, "step": 1075 }, { "epoch": 0.7451523545706371, "grad_norm": 4.605857849121094, "learning_rate": 3.0776195861026202e-06, "loss": 1.1476, "step": 1076 }, { "epoch": 0.7458448753462604, "grad_norm": 3.9767348766326904, "learning_rate": 3.0618259543978e-06, "loss": 1.0512, "step": 1077 }, { "epoch": 0.7465373961218836, "grad_norm": 3.565870761871338, "learning_rate": 3.0460656228636254e-06, "loss": 1.2016, "step": 1078 }, { "epoch": 0.747229916897507, "grad_norm": 3.557732105255127, "learning_rate": 3.030338667142726e-06, "loss": 1.2852, "step": 1079 }, { "epoch": 0.7479224376731302, "grad_norm": 3.5791172981262207, "learning_rate": 3.014645162717542e-06, "loss": 1.2961, "step": 1080 }, { "epoch": 0.7486149584487535, "grad_norm": 4.24458646774292, "learning_rate": 2.9989851849099595e-06, "loss": 1.1436, "step": 1081 }, { "epoch": 0.7493074792243767, "grad_norm": 3.791700601577759, "learning_rate": 2.9833588088809627e-06, "loss": 1.1667, "step": 1082 }, { "epoch": 0.75, "grad_norm": 4.004382610321045, "learning_rate": 2.9677661096302467e-06, "loss": 1.1902, "step": 1083 }, { "epoch": 0.75, "eval_loss": 1.1204665899276733, "eval_runtime": 339.3016, "eval_samples_per_second": 4.173, "eval_steps_per_second": 0.522, "step": 1083 }, { "epoch": 0.7506925207756233, "grad_norm": 4.0300984382629395, "learning_rate": 2.952207161995879e-06, "loss": 1.1181, "step": 1084 }, { "epoch": 0.7513850415512465, "grad_norm": 4.063140392303467, "learning_rate": 2.936682040653942e-06, "loss": 1.2998, "step": 1085 }, { "epoch": 0.7520775623268698, "grad_norm": 3.3608922958374023, "learning_rate": 2.9211908201181592e-06, "loss": 1.2074, "step": 1086 }, { "epoch": 0.752770083102493, "grad_norm": 3.313950777053833, "learning_rate": 2.905733574739542e-06, "loss": 1.2484, "step": 1087 }, { "epoch": 0.7534626038781164, "grad_norm": 3.8276939392089844, "learning_rate": 2.8903103787060395e-06, "loss": 1.1704, "step": 1088 }, { "epoch": 0.7541551246537396, "grad_norm": 4.115742206573486, "learning_rate": 2.8749213060421854e-06, "loss": 1.2243, "step": 1089 }, { "epoch": 0.7548476454293629, "grad_norm": 4.2053022384643555, "learning_rate": 2.8595664306087313e-06, "loss": 1.2551, "step": 1090 }, { "epoch": 0.7555401662049861, "grad_norm": 3.9844744205474854, "learning_rate": 2.8442458261022867e-06, "loss": 1.1682, "step": 1091 }, { "epoch": 0.7562326869806094, "grad_norm": 3.5984349250793457, "learning_rate": 2.828959566054996e-06, "loss": 1.1658, "step": 1092 }, { "epoch": 0.7569252077562327, "grad_norm": 4.055597305297852, "learning_rate": 2.8137077238341526e-06, "loss": 1.1369, "step": 1093 }, { "epoch": 0.7576177285318559, "grad_norm": 3.7269349098205566, "learning_rate": 2.79849037264186e-06, "loss": 1.2058, "step": 1094 }, { "epoch": 0.7583102493074793, "grad_norm": 3.704430341720581, "learning_rate": 2.783307585514683e-06, "loss": 1.2248, "step": 1095 }, { "epoch": 0.7590027700831025, "grad_norm": 3.74150013923645, "learning_rate": 2.7681594353232934e-06, "loss": 1.2354, "step": 1096 }, { "epoch": 0.7596952908587258, "grad_norm": 4.019088268280029, "learning_rate": 2.75304599477212e-06, "loss": 1.2765, "step": 1097 }, { "epoch": 0.760387811634349, "grad_norm": 3.8129332065582275, "learning_rate": 2.737967336399002e-06, "loss": 1.2936, "step": 1098 }, { "epoch": 0.7610803324099723, "grad_norm": 4.70414400100708, "learning_rate": 2.7229235325748394e-06, "loss": 1.1368, "step": 1099 }, { "epoch": 0.7617728531855956, "grad_norm": 4.109380722045898, "learning_rate": 2.707914655503242e-06, "loss": 1.2475, "step": 1100 }, { "epoch": 0.7624653739612188, "grad_norm": 4.041932106018066, "learning_rate": 2.6929407772201986e-06, "loss": 1.1576, "step": 1101 }, { "epoch": 0.7631578947368421, "grad_norm": 3.5189266204833984, "learning_rate": 2.678001969593701e-06, "loss": 1.1453, "step": 1102 }, { "epoch": 0.7638504155124654, "grad_norm": 3.557849407196045, "learning_rate": 2.663098304323429e-06, "loss": 1.195, "step": 1103 }, { "epoch": 0.7645429362880887, "grad_norm": 4.041141510009766, "learning_rate": 2.6482298529403973e-06, "loss": 1.0855, "step": 1104 }, { "epoch": 0.7652354570637119, "grad_norm": 3.6010632514953613, "learning_rate": 2.633396686806604e-06, "loss": 1.1841, "step": 1105 }, { "epoch": 0.7659279778393352, "grad_norm": 3.5304527282714844, "learning_rate": 2.6185988771146864e-06, "loss": 1.208, "step": 1106 }, { "epoch": 0.7666204986149584, "grad_norm": 3.7058682441711426, "learning_rate": 2.603836494887603e-06, "loss": 1.2629, "step": 1107 }, { "epoch": 0.7673130193905817, "grad_norm": 4.019807815551758, "learning_rate": 2.5891096109782644e-06, "loss": 1.2371, "step": 1108 }, { "epoch": 0.768005540166205, "grad_norm": 4.549735069274902, "learning_rate": 2.5744182960692087e-06, "loss": 1.0124, "step": 1109 }, { "epoch": 0.7686980609418282, "grad_norm": 4.584209442138672, "learning_rate": 2.5597626206722583e-06, "loss": 1.134, "step": 1110 }, { "epoch": 0.7693905817174516, "grad_norm": 3.772904872894287, "learning_rate": 2.54514265512818e-06, "loss": 1.1927, "step": 1111 }, { "epoch": 0.7700831024930748, "grad_norm": 4.115574359893799, "learning_rate": 2.530558469606351e-06, "loss": 1.1982, "step": 1112 }, { "epoch": 0.7707756232686981, "grad_norm": 4.603262901306152, "learning_rate": 2.5160101341044195e-06, "loss": 1.0217, "step": 1113 }, { "epoch": 0.7714681440443213, "grad_norm": 3.315068483352661, "learning_rate": 2.5014977184479696e-06, "loss": 1.2076, "step": 1114 }, { "epoch": 0.7721606648199446, "grad_norm": 4.200598239898682, "learning_rate": 2.487021292290186e-06, "loss": 1.2153, "step": 1115 }, { "epoch": 0.7728531855955678, "grad_norm": 3.8972954750061035, "learning_rate": 2.4725809251115208e-06, "loss": 1.2619, "step": 1116 }, { "epoch": 0.7735457063711911, "grad_norm": 4.2333831787109375, "learning_rate": 2.4581766862193556e-06, "loss": 1.1756, "step": 1117 }, { "epoch": 0.7742382271468145, "grad_norm": 4.078391075134277, "learning_rate": 2.443808644747675e-06, "loss": 1.2905, "step": 1118 }, { "epoch": 0.7749307479224377, "grad_norm": 3.5866286754608154, "learning_rate": 2.4294768696567293e-06, "loss": 1.2961, "step": 1119 }, { "epoch": 0.775623268698061, "grad_norm": 4.606243133544922, "learning_rate": 2.4151814297327157e-06, "loss": 1.3393, "step": 1120 }, { "epoch": 0.7763157894736842, "grad_norm": 4.044128894805908, "learning_rate": 2.4009223935874237e-06, "loss": 1.1318, "step": 1121 }, { "epoch": 0.7770083102493075, "grad_norm": 3.6675806045532227, "learning_rate": 2.386699829657928e-06, "loss": 1.2092, "step": 1122 }, { "epoch": 0.7777008310249307, "grad_norm": 3.6052377223968506, "learning_rate": 2.372513806206258e-06, "loss": 1.1631, "step": 1123 }, { "epoch": 0.778393351800554, "grad_norm": 4.187111854553223, "learning_rate": 2.35836439131906e-06, "loss": 1.048, "step": 1124 }, { "epoch": 0.7790858725761773, "grad_norm": 3.9402425289154053, "learning_rate": 2.3442516529072677e-06, "loss": 1.1767, "step": 1125 }, { "epoch": 0.7797783933518005, "grad_norm": 4.503572940826416, "learning_rate": 2.330175658705799e-06, "loss": 1.1202, "step": 1126 }, { "epoch": 0.7804709141274239, "grad_norm": 3.9013688564300537, "learning_rate": 2.3161364762732095e-06, "loss": 1.1952, "step": 1127 }, { "epoch": 0.7811634349030471, "grad_norm": 3.9454150199890137, "learning_rate": 2.3021341729913625e-06, "loss": 1.2963, "step": 1128 }, { "epoch": 0.7818559556786704, "grad_norm": 4.15333890914917, "learning_rate": 2.2881688160651406e-06, "loss": 1.2271, "step": 1129 }, { "epoch": 0.7825484764542936, "grad_norm": 3.8262038230895996, "learning_rate": 2.2742404725220823e-06, "loss": 1.2192, "step": 1130 }, { "epoch": 0.7832409972299169, "grad_norm": 4.1932244300842285, "learning_rate": 2.260349209212086e-06, "loss": 1.2081, "step": 1131 }, { "epoch": 0.7839335180055401, "grad_norm": 3.71751070022583, "learning_rate": 2.246495092807077e-06, "loss": 1.3275, "step": 1132 }, { "epoch": 0.7846260387811634, "grad_norm": 4.331583023071289, "learning_rate": 2.232678189800698e-06, "loss": 1.279, "step": 1133 }, { "epoch": 0.7853185595567868, "grad_norm": 4.326261043548584, "learning_rate": 2.2188985665079777e-06, "loss": 1.3143, "step": 1134 }, { "epoch": 0.78601108033241, "grad_norm": 3.9819676876068115, "learning_rate": 2.20515628906502e-06, "loss": 1.1659, "step": 1135 }, { "epoch": 0.7867036011080333, "grad_norm": 3.6455228328704834, "learning_rate": 2.19145142342869e-06, "loss": 1.1818, "step": 1136 }, { "epoch": 0.7873961218836565, "grad_norm": 3.962939500808716, "learning_rate": 2.177784035376286e-06, "loss": 1.1648, "step": 1137 }, { "epoch": 0.7880886426592798, "grad_norm": 4.336730003356934, "learning_rate": 2.164154190505231e-06, "loss": 1.2002, "step": 1138 }, { "epoch": 0.788781163434903, "grad_norm": 3.8926541805267334, "learning_rate": 2.150561954232768e-06, "loss": 1.1932, "step": 1139 }, { "epoch": 0.7894736842105263, "grad_norm": 4.326043605804443, "learning_rate": 2.13700739179562e-06, "loss": 1.1593, "step": 1140 }, { "epoch": 0.7901662049861495, "grad_norm": 3.7204015254974365, "learning_rate": 2.1234905682496987e-06, "loss": 1.1474, "step": 1141 }, { "epoch": 0.7908587257617729, "grad_norm": 4.206140041351318, "learning_rate": 2.1100115484697946e-06, "loss": 1.1646, "step": 1142 }, { "epoch": 0.7915512465373962, "grad_norm": 4.0659308433532715, "learning_rate": 2.096570397149239e-06, "loss": 1.2382, "step": 1143 }, { "epoch": 0.7922437673130194, "grad_norm": 4.224088191986084, "learning_rate": 2.083167178799623e-06, "loss": 1.2274, "step": 1144 }, { "epoch": 0.7929362880886427, "grad_norm": 3.992633819580078, "learning_rate": 2.069801957750479e-06, "loss": 1.1936, "step": 1145 }, { "epoch": 0.7936288088642659, "grad_norm": 4.030457019805908, "learning_rate": 2.0564747981489643e-06, "loss": 1.0835, "step": 1146 }, { "epoch": 0.7943213296398892, "grad_norm": 4.128167629241943, "learning_rate": 2.043185763959549e-06, "loss": 1.0168, "step": 1147 }, { "epoch": 0.7950138504155124, "grad_norm": 4.305471897125244, "learning_rate": 2.029934918963735e-06, "loss": 1.1999, "step": 1148 }, { "epoch": 0.7957063711911357, "grad_norm": 3.8589119911193848, "learning_rate": 2.0167223267597246e-06, "loss": 1.208, "step": 1149 }, { "epoch": 0.796398891966759, "grad_norm": 3.898130416870117, "learning_rate": 2.0035480507621217e-06, "loss": 1.2731, "step": 1150 }, { "epoch": 0.7970914127423823, "grad_norm": 4.046352386474609, "learning_rate": 1.990412154201633e-06, "loss": 1.3092, "step": 1151 }, { "epoch": 0.7977839335180056, "grad_norm": 3.7665176391601562, "learning_rate": 1.9773147001247583e-06, "loss": 1.1797, "step": 1152 }, { "epoch": 0.7984764542936288, "grad_norm": 5.006435394287109, "learning_rate": 1.9642557513934935e-06, "loss": 1.0088, "step": 1153 }, { "epoch": 0.7991689750692521, "grad_norm": 3.8915364742279053, "learning_rate": 1.951235370685023e-06, "loss": 1.1999, "step": 1154 }, { "epoch": 0.7998614958448753, "grad_norm": 3.539031744003296, "learning_rate": 1.9382536204914214e-06, "loss": 1.2408, "step": 1155 }, { "epoch": 0.8005540166204986, "grad_norm": 3.7190475463867188, "learning_rate": 1.925310563119358e-06, "loss": 1.1743, "step": 1156 }, { "epoch": 0.8012465373961218, "grad_norm": 3.969273090362549, "learning_rate": 1.9124062606897884e-06, "loss": 1.2168, "step": 1157 }, { "epoch": 0.8019390581717452, "grad_norm": 4.360279083251953, "learning_rate": 1.899540775137666e-06, "loss": 1.3146, "step": 1158 }, { "epoch": 0.8026315789473685, "grad_norm": 3.9002370834350586, "learning_rate": 1.8867141682116373e-06, "loss": 1.2197, "step": 1159 }, { "epoch": 0.8033240997229917, "grad_norm": 4.110561370849609, "learning_rate": 1.8739265014737473e-06, "loss": 1.2671, "step": 1160 }, { "epoch": 0.804016620498615, "grad_norm": 4.611395835876465, "learning_rate": 1.861177836299155e-06, "loss": 1.1325, "step": 1161 }, { "epoch": 0.8047091412742382, "grad_norm": 3.9032986164093018, "learning_rate": 1.8484682338758154e-06, "loss": 1.3979, "step": 1162 }, { "epoch": 0.8054016620498615, "grad_norm": 4.64093542098999, "learning_rate": 1.835797755204205e-06, "loss": 1.1017, "step": 1163 }, { "epoch": 0.8060941828254847, "grad_norm": 4.68850564956665, "learning_rate": 1.8231664610970301e-06, "loss": 1.2508, "step": 1164 }, { "epoch": 0.806786703601108, "grad_norm": 3.6796510219573975, "learning_rate": 1.8105744121789226e-06, "loss": 1.3439, "step": 1165 }, { "epoch": 0.8074792243767313, "grad_norm": 3.682265043258667, "learning_rate": 1.7980216688861475e-06, "loss": 1.1941, "step": 1166 }, { "epoch": 0.8081717451523546, "grad_norm": 4.359194278717041, "learning_rate": 1.7855082914663346e-06, "loss": 1.2085, "step": 1167 }, { "epoch": 0.8088642659279779, "grad_norm": 4.421899318695068, "learning_rate": 1.773034339978167e-06, "loss": 1.2169, "step": 1168 }, { "epoch": 0.8095567867036011, "grad_norm": 3.645191192626953, "learning_rate": 1.7605998742911001e-06, "loss": 1.2932, "step": 1169 }, { "epoch": 0.8102493074792244, "grad_norm": 4.023017406463623, "learning_rate": 1.7482049540850788e-06, "loss": 1.145, "step": 1170 }, { "epoch": 0.8109418282548476, "grad_norm": 3.481224536895752, "learning_rate": 1.7358496388502422e-06, "loss": 1.2044, "step": 1171 }, { "epoch": 0.8116343490304709, "grad_norm": 3.7499403953552246, "learning_rate": 1.7235339878866474e-06, "loss": 1.1941, "step": 1172 }, { "epoch": 0.8123268698060941, "grad_norm": 3.887758255004883, "learning_rate": 1.7112580603039785e-06, "loss": 1.115, "step": 1173 }, { "epoch": 0.8130193905817175, "grad_norm": 3.799678087234497, "learning_rate": 1.6990219150212662e-06, "loss": 1.3243, "step": 1174 }, { "epoch": 0.8137119113573407, "grad_norm": 4.975963592529297, "learning_rate": 1.6868256107666015e-06, "loss": 1.1648, "step": 1175 }, { "epoch": 0.814404432132964, "grad_norm": 3.937077045440674, "learning_rate": 1.6746692060768576e-06, "loss": 1.2042, "step": 1176 }, { "epoch": 0.8150969529085873, "grad_norm": 4.324456691741943, "learning_rate": 1.6625527592974077e-06, "loss": 1.1491, "step": 1177 }, { "epoch": 0.8157894736842105, "grad_norm": 4.019317150115967, "learning_rate": 1.6504763285818437e-06, "loss": 1.2842, "step": 1178 }, { "epoch": 0.8164819944598338, "grad_norm": 3.58640718460083, "learning_rate": 1.6384399718916977e-06, "loss": 1.3005, "step": 1179 }, { "epoch": 0.817174515235457, "grad_norm": 4.04345703125, "learning_rate": 1.6264437469961703e-06, "loss": 1.1709, "step": 1180 }, { "epoch": 0.8178670360110804, "grad_norm": 3.9605162143707275, "learning_rate": 1.614487711471835e-06, "loss": 1.2854, "step": 1181 }, { "epoch": 0.8185595567867036, "grad_norm": 4.292003631591797, "learning_rate": 1.6025719227023817e-06, "loss": 1.1602, "step": 1182 }, { "epoch": 0.8192520775623269, "grad_norm": 4.0546417236328125, "learning_rate": 1.5906964378783373e-06, "loss": 1.3041, "step": 1183 }, { "epoch": 0.8199445983379502, "grad_norm": 4.742153644561768, "learning_rate": 1.5788613139967846e-06, "loss": 1.1946, "step": 1184 }, { "epoch": 0.8206371191135734, "grad_norm": 3.6266913414001465, "learning_rate": 1.5670666078610809e-06, "loss": 1.255, "step": 1185 }, { "epoch": 0.8213296398891967, "grad_norm": 3.7363014221191406, "learning_rate": 1.5553123760806144e-06, "loss": 1.2767, "step": 1186 }, { "epoch": 0.8220221606648199, "grad_norm": 3.828493118286133, "learning_rate": 1.5435986750705046e-06, "loss": 1.3323, "step": 1187 }, { "epoch": 0.8227146814404432, "grad_norm": 4.154706001281738, "learning_rate": 1.531925561051336e-06, "loss": 1.1205, "step": 1188 }, { "epoch": 0.8234072022160664, "grad_norm": 3.972463607788086, "learning_rate": 1.5202930900489056e-06, "loss": 1.2494, "step": 1189 }, { "epoch": 0.8240997229916898, "grad_norm": 4.380939960479736, "learning_rate": 1.5087013178939347e-06, "loss": 1.2353, "step": 1190 }, { "epoch": 0.824792243767313, "grad_norm": 3.84091854095459, "learning_rate": 1.497150300221808e-06, "loss": 1.1958, "step": 1191 }, { "epoch": 0.8254847645429363, "grad_norm": 3.6779820919036865, "learning_rate": 1.485640092472308e-06, "loss": 1.2238, "step": 1192 }, { "epoch": 0.8261772853185596, "grad_norm": 3.7688772678375244, "learning_rate": 1.4741707498893487e-06, "loss": 1.1069, "step": 1193 }, { "epoch": 0.8268698060941828, "grad_norm": 4.308027744293213, "learning_rate": 1.4627423275207075e-06, "loss": 1.1624, "step": 1194 }, { "epoch": 0.8275623268698061, "grad_norm": 3.8426570892333984, "learning_rate": 1.4513548802177635e-06, "loss": 1.3091, "step": 1195 }, { "epoch": 0.8282548476454293, "grad_norm": 3.7444517612457275, "learning_rate": 1.440008462635234e-06, "loss": 1.1742, "step": 1196 }, { "epoch": 0.8289473684210527, "grad_norm": 3.861121892929077, "learning_rate": 1.4287031292309105e-06, "loss": 1.0395, "step": 1197 }, { "epoch": 0.8296398891966759, "grad_norm": 3.4704298973083496, "learning_rate": 1.4174389342653972e-06, "loss": 1.2084, "step": 1198 }, { "epoch": 0.8303324099722992, "grad_norm": 3.99474835395813, "learning_rate": 1.406215931801862e-06, "loss": 1.1811, "step": 1199 }, { "epoch": 0.8310249307479224, "grad_norm": 3.7253665924072266, "learning_rate": 1.395034175705753e-06, "loss": 1.0946, "step": 1200 }, { "epoch": 0.8317174515235457, "grad_norm": 3.6922335624694824, "learning_rate": 1.383893719644558e-06, "loss": 1.183, "step": 1201 }, { "epoch": 0.832409972299169, "grad_norm": 3.5496158599853516, "learning_rate": 1.3727946170875538e-06, "loss": 1.2656, "step": 1202 }, { "epoch": 0.8331024930747922, "grad_norm": 3.911721706390381, "learning_rate": 1.361736921305522e-06, "loss": 1.2568, "step": 1203 }, { "epoch": 0.8337950138504155, "grad_norm": 4.524016380310059, "learning_rate": 1.3507206853705178e-06, "loss": 1.1337, "step": 1204 }, { "epoch": 0.8344875346260388, "grad_norm": 4.609461784362793, "learning_rate": 1.339745962155613e-06, "loss": 1.228, "step": 1205 }, { "epoch": 0.8351800554016621, "grad_norm": 4.127496242523193, "learning_rate": 1.3288128043346315e-06, "loss": 1.155, "step": 1206 }, { "epoch": 0.8358725761772853, "grad_norm": 3.739071846008301, "learning_rate": 1.3179212643818928e-06, "loss": 1.1306, "step": 1207 }, { "epoch": 0.8365650969529086, "grad_norm": 4.528608798980713, "learning_rate": 1.307071394571986e-06, "loss": 1.2538, "step": 1208 }, { "epoch": 0.8372576177285319, "grad_norm": 4.651935577392578, "learning_rate": 1.2962632469794901e-06, "loss": 1.241, "step": 1209 }, { "epoch": 0.8379501385041551, "grad_norm": 3.754818916320801, "learning_rate": 1.2854968734787398e-06, "loss": 1.1738, "step": 1210 }, { "epoch": 0.8386426592797784, "grad_norm": 4.032466888427734, "learning_rate": 1.2747723257435729e-06, "loss": 1.3379, "step": 1211 }, { "epoch": 0.8393351800554016, "grad_norm": 3.8999366760253906, "learning_rate": 1.2640896552470795e-06, "loss": 1.2087, "step": 1212 }, { "epoch": 0.840027700831025, "grad_norm": 5.117029666900635, "learning_rate": 1.2534489132613603e-06, "loss": 1.0604, "step": 1213 }, { "epoch": 0.8407202216066482, "grad_norm": 3.513732433319092, "learning_rate": 1.2428501508572765e-06, "loss": 1.2153, "step": 1214 }, { "epoch": 0.8414127423822715, "grad_norm": 4.629495620727539, "learning_rate": 1.232293418904207e-06, "loss": 1.0813, "step": 1215 }, { "epoch": 0.8421052631578947, "grad_norm": 3.8420591354370117, "learning_rate": 1.221778768069799e-06, "loss": 1.1894, "step": 1216 }, { "epoch": 0.842797783933518, "grad_norm": 4.045253753662109, "learning_rate": 1.2113062488197347e-06, "loss": 1.2621, "step": 1217 }, { "epoch": 0.8434903047091413, "grad_norm": 4.0366950035095215, "learning_rate": 1.2008759114174794e-06, "loss": 1.105, "step": 1218 }, { "epoch": 0.8441828254847645, "grad_norm": 4.136510372161865, "learning_rate": 1.1904878059240443e-06, "loss": 1.155, "step": 1219 }, { "epoch": 0.8448753462603878, "grad_norm": 3.633707284927368, "learning_rate": 1.1801419821977479e-06, "loss": 1.0978, "step": 1220 }, { "epoch": 0.8455678670360111, "grad_norm": 4.147572040557861, "learning_rate": 1.1698384898939774e-06, "loss": 1.2548, "step": 1221 }, { "epoch": 0.8462603878116344, "grad_norm": 4.219207286834717, "learning_rate": 1.159577378464939e-06, "loss": 1.0852, "step": 1222 }, { "epoch": 0.8469529085872576, "grad_norm": 3.746647596359253, "learning_rate": 1.149358697159435e-06, "loss": 1.2269, "step": 1223 }, { "epoch": 0.8476454293628809, "grad_norm": 3.936307430267334, "learning_rate": 1.1391824950226272e-06, "loss": 0.9683, "step": 1224 }, { "epoch": 0.8483379501385041, "grad_norm": 4.122901439666748, "learning_rate": 1.1290488208957894e-06, "loss": 1.1598, "step": 1225 }, { "epoch": 0.8490304709141274, "grad_norm": 4.317487716674805, "learning_rate": 1.1189577234160764e-06, "loss": 1.2321, "step": 1226 }, { "epoch": 0.8497229916897507, "grad_norm": 3.937451124191284, "learning_rate": 1.1089092510163025e-06, "loss": 1.253, "step": 1227 }, { "epoch": 0.850415512465374, "grad_norm": 3.7667171955108643, "learning_rate": 1.0989034519246956e-06, "loss": 1.1728, "step": 1228 }, { "epoch": 0.8511080332409973, "grad_norm": 4.663061618804932, "learning_rate": 1.088940374164672e-06, "loss": 1.3034, "step": 1229 }, { "epoch": 0.8518005540166205, "grad_norm": 4.142401218414307, "learning_rate": 1.0790200655546035e-06, "loss": 1.1516, "step": 1230 }, { "epoch": 0.8524930747922438, "grad_norm": 4.334724426269531, "learning_rate": 1.06914257370759e-06, "loss": 1.3624, "step": 1231 }, { "epoch": 0.853185595567867, "grad_norm": 5.013029098510742, "learning_rate": 1.0593079460312284e-06, "loss": 1.0803, "step": 1232 }, { "epoch": 0.8538781163434903, "grad_norm": 4.472379684448242, "learning_rate": 1.0495162297273876e-06, "loss": 0.9917, "step": 1233 }, { "epoch": 0.8545706371191135, "grad_norm": 4.156463146209717, "learning_rate": 1.0397674717919803e-06, "loss": 1.2364, "step": 1234 }, { "epoch": 0.8552631578947368, "grad_norm": 4.236593246459961, "learning_rate": 1.0300617190147388e-06, "loss": 1.2187, "step": 1235 }, { "epoch": 0.8559556786703602, "grad_norm": 4.179319381713867, "learning_rate": 1.0203990179789892e-06, "loss": 1.2905, "step": 1236 }, { "epoch": 0.8566481994459834, "grad_norm": 3.6013402938842773, "learning_rate": 1.010779415061428e-06, "loss": 1.1256, "step": 1237 }, { "epoch": 0.8573407202216067, "grad_norm": 4.2881340980529785, "learning_rate": 1.0012029564318993e-06, "loss": 1.1806, "step": 1238 }, { "epoch": 0.8580332409972299, "grad_norm": 3.446340322494507, "learning_rate": 9.916696880531739e-07, "loss": 1.1771, "step": 1239 }, { "epoch": 0.8587257617728532, "grad_norm": 3.927043914794922, "learning_rate": 9.82179655680734e-07, "loss": 1.1842, "step": 1240 }, { "epoch": 0.8594182825484764, "grad_norm": 4.7350616455078125, "learning_rate": 9.727329048625355e-07, "loss": 1.1911, "step": 1241 }, { "epoch": 0.8601108033240997, "grad_norm": 4.3509039878845215, "learning_rate": 9.633294809388127e-07, "loss": 1.1404, "step": 1242 }, { "epoch": 0.860803324099723, "grad_norm": 3.8866496086120605, "learning_rate": 9.539694290418488e-07, "loss": 1.1203, "step": 1243 }, { "epoch": 0.8614958448753463, "grad_norm": 4.465489387512207, "learning_rate": 9.446527940957573e-07, "loss": 1.2293, "step": 1244 }, { "epoch": 0.8621883656509696, "grad_norm": 4.468167304992676, "learning_rate": 9.353796208162669e-07, "loss": 1.1552, "step": 1245 }, { "epoch": 0.8628808864265928, "grad_norm": 4.596248626708984, "learning_rate": 9.261499537105179e-07, "loss": 1.252, "step": 1246 }, { "epoch": 0.8635734072022161, "grad_norm": 3.8985869884490967, "learning_rate": 9.169638370768341e-07, "loss": 1.1292, "step": 1247 }, { "epoch": 0.8642659279778393, "grad_norm": 5.140615463256836, "learning_rate": 9.078213150045145e-07, "loss": 1.2206, "step": 1248 }, { "epoch": 0.8649584487534626, "grad_norm": 4.030377388000488, "learning_rate": 8.987224313736309e-07, "loss": 1.2621, "step": 1249 }, { "epoch": 0.8656509695290858, "grad_norm": 4.131312370300293, "learning_rate": 8.896672298548037e-07, "loss": 1.2128, "step": 1250 }, { "epoch": 0.8663434903047091, "grad_norm": 4.123977184295654, "learning_rate": 8.806557539089988e-07, "loss": 1.2752, "step": 1251 }, { "epoch": 0.8670360110803325, "grad_norm": 4.873196125030518, "learning_rate": 8.716880467873235e-07, "loss": 1.1041, "step": 1252 }, { "epoch": 0.8677285318559557, "grad_norm": 3.93961238861084, "learning_rate": 8.627641515308072e-07, "loss": 1.2894, "step": 1253 }, { "epoch": 0.868421052631579, "grad_norm": 3.7824409008026123, "learning_rate": 8.538841109702079e-07, "loss": 1.2282, "step": 1254 }, { "epoch": 0.8691135734072022, "grad_norm": 4.6755757331848145, "learning_rate": 8.450479677257962e-07, "loss": 1.0459, "step": 1255 }, { "epoch": 0.8698060941828255, "grad_norm": 4.243119716644287, "learning_rate": 8.362557642071567e-07, "loss": 1.317, "step": 1256 }, { "epoch": 0.8704986149584487, "grad_norm": 4.0560479164123535, "learning_rate": 8.275075426129831e-07, "loss": 1.2543, "step": 1257 }, { "epoch": 0.871191135734072, "grad_norm": 3.3924214839935303, "learning_rate": 8.188033449308719e-07, "loss": 1.2128, "step": 1258 }, { "epoch": 0.8718836565096952, "grad_norm": 3.7904396057128906, "learning_rate": 8.101432129371345e-07, "loss": 1.1834, "step": 1259 }, { "epoch": 0.8725761772853186, "grad_norm": 3.563037395477295, "learning_rate": 8.015271881965714e-07, "loss": 1.2524, "step": 1260 }, { "epoch": 0.8732686980609419, "grad_norm": 3.5340735912323, "learning_rate": 7.929553120622968e-07, "loss": 1.1476, "step": 1261 }, { "epoch": 0.8739612188365651, "grad_norm": 4.195478916168213, "learning_rate": 7.844276256755345e-07, "loss": 1.2139, "step": 1262 }, { "epoch": 0.8746537396121884, "grad_norm": 4.329362392425537, "learning_rate": 7.759441699654068e-07, "loss": 1.1694, "step": 1263 }, { "epoch": 0.8753462603878116, "grad_norm": 3.519171953201294, "learning_rate": 7.675049856487549e-07, "loss": 1.1948, "step": 1264 }, { "epoch": 0.8760387811634349, "grad_norm": 3.8010988235473633, "learning_rate": 7.591101132299383e-07, "loss": 1.3021, "step": 1265 }, { "epoch": 0.8767313019390581, "grad_norm": 3.752256155014038, "learning_rate": 7.507595930006351e-07, "loss": 1.1363, "step": 1266 }, { "epoch": 0.8774238227146814, "grad_norm": 3.8671414852142334, "learning_rate": 7.42453465039652e-07, "loss": 1.1246, "step": 1267 }, { "epoch": 0.8781163434903048, "grad_norm": 4.748464584350586, "learning_rate": 7.341917692127354e-07, "loss": 1.2564, "step": 1268 }, { "epoch": 0.878808864265928, "grad_norm": 3.782492160797119, "learning_rate": 7.259745451723765e-07, "loss": 1.2389, "step": 1269 }, { "epoch": 0.8795013850415513, "grad_norm": 3.492922306060791, "learning_rate": 7.178018323576208e-07, "loss": 1.1598, "step": 1270 }, { "epoch": 0.8801939058171745, "grad_norm": 4.3590407371521, "learning_rate": 7.0967366999388e-07, "loss": 1.1275, "step": 1271 }, { "epoch": 0.8808864265927978, "grad_norm": 3.7808170318603516, "learning_rate": 7.015900970927414e-07, "loss": 1.2337, "step": 1272 }, { "epoch": 0.881578947368421, "grad_norm": 3.8861465454101562, "learning_rate": 6.935511524517835e-07, "loss": 1.2613, "step": 1273 }, { "epoch": 0.8822714681440443, "grad_norm": 4.897149562835693, "learning_rate": 6.85556874654395e-07, "loss": 1.0766, "step": 1274 }, { "epoch": 0.8829639889196675, "grad_norm": 3.961945056915283, "learning_rate": 6.776073020695728e-07, "loss": 1.178, "step": 1275 }, { "epoch": 0.8836565096952909, "grad_norm": 3.9338932037353516, "learning_rate": 6.697024728517531e-07, "loss": 1.2335, "step": 1276 }, { "epoch": 0.8843490304709142, "grad_norm": 4.2958598136901855, "learning_rate": 6.618424249406297e-07, "loss": 1.1101, "step": 1277 }, { "epoch": 0.8850415512465374, "grad_norm": 4.226284980773926, "learning_rate": 6.540271960609568e-07, "loss": 1.1793, "step": 1278 }, { "epoch": 0.8857340720221607, "grad_norm": 3.9522910118103027, "learning_rate": 6.462568237223787e-07, "loss": 1.2444, "step": 1279 }, { "epoch": 0.8864265927977839, "grad_norm": 4.591524600982666, "learning_rate": 6.385313452192554e-07, "loss": 1.233, "step": 1280 }, { "epoch": 0.8871191135734072, "grad_norm": 3.602266550064087, "learning_rate": 6.308507976304701e-07, "loss": 1.1108, "step": 1281 }, { "epoch": 0.8878116343490304, "grad_norm": 4.180389881134033, "learning_rate": 6.23215217819253e-07, "loss": 1.2063, "step": 1282 }, { "epoch": 0.8885041551246537, "grad_norm": 3.8051416873931885, "learning_rate": 6.156246424330215e-07, "loss": 1.3125, "step": 1283 }, { "epoch": 0.889196675900277, "grad_norm": 4.475870609283447, "learning_rate": 6.080791079031811e-07, "loss": 1.2985, "step": 1284 }, { "epoch": 0.8898891966759003, "grad_norm": 4.486598968505859, "learning_rate": 6.005786504449651e-07, "loss": 1.1294, "step": 1285 }, { "epoch": 0.8905817174515236, "grad_norm": 4.49049711227417, "learning_rate": 5.93123306057255e-07, "loss": 1.1075, "step": 1286 }, { "epoch": 0.8912742382271468, "grad_norm": 3.9002959728240967, "learning_rate": 5.857131105224123e-07, "loss": 1.2577, "step": 1287 }, { "epoch": 0.8919667590027701, "grad_norm": 5.098270416259766, "learning_rate": 5.783480994061019e-07, "loss": 1.137, "step": 1288 }, { "epoch": 0.8926592797783933, "grad_norm": 4.351301193237305, "learning_rate": 5.710283080571233e-07, "loss": 1.3018, "step": 1289 }, { "epoch": 0.8933518005540166, "grad_norm": 3.7987148761749268, "learning_rate": 5.637537716072416e-07, "loss": 1.322, "step": 1290 }, { "epoch": 0.8940443213296398, "grad_norm": 3.730034589767456, "learning_rate": 5.565245249710194e-07, "loss": 1.289, "step": 1291 }, { "epoch": 0.8947368421052632, "grad_norm": 3.9326162338256836, "learning_rate": 5.49340602845646e-07, "loss": 1.0907, "step": 1292 }, { "epoch": 0.8954293628808865, "grad_norm": 4.038126468658447, "learning_rate": 5.422020397107753e-07, "loss": 1.2528, "step": 1293 }, { "epoch": 0.8961218836565097, "grad_norm": 3.6822099685668945, "learning_rate": 5.351088698283557e-07, "loss": 1.2576, "step": 1294 }, { "epoch": 0.896814404432133, "grad_norm": 4.149168014526367, "learning_rate": 5.280611272424696e-07, "loss": 1.2217, "step": 1295 }, { "epoch": 0.8975069252077562, "grad_norm": 3.3951871395111084, "learning_rate": 5.21058845779171e-07, "loss": 1.27, "step": 1296 }, { "epoch": 0.8981994459833795, "grad_norm": 3.9425158500671387, "learning_rate": 5.141020590463142e-07, "loss": 1.1927, "step": 1297 }, { "epoch": 0.8988919667590027, "grad_norm": 3.875690460205078, "learning_rate": 5.071908004334025e-07, "loss": 1.2137, "step": 1298 }, { "epoch": 0.899584487534626, "grad_norm": 4.035533905029297, "learning_rate": 5.003251031114287e-07, "loss": 1.2056, "step": 1299 }, { "epoch": 0.9002770083102493, "grad_norm": 4.189031600952148, "learning_rate": 4.935050000327046e-07, "loss": 1.2325, "step": 1300 }, { "epoch": 0.9009695290858726, "grad_norm": 3.710629940032959, "learning_rate": 4.867305239307096e-07, "loss": 1.1215, "step": 1301 }, { "epoch": 0.9016620498614959, "grad_norm": 3.9204201698303223, "learning_rate": 4.800017073199403e-07, "loss": 1.2987, "step": 1302 }, { "epoch": 0.9023545706371191, "grad_norm": 4.115154266357422, "learning_rate": 4.73318582495742e-07, "loss": 1.1934, "step": 1303 }, { "epoch": 0.9030470914127424, "grad_norm": 4.008752822875977, "learning_rate": 4.666811815341643e-07, "loss": 0.9978, "step": 1304 }, { "epoch": 0.9037396121883656, "grad_norm": 4.517586708068848, "learning_rate": 4.6008953629179676e-07, "loss": 1.2414, "step": 1305 }, { "epoch": 0.9044321329639889, "grad_norm": 3.903789520263672, "learning_rate": 4.5354367840562684e-07, "loss": 1.2271, "step": 1306 }, { "epoch": 0.9051246537396122, "grad_norm": 3.9774749279022217, "learning_rate": 4.470436392928812e-07, "loss": 1.2613, "step": 1307 }, { "epoch": 0.9058171745152355, "grad_norm": 4.200412273406982, "learning_rate": 4.4058945015087453e-07, "loss": 1.2716, "step": 1308 }, { "epoch": 0.9065096952908587, "grad_norm": 4.395098686218262, "learning_rate": 4.341811419568653e-07, "loss": 1.1752, "step": 1309 }, { "epoch": 0.907202216066482, "grad_norm": 4.11729097366333, "learning_rate": 4.278187454679006e-07, "loss": 1.2215, "step": 1310 }, { "epoch": 0.9078947368421053, "grad_norm": 4.524949550628662, "learning_rate": 4.2150229122067565e-07, "loss": 1.1198, "step": 1311 }, { "epoch": 0.9085872576177285, "grad_norm": 3.813676595687866, "learning_rate": 4.1523180953137785e-07, "loss": 1.2266, "step": 1312 }, { "epoch": 0.9092797783933518, "grad_norm": 3.7839903831481934, "learning_rate": 4.090073304955511e-07, "loss": 1.2149, "step": 1313 }, { "epoch": 0.909972299168975, "grad_norm": 3.726961374282837, "learning_rate": 4.0282888398794353e-07, "loss": 1.2636, "step": 1314 }, { "epoch": 0.9106648199445984, "grad_norm": 3.8480069637298584, "learning_rate": 3.9669649966237347e-07, "loss": 1.1296, "step": 1315 }, { "epoch": 0.9113573407202216, "grad_norm": 4.21983528137207, "learning_rate": 3.906102069515727e-07, "loss": 1.181, "step": 1316 }, { "epoch": 0.9120498614958449, "grad_norm": 4.2239298820495605, "learning_rate": 3.845700350670567e-07, "loss": 1.1858, "step": 1317 }, { "epoch": 0.9127423822714681, "grad_norm": 4.391743183135986, "learning_rate": 3.785760129989868e-07, "loss": 1.1603, "step": 1318 }, { "epoch": 0.9134349030470914, "grad_norm": 4.167227745056152, "learning_rate": 3.7262816951602057e-07, "loss": 1.1548, "step": 1319 }, { "epoch": 0.9141274238227147, "grad_norm": 3.850712537765503, "learning_rate": 3.6672653316517595e-07, "loss": 1.2399, "step": 1320 }, { "epoch": 0.9148199445983379, "grad_norm": 4.206256866455078, "learning_rate": 3.6087113227170287e-07, "loss": 1.202, "step": 1321 }, { "epoch": 0.9155124653739612, "grad_norm": 4.473740577697754, "learning_rate": 3.5506199493894e-07, "loss": 1.2319, "step": 1322 }, { "epoch": 0.9162049861495845, "grad_norm": 3.7945244312286377, "learning_rate": 3.4929914904817677e-07, "loss": 1.2128, "step": 1323 }, { "epoch": 0.9168975069252078, "grad_norm": 3.6557018756866455, "learning_rate": 3.4358262225853255e-07, "loss": 1.1691, "step": 1324 }, { "epoch": 0.917590027700831, "grad_norm": 4.495970726013184, "learning_rate": 3.379124420068081e-07, "loss": 1.2153, "step": 1325 }, { "epoch": 0.9182825484764543, "grad_norm": 3.9685280323028564, "learning_rate": 3.32288635507364e-07, "loss": 1.2664, "step": 1326 }, { "epoch": 0.9189750692520776, "grad_norm": 3.6302640438079834, "learning_rate": 3.267112297519881e-07, "loss": 1.0025, "step": 1327 }, { "epoch": 0.9196675900277008, "grad_norm": 3.974372386932373, "learning_rate": 3.2118025150976394e-07, "loss": 1.1851, "step": 1328 }, { "epoch": 0.9203601108033241, "grad_norm": 3.8948326110839844, "learning_rate": 3.156957273269434e-07, "loss": 1.1475, "step": 1329 }, { "epoch": 0.9210526315789473, "grad_norm": 3.5848686695098877, "learning_rate": 3.102576835268212e-07, "loss": 1.2402, "step": 1330 }, { "epoch": 0.9217451523545707, "grad_norm": 3.9654040336608887, "learning_rate": 3.0486614620960476e-07, "loss": 1.1549, "step": 1331 }, { "epoch": 0.9224376731301939, "grad_norm": 3.918294668197632, "learning_rate": 2.995211412522914e-07, "loss": 1.1275, "step": 1332 }, { "epoch": 0.9231301939058172, "grad_norm": 4.397765159606934, "learning_rate": 2.942226943085424e-07, "loss": 1.1402, "step": 1333 }, { "epoch": 0.9238227146814404, "grad_norm": 4.527344703674316, "learning_rate": 2.8897083080856703e-07, "loss": 1.0616, "step": 1334 }, { "epoch": 0.9245152354570637, "grad_norm": 3.974189519882202, "learning_rate": 2.8376557595898635e-07, "loss": 1.1935, "step": 1335 }, { "epoch": 0.925207756232687, "grad_norm": 4.0205817222595215, "learning_rate": 2.786069547427239e-07, "loss": 1.2264, "step": 1336 }, { "epoch": 0.9259002770083102, "grad_norm": 4.807229518890381, "learning_rate": 2.7349499191888675e-07, "loss": 1.095, "step": 1337 }, { "epoch": 0.9265927977839336, "grad_norm": 4.34225606918335, "learning_rate": 2.6842971202263536e-07, "loss": 1.0819, "step": 1338 }, { "epoch": 0.9272853185595568, "grad_norm": 4.332900047302246, "learning_rate": 2.634111393650751e-07, "loss": 1.0125, "step": 1339 }, { "epoch": 0.9279778393351801, "grad_norm": 3.523615598678589, "learning_rate": 2.584392980331396e-07, "loss": 1.1952, "step": 1340 }, { "epoch": 0.9286703601108033, "grad_norm": 4.648331642150879, "learning_rate": 2.5351421188947287e-07, "loss": 0.9513, "step": 1341 }, { "epoch": 0.9293628808864266, "grad_norm": 4.426600456237793, "learning_rate": 2.4863590457230745e-07, "loss": 1.124, "step": 1342 }, { "epoch": 0.9300554016620498, "grad_norm": 4.5962233543396, "learning_rate": 2.438043994953687e-07, "loss": 1.1684, "step": 1343 }, { "epoch": 0.9307479224376731, "grad_norm": 3.893246650695801, "learning_rate": 2.3901971984774394e-07, "loss": 1.2581, "step": 1344 }, { "epoch": 0.9314404432132964, "grad_norm": 5.20170259475708, "learning_rate": 2.3428188859378253e-07, "loss": 0.9631, "step": 1345 }, { "epoch": 0.9321329639889196, "grad_norm": 3.8217201232910156, "learning_rate": 2.2959092847298358e-07, "loss": 1.1847, "step": 1346 }, { "epoch": 0.932825484764543, "grad_norm": 4.310759544372559, "learning_rate": 2.2494686199988069e-07, "loss": 1.2011, "step": 1347 }, { "epoch": 0.9335180055401662, "grad_norm": 3.6743648052215576, "learning_rate": 2.2034971146394302e-07, "loss": 1.324, "step": 1348 }, { "epoch": 0.9342105263157895, "grad_norm": 3.7658603191375732, "learning_rate": 2.1579949892946206e-07, "loss": 1.1968, "step": 1349 }, { "epoch": 0.9349030470914127, "grad_norm": 3.821629285812378, "learning_rate": 2.1129624623544843e-07, "loss": 1.3141, "step": 1350 }, { "epoch": 0.935595567867036, "grad_norm": 4.54724645614624, "learning_rate": 2.0683997499552632e-07, "loss": 1.0023, "step": 1351 }, { "epoch": 0.9362880886426593, "grad_norm": 3.9132909774780273, "learning_rate": 2.0243070659782705e-07, "loss": 1.186, "step": 1352 }, { "epoch": 0.9369806094182825, "grad_norm": 3.962965488433838, "learning_rate": 1.980684622048945e-07, "loss": 1.3023, "step": 1353 }, { "epoch": 0.9376731301939059, "grad_norm": 4.44710111618042, "learning_rate": 1.937532627535721e-07, "loss": 1.2159, "step": 1354 }, { "epoch": 0.9383656509695291, "grad_norm": 4.224870681762695, "learning_rate": 1.8948512895491156e-07, "loss": 1.2596, "step": 1355 }, { "epoch": 0.9390581717451524, "grad_norm": 4.837390422821045, "learning_rate": 1.8526408129407093e-07, "loss": 1.1943, "step": 1356 }, { "epoch": 0.9397506925207756, "grad_norm": 3.966322422027588, "learning_rate": 1.8109014003021453e-07, "loss": 1.3276, "step": 1357 }, { "epoch": 0.9404432132963989, "grad_norm": 4.658303260803223, "learning_rate": 1.7696332519641313e-07, "loss": 1.0963, "step": 1358 }, { "epoch": 0.9411357340720221, "grad_norm": 3.8974692821502686, "learning_rate": 1.7288365659956062e-07, "loss": 1.3048, "step": 1359 }, { "epoch": 0.9418282548476454, "grad_norm": 3.9904167652130127, "learning_rate": 1.6885115382026084e-07, "loss": 1.2654, "step": 1360 }, { "epoch": 0.9425207756232687, "grad_norm": 3.089048385620117, "learning_rate": 1.6486583621274532e-07, "loss": 0.6291, "step": 1361 }, { "epoch": 0.943213296398892, "grad_norm": 3.594970703125, "learning_rate": 1.609277229047801e-07, "loss": 1.2119, "step": 1362 }, { "epoch": 0.9439058171745153, "grad_norm": 4.219138145446777, "learning_rate": 1.5703683279756797e-07, "loss": 1.2162, "step": 1363 }, { "epoch": 0.9445983379501385, "grad_norm": 3.839979648590088, "learning_rate": 1.5319318456566424e-07, "loss": 1.246, "step": 1364 }, { "epoch": 0.9452908587257618, "grad_norm": 3.8669729232788086, "learning_rate": 1.49396796656881e-07, "loss": 1.3069, "step": 1365 }, { "epoch": 0.945983379501385, "grad_norm": 4.179749965667725, "learning_rate": 1.4564768729220414e-07, "loss": 1.2152, "step": 1366 }, { "epoch": 0.9466759002770083, "grad_norm": 3.3702120780944824, "learning_rate": 1.4194587446570206e-07, "loss": 1.1243, "step": 1367 }, { "epoch": 0.9473684210526315, "grad_norm": 4.31612491607666, "learning_rate": 1.3829137594444154e-07, "loss": 1.3198, "step": 1368 }, { "epoch": 0.9480609418282548, "grad_norm": 4.114274501800537, "learning_rate": 1.3468420926840198e-07, "loss": 1.2286, "step": 1369 }, { "epoch": 0.9487534626038782, "grad_norm": 3.7707936763763428, "learning_rate": 1.3112439175038794e-07, "loss": 1.1814, "step": 1370 }, { "epoch": 0.9494459833795014, "grad_norm": 4.673649787902832, "learning_rate": 1.276119404759535e-07, "loss": 1.159, "step": 1371 }, { "epoch": 0.9501385041551247, "grad_norm": 4.296148777008057, "learning_rate": 1.2414687230331124e-07, "loss": 1.2115, "step": 1372 }, { "epoch": 0.9508310249307479, "grad_norm": 4.370774269104004, "learning_rate": 1.2072920386325793e-07, "loss": 1.2973, "step": 1373 }, { "epoch": 0.9515235457063712, "grad_norm": 4.4123029708862305, "learning_rate": 1.1735895155909338e-07, "loss": 1.2401, "step": 1374 }, { "epoch": 0.9522160664819944, "grad_norm": 4.202178955078125, "learning_rate": 1.1403613156654058e-07, "loss": 1.1944, "step": 1375 }, { "epoch": 0.9529085872576177, "grad_norm": 3.994661808013916, "learning_rate": 1.1076075983366574e-07, "loss": 1.2576, "step": 1376 }, { "epoch": 0.953601108033241, "grad_norm": 3.8380768299102783, "learning_rate": 1.075328520808061e-07, "loss": 1.2041, "step": 1377 }, { "epoch": 0.9542936288088643, "grad_norm": 4.077901840209961, "learning_rate": 1.043524238004956e-07, "loss": 1.1962, "step": 1378 }, { "epoch": 0.9549861495844876, "grad_norm": 3.8877198696136475, "learning_rate": 1.0121949025738486e-07, "loss": 1.189, "step": 1379 }, { "epoch": 0.9556786703601108, "grad_norm": 3.7698280811309814, "learning_rate": 9.813406648816804e-08, "loss": 1.231, "step": 1380 }, { "epoch": 0.9563711911357341, "grad_norm": 3.8442647457122803, "learning_rate": 9.509616730151827e-08, "loss": 1.2865, "step": 1381 }, { "epoch": 0.9570637119113573, "grad_norm": 3.9559361934661865, "learning_rate": 9.21058072780101e-08, "loss": 1.1376, "step": 1382 }, { "epoch": 0.9577562326869806, "grad_norm": 3.7578396797180176, "learning_rate": 8.916300077005057e-08, "loss": 1.1069, "step": 1383 }, { "epoch": 0.9584487534626038, "grad_norm": 4.029587745666504, "learning_rate": 8.62677619018104e-08, "loss": 1.254, "step": 1384 }, { "epoch": 0.9591412742382271, "grad_norm": 4.258833408355713, "learning_rate": 8.342010456915739e-08, "loss": 1.2295, "step": 1385 }, { "epoch": 0.9598337950138505, "grad_norm": 4.724644660949707, "learning_rate": 8.062004243958866e-08, "loss": 1.0733, "step": 1386 }, { "epoch": 0.9605263157894737, "grad_norm": 4.294256210327148, "learning_rate": 7.786758895216629e-08, "loss": 1.1054, "step": 1387 }, { "epoch": 0.961218836565097, "grad_norm": 5.140510559082031, "learning_rate": 7.51627573174507e-08, "loss": 1.1778, "step": 1388 }, { "epoch": 0.9619113573407202, "grad_norm": 3.909797191619873, "learning_rate": 7.250556051743962e-08, "loss": 1.2818, "step": 1389 }, { "epoch": 0.9626038781163435, "grad_norm": 4.233206272125244, "learning_rate": 6.98960113055025e-08, "loss": 1.2068, "step": 1390 }, { "epoch": 0.9632963988919667, "grad_norm": 4.595902919769287, "learning_rate": 6.733412220632396e-08, "loss": 1.0369, "step": 1391 }, { "epoch": 0.96398891966759, "grad_norm": 3.9723401069641113, "learning_rate": 6.481990551583939e-08, "loss": 1.1125, "step": 1392 }, { "epoch": 0.9646814404432132, "grad_norm": 4.163870811462402, "learning_rate": 6.235337330117829e-08, "loss": 1.2698, "step": 1393 }, { "epoch": 0.9653739612188366, "grad_norm": 3.5928964614868164, "learning_rate": 5.993453740060773e-08, "loss": 1.2333, "step": 1394 }, { "epoch": 0.9660664819944599, "grad_norm": 4.822768211364746, "learning_rate": 5.756340942346894e-08, "loss": 1.0382, "step": 1395 }, { "epoch": 0.9667590027700831, "grad_norm": 3.9965407848358154, "learning_rate": 5.5240000750129695e-08, "loss": 1.0812, "step": 1396 }, { "epoch": 0.9674515235457064, "grad_norm": 3.666273832321167, "learning_rate": 5.29643225319254e-08, "loss": 1.2505, "step": 1397 }, { "epoch": 0.9681440443213296, "grad_norm": 4.149451732635498, "learning_rate": 5.0736385691106946e-08, "loss": 1.2226, "step": 1398 }, { "epoch": 0.9688365650969529, "grad_norm": 3.7079765796661377, "learning_rate": 4.8556200920786276e-08, "loss": 1.1604, "step": 1399 }, { "epoch": 0.9695290858725761, "grad_norm": 3.9731240272521973, "learning_rate": 4.642377868488646e-08, "loss": 1.2417, "step": 1400 }, { "epoch": 0.9702216066481995, "grad_norm": 4.004926681518555, "learning_rate": 4.4339129218095025e-08, "loss": 1.1581, "step": 1401 }, { "epoch": 0.9709141274238227, "grad_norm": 3.550807237625122, "learning_rate": 4.230226252580516e-08, "loss": 1.2245, "step": 1402 }, { "epoch": 0.971606648199446, "grad_norm": 3.742629289627075, "learning_rate": 4.031318838407905e-08, "loss": 1.2273, "step": 1403 }, { "epoch": 0.9722991689750693, "grad_norm": 3.9137470722198486, "learning_rate": 3.837191633959458e-08, "loss": 1.1456, "step": 1404 }, { "epoch": 0.9729916897506925, "grad_norm": 3.960930585861206, "learning_rate": 3.6478455709598735e-08, "loss": 1.1765, "step": 1405 }, { "epoch": 0.9736842105263158, "grad_norm": 3.7228150367736816, "learning_rate": 3.4632815581866484e-08, "loss": 1.1441, "step": 1406 }, { "epoch": 0.974376731301939, "grad_norm": 3.5775821208953857, "learning_rate": 3.28350048146564e-08, "loss": 1.2506, "step": 1407 }, { "epoch": 0.9750692520775623, "grad_norm": 3.720226526260376, "learning_rate": 3.108503203666402e-08, "loss": 1.1547, "step": 1408 }, { "epoch": 0.9757617728531855, "grad_norm": 3.89373517036438, "learning_rate": 2.9382905646986316e-08, "loss": 1.3248, "step": 1409 }, { "epoch": 0.9764542936288089, "grad_norm": 4.407764434814453, "learning_rate": 2.7728633815079508e-08, "loss": 1.2345, "step": 1410 }, { "epoch": 0.9771468144044322, "grad_norm": 4.202113628387451, "learning_rate": 2.6122224480715774e-08, "loss": 1.093, "step": 1411 }, { "epoch": 0.9778393351800554, "grad_norm": 4.07959508895874, "learning_rate": 2.456368535394993e-08, "loss": 1.3626, "step": 1412 }, { "epoch": 0.9785318559556787, "grad_norm": 3.8587090969085693, "learning_rate": 2.3053023915083904e-08, "loss": 1.3964, "step": 1413 }, { "epoch": 0.9792243767313019, "grad_norm": 3.3332502841949463, "learning_rate": 2.1590247414624566e-08, "loss": 1.1588, "step": 1414 }, { "epoch": 0.9799168975069252, "grad_norm": 3.622792959213257, "learning_rate": 2.0175362873250394e-08, "loss": 1.2278, "step": 1415 }, { "epoch": 0.9806094182825484, "grad_norm": 3.7608649730682373, "learning_rate": 1.8808377081785954e-08, "loss": 1.2254, "step": 1416 }, { "epoch": 0.9813019390581718, "grad_norm": 2.7306807041168213, "learning_rate": 1.7489296601156392e-08, "loss": 0.6271, "step": 1417 }, { "epoch": 0.981994459833795, "grad_norm": 4.163548946380615, "learning_rate": 1.62181277623652e-08, "loss": 1.2184, "step": 1418 }, { "epoch": 0.9826869806094183, "grad_norm": 3.852370023727417, "learning_rate": 1.4994876666464263e-08, "loss": 1.3356, "step": 1419 }, { "epoch": 0.9833795013850416, "grad_norm": 5.447317123413086, "learning_rate": 1.3819549184516113e-08, "loss": 1.1542, "step": 1420 }, { "epoch": 0.9840720221606648, "grad_norm": 4.445632457733154, "learning_rate": 1.2692150957572813e-08, "loss": 1.3178, "step": 1421 }, { "epoch": 0.9847645429362881, "grad_norm": 3.9901273250579834, "learning_rate": 1.1612687396650445e-08, "loss": 1.2509, "step": 1422 }, { "epoch": 0.9854570637119113, "grad_norm": 3.6384527683258057, "learning_rate": 1.0581163682695795e-08, "loss": 1.1513, "step": 1423 }, { "epoch": 0.9861495844875346, "grad_norm": 4.033609867095947, "learning_rate": 9.59758476656636e-09, "loss": 1.1859, "step": 1424 }, { "epoch": 0.9868421052631579, "grad_norm": 4.010962963104248, "learning_rate": 8.661955369007047e-09, "loss": 1.1084, "step": 1425 }, { "epoch": 0.9875346260387812, "grad_norm": 3.835806131362915, "learning_rate": 7.774279980626853e-09, "loss": 1.2522, "step": 1426 }, { "epoch": 0.9882271468144044, "grad_norm": 3.7668678760528564, "learning_rate": 6.9345628618744384e-09, "loss": 1.2265, "step": 1427 }, { "epoch": 0.9889196675900277, "grad_norm": 3.6999645233154297, "learning_rate": 6.142808043020365e-09, "loss": 1.166, "step": 1428 }, { "epoch": 0.989612188365651, "grad_norm": 3.9178340435028076, "learning_rate": 5.399019324139332e-09, "loss": 1.2671, "step": 1429 }, { "epoch": 0.9903047091412742, "grad_norm": 4.232564449310303, "learning_rate": 4.703200275087971e-09, "loss": 1.3597, "step": 1430 }, { "epoch": 0.9909972299168975, "grad_norm": 3.4954841136932373, "learning_rate": 4.055354235490416e-09, "loss": 1.2857, "step": 1431 }, { "epoch": 0.9916897506925207, "grad_norm": 4.343502998352051, "learning_rate": 3.4554843147216467e-09, "loss": 1.1973, "step": 1432 }, { "epoch": 0.9923822714681441, "grad_norm": 4.3146772384643555, "learning_rate": 2.9035933918919456e-09, "loss": 1.1719, "step": 1433 }, { "epoch": 0.9930747922437673, "grad_norm": 4.9093852043151855, "learning_rate": 2.3996841158346885e-09, "loss": 1.0808, "step": 1434 }, { "epoch": 0.9937673130193906, "grad_norm": 3.9903743267059326, "learning_rate": 1.943758905090798e-09, "loss": 1.1361, "step": 1435 }, { "epoch": 0.9944598337950139, "grad_norm": 4.14931058883667, "learning_rate": 1.535819947900974e-09, "loss": 1.1252, "step": 1436 }, { "epoch": 0.9951523545706371, "grad_norm": 3.4692118167877197, "learning_rate": 1.17586920219126e-09, "loss": 1.1593, "step": 1437 }, { "epoch": 0.9958448753462604, "grad_norm": 3.6907618045806885, "learning_rate": 8.639083955663819e-10, "loss": 1.1082, "step": 1438 }, { "epoch": 0.9965373961218836, "grad_norm": 3.5076091289520264, "learning_rate": 5.999390253008664e-10, "loss": 1.1992, "step": 1439 }, { "epoch": 0.997229916897507, "grad_norm": 4.372177600860596, "learning_rate": 3.839623583301588e-10, "loss": 1.1326, "step": 1440 }, { "epoch": 0.9979224376731302, "grad_norm": 3.999774932861328, "learning_rate": 2.1597943124729293e-10, "loss": 1.1831, "step": 1441 }, { "epoch": 0.9986149584487535, "grad_norm": 4.036330699920654, "learning_rate": 9.599105029622913e-11, "loss": 1.2234, "step": 1442 }, { "epoch": 0.9993074792243767, "grad_norm": 3.8736438751220703, "learning_rate": 2.3997791368524094e-11, "loss": 1.329, "step": 1443 }, { "epoch": 1.0, "grad_norm": 4.3855085372924805, "learning_rate": 0.0, "loss": 1.2446, "step": 1444 }, { "epoch": 1.0, "eval_loss": 1.126600742340088, "eval_runtime": 337.1897, "eval_samples_per_second": 4.199, "eval_steps_per_second": 0.525, "step": 1444 } ], "logging_steps": 1, "max_steps": 1444, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9696935369116221e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }